/*
 * Decompiled with CFR 0.152.
 */
package opennlp.tools.tokenize;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.regex.Pattern;
import opennlp.tools.dictionary.Dictionary;
import opennlp.tools.ml.ArrayMath;
import opennlp.tools.ml.EventTrainer;
import opennlp.tools.ml.Probabilistic;
import opennlp.tools.ml.TrainerFactory;
import opennlp.tools.ml.model.MaxentModel;
import opennlp.tools.models.ModelType;
import opennlp.tools.tokenize.AbstractTokenizer;
import opennlp.tools.tokenize.TokSpanEventStream;
import opennlp.tools.tokenize.TokenContextGenerator;
import opennlp.tools.tokenize.TokenSample;
import opennlp.tools.tokenize.TokenizerFactory;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.tokenize.WhitespaceTokenizer;
import opennlp.tools.util.DownloadUtil;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.Span;
import opennlp.tools.util.StringList;
import opennlp.tools.util.TrainingParameters;

/**
 * A tokenizer that first splits the input on whitespace and then uses a
 * {@link MaxentModel} to decide, for every position inside a whitespace-delimited
 * token, whether the token should be split there.
 *
 * <p>Instances are not safe for concurrent use: {@link #tokenizePos(String)} reuses
 * the per-instance token and probability lists, and it also reconfigures the shared
 * {@link WhitespaceTokenizer#INSTANCE} through {@code setKeepNewLines}.
 */
public class TokenizerME extends AbstractTokenizer implements Probabilistic {
    /** Model outcome meaning "split the token at this position". */
    public static final String SPLIT = "T";
    /** Model outcome meaning "do not split the token at this position". */
    public static final String NO_SPLIT = "F";

    private final Pattern alphanumeric;
    private final MaxentModel model;
    private final TokenContextGenerator cg;
    private final boolean useAlphaNumericOptimization;
    /** Probabilities of the tokens found by the most recent {@link #tokenizePos(String)} call. */
    private final List<Double> tokProbs;
    /** Spans of the tokens found by the most recent {@link #tokenizePos(String)} call. */
    private final List<Span> newTokens;
    /** Abbreviation dictionary; may be {@code null}, in which case no abbreviation is accepted. */
    private final Dictionary abbDict;

    /**
     * Creates a tokenizer from the model that {@link DownloadUtil#downloadModel} provides
     * for the given language and {@link ModelType#TOKENIZER}.
     *
     * @param language the language passed to {@code DownloadUtil.downloadModel}
     * @throws IOException if obtaining the model fails
     */
    public TokenizerME(String language) throws IOException {
        this(DownloadUtil.downloadModel(language, ModelType.TOKENIZER, TokenizerModel.class));
    }

    /**
     * Creates a tokenizer that uses the abbreviation dictionary returned by
     * {@code model.getAbbreviations()}.
     *
     * @param model the tokenizer model; must not be {@code null}
     */
    public TokenizerME(TokenizerModel model) {
        this(model, model.getAbbreviations());
    }

    /**
     * Creates a tokenizer from a model and an explicit abbreviation dictionary.
     * The context generator, the alphanumeric pattern and the alphanumeric
     * optimization flag are taken from the model's {@link TokenizerFactory}.
     *
     * @param model   the tokenizer model; must not be {@code null}
     * @param abbDict the abbreviation dictionary, or {@code null} to accept no abbreviations
     */
    public TokenizerME(TokenizerModel model, Dictionary abbDict) {
        this.model = model.getMaxentModel();
        this.abbDict = abbDict;
        TokenizerFactory factory = model.getFactory();
        this.cg = factory.getContextGenerator();
        this.alphanumeric = factory.getAlphaNumericPattern();
        this.useAlphaNumericOptimization = factory.isUseAlphaNumericOptimization();
        this.newTokens = new ArrayList<>();
        this.tokProbs = new ArrayList<>(50);
    }

    /**
     * Returns the probabilities recorded for the tokens produced by the most recent
     * call to {@link #tokenizePos(String)}, in token order.
     *
     * @return the recorded probabilities, converted by {@link ArrayMath#toDoubleArray}
     */
    public double[] probs() {
        return ArrayMath.toDoubleArray(this.tokProbs);
    }

    /**
     * Returns the same values as {@link #probs()}.
     *
     * @return the recorded token probabilities
     * @deprecated use {@link #probs()} instead
     */
    @Deprecated(forRemoval=true, since="2.5.5")
    public double[] getTokenProbabilities() {
        return this.probs();
    }

    /**
     * Tokenizes the given text and returns the span of every token.
     *
     * <p>The text is first split by {@link WhitespaceTokenizer#INSTANCE}. A
     * whitespace-delimited token shorter than two characters, or one that fully
     * matches the alphanumeric pattern while the alphanumeric optimization is
     * enabled, is kept unchanged with probability {@code 1.0}. For every other
     * token the model is evaluated at each interior offset; where the best outcome
     * is {@link #SPLIT} the token is cut, unless the whole token is an acceptable
     * abbreviation, in which case it is kept in one piece.
     *
     * @param d the text to tokenize
     * @return the token spans, as offsets into {@code d}
     */
    public Span[] tokenizePos(String d) {
        // NOTE: this configures the shared singleton, not a private copy.
        WhitespaceTokenizer whitespaceTokenizer = WhitespaceTokenizer.INSTANCE;
        whitespaceTokenizer.setKeepNewLines(this.keepNewLines);
        Span[] candidates = whitespaceTokenizer.tokenizePos(d);
        this.newTokens.clear();
        this.tokProbs.clear();
        for (Span s : candidates) {
            String tok = d.substring(s.getStart(), s.getEnd());
            // A single character has no interior position at which it could be split.
            boolean keepAsIs = tok.length() < 2
                    || (this.useAlphaNumericOptimization() && this.alphanumeric.matcher(tok).matches());
            if (keepAsIs) {
                this.addToken(s, 1.0);
                continue;
            }
            int origStart = s.getStart();
            int end = s.getEnd();
            int start = origStart;
            // Product of the best-outcome probabilities since the last emitted token.
            double tokenProb = 1.0;
            for (int j = origStart + 1; j < end; ++j) {
                double[] probs = this.model.eval(this.cg.getContext(tok, j - origStart));
                String best = this.model.getBestOutcome(probs);
                tokenProb *= probs[this.model.getIndex(best)];
                if (!SPLIT.equals(best)) {
                    continue;
                }
                if (this.isAcceptableAbbreviation(tok)) {
                    // The whole token is a known abbreviation: keep it in one piece and
                    // stop scanning. Continuing past this point would emit further spans
                    // that overlap the whole-token span just added.
                    this.addToken(new Span(start, end), tokenProb);
                    start = end;
                    break;
                }
                this.addToken(new Span(start, j), tokenProb);
                start = j;
                tokenProb = 1.0;
            }
            // Emit whatever remains after the last split (or the whole token if none occurred).
            if (start < end) {
                this.addToken(new Span(start, end), tokenProb);
            }
        }
        return this.newTokens.toArray(new Span[0]);
    }

    /**
     * Trains a {@link TokenizerModel}.
     *
     * <p>The samples are wrapped in a {@link TokSpanEventStream} configured from the
     * factory, an {@link EventTrainer} is obtained from {@link TrainerFactory} for the
     * given parameters, and the trained model is packaged together with the factory.
     *
     * @param samples  the training samples
     * @param factory  supplies the alphanumeric settings and the context generator
     * @param mlParams the training parameters used to select and configure the trainer
     * @return the trained tokenizer model
     * @throws IOException if training fails with an I/O error
     */
    public static TokenizerModel train(ObjectStream<TokenSample> samples, TokenizerFactory factory, TrainingParameters mlParams) throws IOException {
        // The same map is handed to the trainer factory and to the resulting model.
        HashMap<String, String> manifestInfoEntries = new HashMap<>();
        TokSpanEventStream eventStream = new TokSpanEventStream(
                samples,
                factory.isUseAlphaNumericOptimization(),
                factory.getAlphaNumericPattern(),
                factory.getContextGenerator());
        EventTrainer<TrainingParameters> trainer = TrainerFactory.getEventTrainer(mlParams, manifestInfoEntries);
        MaxentModel maxentModel = trainer.train(eventStream);
        return new TokenizerModel(maxentModel, manifestInfoEntries, factory);
    }

    /**
     * Tells whether tokens that fully match the alphanumeric pattern bypass the model.
     *
     * @return {@code true} if the alphanumeric optimization is enabled
     */
    public boolean useAlphaNumericOptimization() {
        return this.useAlphaNumericOptimization;
    }

    /**
     * Checks whether the given character sequence is contained in the abbreviation dictionary.
     *
     * @param s the candidate abbreviation
     * @return {@code true} if a dictionary is set and contains {@code s} as a
     *         single-element entry; {@code false} otherwise
     */
    protected boolean isAcceptableAbbreviation(CharSequence s) {
        if (this.abbDict == null) {
            return false;
        }
        return this.abbDict.contains(new StringList(s.toString()));
    }

    /** Records a token span together with its probability, keeping both lists aligned. */
    private void addToken(Span span, double prob) {
        this.newTokens.add(span);
        this.tokProbs.add(prob);
    }
}

