lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizer.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.analysis.ko;

 import static java.lang.Character.UnicodeScript;

 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.EnumMap;
 import java.util.List;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.ko.dict.CharacterDefinition;
 import org.apache.lucene.analysis.ko.dict.ConnectionCosts;
 import org.apache.lucene.analysis.ko.dict.Dictionary;
 import org.apache.lucene.analysis.ko.dict.TokenInfoDictionary;
 import org.apache.lucene.analysis.ko.dict.TokenInfoFST;
 import org.apache.lucene.analysis.ko.dict.UnknownDictionary;
 import org.apache.lucene.analysis.ko.dict.UserDictionary;
 import org.apache.lucene.analysis.ko.tokenattributes.PartOfSpeechAttribute;
 import org.apache.lucene.analysis.ko.tokenattributes.ReadingAttribute;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
 import org.apache.lucene.analysis.util.RollingCharBuffer;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.AttributeFactory;
 import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.RamUsageEstimator;
 import org.apache.lucene.util.fst.FST;

 /**
  * Tokenizer for Korean that uses morphological analysis.
  *
  * <p>This tokenizer sets a number of additional attributes:
  *
  * <ul>
  *   <li>{@link PartOfSpeechAttribute} containing part-of-speech.
  *   <li>{@link ReadingAttribute} containing reading.
  * </ul>
  *
  * <p>This tokenizer uses a rolling Viterbi search to find the least cost segmentation (path) of the
  * incoming characters.
  *
  * @lucene.experimental
  */
 public final class KoreanTokenizer extends Tokenizer {

   /** Token type reflecting the original source of this token */
   public enum Type {
     /** Known words from the system dictionary. */
     KNOWN,
     /** Unknown words (heuristically segmented). */
     UNKNOWN,
     /** Known words from the user dictionary. */
     USER
   }

   /**
    * Decompound mode: this determines how the tokenizer handles {@link POS.Type#COMPOUND}, {@link
    * POS.Type#INFLECT} and {@link POS.Type#PREANALYSIS} tokens.
    */
   public enum DecompoundMode {
     /** No decomposition for compound. */
     NONE,

     /** Decompose compounds and discards the original form (default). */
     DISCARD,

     /** Decompose compounds and keeps the original form. */
     MIXED
   }

   /** Default mode for the decompound of tokens ({@link DecompoundMode#DISCARD}. */
   public static final DecompoundMode DEFAULT_DECOMPOUND = DecompoundMode.DISCARD;

   private static final boolean VERBOSE = false;

   // For safety:
   private static final int MAX_UNKNOWN_WORD_LENGTH = 1024;
   private static final int MAX_BACKTRACE_GAP = 1024;

   private final EnumMap<Type, Dictionary> dictionaryMap = new EnumMap<>(Type.class);

   private final TokenInfoFST fst;
   private final TokenInfoDictionary dictionary;
   private final UnknownDictionary unkDictionary;
   private final ConnectionCosts costs;
   private final UserDictionary userDictionary;
   private final CharacterDefinition characterDefinition;

   private final FST.Arc<Long> arc = new FST.Arc<>();
   private final FST.BytesReader fstReader;
   private final IntsRef wordIdRef = new IntsRef();

   private final FST.BytesReader userFSTReader;
   private final TokenInfoFST userFST;

   private final boolean discardPunctuation;
   private final DecompoundMode mode;
   private final boolean outputUnknownUnigrams;

   private final RollingCharBuffer buffer = new RollingCharBuffer();

   private final WrappedPositionArray positions = new WrappedPositionArray();

   // True once we've hit the EOF from the input reader:
   private boolean end;

   // Last absolute position we backtraced from:
   private int lastBackTracePos;

   // Next absolute position to process:
   private int pos;

   // Already parsed, but not yet passed to caller, tokens:
   private final List<Token> pending = new ArrayList<>();

   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
   private final PositionIncrementAttribute posIncAtt =
       addAttribute(PositionIncrementAttribute.class);
   private final PositionLengthAttribute posLengthAtt = addAttribute(PositionLengthAttribute.class);
   private final PartOfSpeechAttribute posAtt = addAttribute(PartOfSpeechAttribute.class);
   private final ReadingAttribute readingAtt = addAttribute(ReadingAttribute.class);

   /**
    * Creates a new KoreanTokenizer with default parameters.
    *
    * <p>Uses the default AttributeFactory.
    */
   public KoreanTokenizer() {
     this(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, null, DEFAULT_DECOMPOUND, false, true);
   }

   /**
    * Create a new KoreanTokenizer using the system and unknown dictionaries shipped with Lucene.
    *
    * @param factory the AttributeFactory to use
    * @param userDictionary Optional: if non-null, user dictionary.
    * @param mode Decompound mode.
    * @param outputUnknownUnigrams if true outputs unigrams for unknown words.
    */
   public KoreanTokenizer(
       AttributeFactory factory,
       UserDictionary userDictionary,
       DecompoundMode mode,
       boolean outputUnknownUnigrams) {
     this(factory, userDictionary, mode, outputUnknownUnigrams, true);
   }

   /**
    * Create a new KoreanTokenizer using the system and unknown dictionaries shipped with Lucene.
    *
    * @param factory the AttributeFactory to use
    * @param userDictionary Optional: if non-null, user dictionary.
    * @param mode Decompound mode.
    * @param outputUnknownUnigrams if true outputs unigrams for unknown words.
    * @param discardPunctuation true if punctuation tokens should be dropped from the output.
    */
   public KoreanTokenizer(
       AttributeFactory factory,
       UserDictionary userDictionary,
       DecompoundMode mode,
       boolean outputUnknownUnigrams,
       boolean discardPunctuation) {
     this(
         factory,
         TokenInfoDictionary.getInstance(),
         UnknownDictionary.getInstance(),
         ConnectionCosts.getInstance(),
         userDictionary,
         mode,
         outputUnknownUnigrams,
         discardPunctuation);
   }

   /**
    * Create a new KoreanTokenizer supplying a custom system dictionary and unknown dictionary. This
    * constructor provides an entry point for users that want to construct custom language models
    * that can be used as input to {@link org.apache.lucene.analysis.ko.util.DictionaryBuilder}.
    *
    * @param factory the AttributeFactory to use
    * @param systemDictionary a custom known token dictionary
    * @param unkDictionary a custom unknown token dictionary
    * @param connectionCosts custom token transition costs
    * @param userDictionary Optional: if non-null, user dictionary.
    * @param mode Decompound mode.
    * @param outputUnknownUnigrams if true outputs unigrams for unknown words.
    * @param discardPunctuation true if punctuation tokens should be dropped from the output.
    * @lucene.experimental
    */
   public KoreanTokenizer(
       AttributeFactory factory,
       TokenInfoDictionary systemDictionary,
       UnknownDictionary unkDictionary,
       ConnectionCosts connectionCosts,
       UserDictionary userDictionary,
       DecompoundMode mode,
       boolean outputUnknownUnigrams,
       boolean discardPunctuation) {
     super(factory);
     this.dictionary = systemDictionary;
     this.fst = dictionary.getFST();
     this.unkDictionary = unkDictionary;
     this.characterDefinition = unkDictionary.getCharacterDefinition();
     this.costs = connectionCosts;
     this.userDictionary = userDictionary;
     fstReader = fst.getBytesReader();
     if (userDictionary != null) {
       userFST = userDictionary.getFST();
       userFSTReader = userFST.getBytesReader();
     } else {
       userFST = null;
       userFSTReader = null;
     }
     this.mode = mode;
     this.outputUnknownUnigrams = outputUnknownUnigrams;
     this.discardPunctuation = discardPunctuation;
     buffer.reset(this.input);

     resetState();

     dictionaryMap.put(Type.KNOWN, dictionary);
     dictionaryMap.put(Type.UNKNOWN, unkDictionary);
     dictionaryMap.put(Type.USER, userDictionary);
   }

   private GraphvizFormatter dotOut;

   /** Expert: set this to produce graphviz (dot) output of the Viterbi lattice */
   public void setGraphvizFormatter(GraphvizFormatter dotOut) {
     this.dotOut = dotOut;
   }

   @Override
   public void close() throws IOException {
     super.close();
     buffer.reset(input);
   }

   @Override
   public void reset() throws IOException {
     super.reset();
     buffer.reset(input);
     resetState();
   }

   private void resetState() {
     positions.reset();
     pos = 0;
     end = false;
     lastBackTracePos = 0;
     pending.clear();

     // Add BOS:
     positions.get(0).add(0, 0, -1, -1, -1, -1, Type.KNOWN);
   }

   @Override
   public void end() throws IOException {
     super.end();
     // Set final offset
     int finalOffset = correctOffset(pos);
     offsetAtt.setOffset(finalOffset, finalOffset);
   }

   // Holds all back pointers arriving to this position:
   static final class Position {

     int pos;

     int count;

     // maybe single int array * 5?
     int[] costs = new int[8];
     int[] lastRightID = new int[8];
     int[] backPos = new int[8];
     int[] backWordPos = new int[8];
     int[] backIndex = new int[8];
     int[] backID = new int[8];
     Type[] backType = new Type[8];

     public void grow() {
       costs = ArrayUtil.grow(costs, 1 + count);
       lastRightID = ArrayUtil.grow(lastRightID, 1 + count);
       backPos = ArrayUtil.grow(backPos, 1 + count);
       backWordPos = ArrayUtil.grow(backWordPos, 1 + count);
       backIndex = ArrayUtil.grow(backIndex, 1 + count);
       backID = ArrayUtil.grow(backID, 1 + count);

       // NOTE: sneaky: grow separately because
       // ArrayUtil.grow will otherwise pick a different
       // length than the int[]s we just grew:
       final Type[] newBackType = new Type[backID.length];
       System.arraycopy(backType, 0, newBackType, 0, backType.length);
       backType = newBackType;
     }

     public void add(
         int cost,
         int lastRightID,
         int backPos,
         int backRPos,
         int backIndex,
         int backID,
         Type backType) {
       // NOTE: this isn't quite a true Viterbi search,
       // because we should check if lastRightID is
       // already present here, and only update if the new
       // cost is less than the current cost, instead of
       // simply appending.  However, that will likely hurt
       // performance (usually we add a lastRightID only once),
       // and it means we actually create the full graph
       // intersection instead of a "normal" Viterbi lattice:
       if (count == costs.length) {
         grow();
       }
       this.costs[count] = cost;
       this.lastRightID[count] = lastRightID;
       this.backPos[count] = backPos;
       this.backWordPos[count] = backRPos;
       this.backIndex[count] = backIndex;
       this.backID[count] = backID;
       this.backType[count] = backType;
       count++;
     }

     public void reset() {
       count = 0;
     }
   }

   /**
    * Returns the space penalty associated with the provided {@link POS.Tag}.
    *
    * @param leftPOS the left part of speech of the current token.
    * @param numSpaces the number of spaces before the current token.
    */
   private int computeSpacePenalty(POS.Tag leftPOS, int numSpaces) {
     int spacePenalty = 0;
     if (numSpaces > 0) {
       // TODO we should extract the penalty (left-space-penalty-factor) from the dicrc file.
       switch (leftPOS) {
         case E:
         case J:
         case VCP:
         case XSA:
         case XSN:
         case XSV:
           spacePenalty = 3000;
           break;

         default:
           break;
       }
     }
     return spacePenalty;
   }

   private void add(
       Dictionary dict, Position fromPosData, int wordPos, int endPos, int wordID, Type type) {
     final POS.Tag leftPOS = dict.getLeftPOS(wordID);
     final int wordCost = dict.getWordCost(wordID);
     final int leftID = dict.getLeftId(wordID);
     int leastCost = Integer.MAX_VALUE;
     int leastIDX = -1;
     assert fromPosData.count > 0;
     for (int idx = 0; idx < fromPosData.count; idx++) {
       // The number of spaces before the term
       int numSpaces = wordPos - fromPosData.pos;

       // Cost is path cost so far, plus word cost (added at
       // end of loop), plus bigram cost and space penalty cost.
       final int cost =
           fromPosData.costs[idx]
               + costs.get(fromPosData.lastRightID[idx], leftID)
               + computeSpacePenalty(leftPOS, numSpaces);
       if (VERBOSE) {
         System.out.println(
             "      fromIDX="
                 + idx
                 + ": cost="
                 + cost
                 + " (prevCost="
                 + fromPosData.costs[idx]
                 + " wordCost="
                 + wordCost
                 + " bgCost="
                 + costs.get(fromPosData.lastRightID[idx], leftID)
                 + " spacePenalty="
                 + computeSpacePenalty(leftPOS, numSpaces)
                 + ") leftID="
                 + leftID
                 + " leftPOS="
                 + leftPOS.name()
                 + ")");
       }
       if (cost < leastCost) {
         leastCost = cost;
         leastIDX = idx;
         if (VERBOSE) {
           System.out.println("        **");
         }
       }
     }

     leastCost += wordCost;

     if (VERBOSE) {
       System.out.println(
           "      + cost="
               + leastCost
               + " wordID="
               + wordID
               + " leftID="
               + leftID
               + " leastIDX="
               + leastIDX
               + " toPos="
               + endPos
               + " toPos.idx="
               + positions.get(endPos).count);
     }

     positions
         .get(endPos)
         .add(leastCost, dict.getRightId(wordID), fromPosData.pos, wordPos, leastIDX, wordID, type);
   }

   @Override
   public boolean incrementToken() throws IOException {

     // parse() is able to return w/o producing any new
     // tokens, when the tokens it had produced were entirely
     // punctuation.  So we loop here until we get a real
     // token or we end:
     while (pending.size() == 0) {
       if (end) {
         return false;
       }

       // Push Viterbi forward some more:
       parse();
     }

     final Token token = pending.remove(pending.size() - 1);

     int length = token.getLength();
     clearAttributes();
     assert length > 0;
     // System.out.println("off=" + token.getOffset() + " len=" + length + " vs " +
     // token.getSurfaceForm().length);
     termAtt.copyBuffer(token.getSurfaceForm(), token.getOffset(), length);
     offsetAtt.setOffset(correctOffset(token.getStartOffset()), correctOffset(token.getEndOffset()));
     posAtt.setToken(token);
     readingAtt.setToken(token);
     posIncAtt.setPositionIncrement(token.getPositionIncrement());
     posLengthAtt.setPositionLength(token.getPositionLength());
     if (VERBOSE) {
       System.out.println(Thread.currentThread().getName() + ":    incToken: return token=" + token);
     }
     return true;
   }

   // TODO: make generic'd version of this "circular array"?
   // It's a bit tricky because we do things to the Position
   // (eg, set .pos = N on reuse)...
   static final class WrappedPositionArray {
     private Position[] positions = new Position[8];

     public WrappedPositionArray() {
       for (int i = 0; i < positions.length; i++) {
         positions[i] = new Position();
       }
     }

     // Next array index to write to in positions:
     private int nextWrite;

     // Next position to write:
     private int nextPos;

     // How many valid Position instances are held in the
     // positions array:
     private int count;

     public void reset() {
       nextWrite--;
       while (count > 0) {
         if (nextWrite == -1) {
           nextWrite = positions.length - 1;
         }
         positions[nextWrite--].reset();
         count--;
       }
       nextWrite = 0;
       nextPos = 0;
       count = 0;
     }

     /**
      * Get Position instance for this absolute position; this is allowed to be arbitrarily far "in
      * the future" but cannot be before the last freeBefore.
      */
     public Position get(int pos) {
       while (pos >= nextPos) {
         // System.out.println("count=" + count + " vs len=" + positions.length);
         if (count == positions.length) {
           Position[] newPositions =
               new Position[ArrayUtil.oversize(1 + count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
           // System.out.println("grow positions " + newPositions.length);
           System.arraycopy(positions, nextWrite, newPositions, 0, positions.length - nextWrite);
           System.arraycopy(positions, 0, newPositions, positions.length - nextWrite, nextWrite);
           for (int i = positions.length; i < newPositions.length; i++) {
             newPositions[i] = new Position();
           }
           nextWrite = positions.length;
           positions = newPositions;
         }
         if (nextWrite == positions.length) {
           nextWrite = 0;
         }
         // Should have already been reset:
         assert positions[nextWrite].count == 0;
         positions[nextWrite++].pos = nextPos++;
         count++;
       }
       assert inBounds(pos);
       final int index = getIndex(pos);
       assert positions[index].pos == pos;
       return positions[index];
     }

     public int getNextPos() {
       return nextPos;
     }

     // For assert:
     private boolean inBounds(int pos) {
       return pos < nextPos && pos >= nextPos - count;
     }

     private int getIndex(int pos) {
       int index = nextWrite - (nextPos - pos);
       if (index < 0) {
         index += positions.length;
       }
       return index;
     }

     public void freeBefore(int pos) {
       final int toFree = count - (nextPos - pos);
       assert toFree >= 0;
       assert toFree <= count;
       int index = nextWrite - count;
       if (index < 0) {
         index += positions.length;
       }
       for (int i = 0; i < toFree; i++) {
         if (index == positions.length) {
           index = 0;
         }
         // System.out.println("  fb idx=" + index);
         positions[index].reset();
         index++;
       }
       count -= toFree;
     }
   }

   /* Incrementally parse some more characters.  This runs
    * the viterbi search forwards "enough" so that we
    * generate some more tokens.  How much forward depends on
    * the chars coming in, since some chars could cause
    * longer-lasting ambiguity in the parsing.  Once the
    * ambiguity is resolved, then we back trace, produce
    * the pending tokens, and return. */
   private void parse() throws IOException {
     if (VERBOSE) {
       System.out.println("\nPARSE");
     }

     // Index of the last character of unknown word:
     int unknownWordEndIndex = -1;

     // Maximum posAhead of user word in the entire input
     int userWordMaxPosAhead = -1;

     // Advances over each position (character):
     while (buffer.get(pos) != -1) {
       final Position posData = positions.get(pos);
       final boolean isFrontier = positions.getNextPos() == pos + 1;

       if (posData.count == 0) {
         // No arcs arrive here; move to next position:
         if (VERBOSE) {
           System.out.println("    no arcs in; skip pos=" + pos);
         }
         pos++;
         continue;
       }

       if (pos > lastBackTracePos && posData.count == 1 && isFrontier) {
         // We are at a "frontier", and only one node is
         // alive, so whatever the eventual best path is must
         // come through this node.  So we can safely commit
         // to the prefix of the best path at this point:
         backtrace(posData, 0);

         // Re-base cost so we don't risk int overflow:
         posData.costs[0] = 0;
         if (pending.size() > 0) {
           return;
         } else {
           // This means the backtrace only produced
           // punctuation tokens, so we must keep parsing.
         }
       }

       if (pos - lastBackTracePos >= MAX_BACKTRACE_GAP) {
         // Safety: if we've buffered too much, force a
         // backtrace now.  We find the least-cost partial
         // path, across all paths, backtrace from it, and
         // then prune all others.  Note that this, in
         // general, can produce the wrong result, if the
         // total best path did not in fact back trace
         // through this partial best path.  But it's the
         // best we can do... (short of not having a
         // safety!).

         // First pass: find least cost partial path so far,
         // including ending at future positions:
         int leastIDX = -1;
         int leastCost = Integer.MAX_VALUE;
         Position leastPosData = null;
         for (int pos2 = pos; pos2 < positions.getNextPos(); pos2++) {
           final Position posData2 = positions.get(pos2);
           for (int idx = 0; idx < posData2.count; idx++) {
             // System.out.println("    idx=" + idx + " cost=" + cost);
             final int cost = posData2.costs[idx];
             if (cost < leastCost) {
               leastCost = cost;
               leastIDX = idx;
               leastPosData = posData2;
             }
           }
         }

         // We will always have at least one live path:
         assert leastIDX != -1;

         // Second pass: prune all but the best path:
         for (int pos2 = pos; pos2 < positions.getNextPos(); pos2++) {
           final Position posData2 = positions.get(pos2);
           if (posData2 != leastPosData) {
             posData2.reset();
           } else {
             if (leastIDX != 0) {
               posData2.costs[0] = posData2.costs[leastIDX];
               posData2.lastRightID[0] = posData2.lastRightID[leastIDX];
               posData2.backPos[0] = posData2.backPos[leastIDX];
               posData2.backWordPos[0] = posData2.backWordPos[leastIDX];
               posData2.backIndex[0] = posData2.backIndex[leastIDX];
               posData2.backID[0] = posData2.backID[leastIDX];
               posData2.backType[0] = posData2.backType[leastIDX];
             }
             posData2.count = 1;
           }
         }

         backtrace(leastPosData, 0);

         // Re-base cost so we don't risk int overflow:
         Arrays.fill(leastPosData.costs, 0, leastPosData.count, 0);

         if (pos != leastPosData.pos) {
           // We jumped into a future position:
           assert pos < leastPosData.pos;
           pos = leastPosData.pos;
         }
         if (pending.size() > 0) {
           return;
         } else {
           // This means the backtrace only produced
           // punctuation tokens, so we must keep parsing.
           continue;
         }
       }

       if (VERBOSE) {
         System.out.println(
             "\n  extend @ pos="
                 + pos
                 + " char="
                 + (char) buffer.get(pos)
                 + " hex="
                 + Integer.toHexString(buffer.get(pos)));
       }

       if (VERBOSE) {
         System.out.println("    " + posData.count + " arcs in");
       }

       // Move to the first character that is not a whitespace.
       // The whitespaces are added as a prefix for the term that we extract,
       // this information is then used when computing the cost for the term using
       // the space penalty factor.
       // They are removed when the final tokens are generated.
       if (Character.getType(buffer.get(pos)) == Character.SPACE_SEPARATOR) {
         int nextChar = buffer.get(++pos);
         while (nextChar != -1 && Character.getType(nextChar) == Character.SPACE_SEPARATOR) {
           pos++;
           nextChar = buffer.get(pos);
         }
       }
       if (buffer.get(pos) == -1) {
         pos = posData.pos;
       }

       boolean anyMatches = false;

       // First try user dict:
       if (userFST != null) {
         userFST.getFirstArc(arc);
         int output = 0;
         int maxPosAhead = 0;
         int outputMaxPosAhead = 0;
         int arcFinalOutMaxPosAhead = 0;

         for (int posAhead = pos; ; posAhead++) {
           final int ch = buffer.get(posAhead);
           if (ch == -1) {
             break;
           }
           if (userFST.findTargetArc(ch, arc, arc, posAhead == pos, userFSTReader) == null) {
             break;
           }
           output += arc.output().intValue();
           if (arc.isFinal()) {
             maxPosAhead = posAhead;
             outputMaxPosAhead = output;
             arcFinalOutMaxPosAhead = arc.nextFinalOutput().intValue();
             anyMatches = true;
           }
         }

         // Longest matching for user word
         if (anyMatches && maxPosAhead > userWordMaxPosAhead) {
           if (VERBOSE) {
             System.out.println(
                 "    USER word "
                     + new String(buffer.get(pos, maxPosAhead + 1))
                     + " toPos="
                     + (maxPosAhead + 1));
           }
           add(
               userDictionary,
               posData,
               pos,
               maxPosAhead + 1,
               outputMaxPosAhead + arcFinalOutMaxPosAhead,
               Type.USER);
           userWordMaxPosAhead = Math.max(userWordMaxPosAhead, maxPosAhead);
         }
       }

       // TODO: we can be more aggressive about user
       // matches?  if we are "under" a user match then don't
       // extend KNOWN/UNKNOWN paths?

       if (!anyMatches) {
         // Next, try known dictionary matches
         fst.getFirstArc(arc);
         int output = 0;

         for (int posAhead = pos; ; posAhead++) {
           final int ch = buffer.get(posAhead);
           if (ch == -1) {
             break;
           }
           // System.out.println("    match " + (char) ch + " posAhead=" + posAhead);

           if (fst.findTargetArc(ch, arc, arc, posAhead == pos, fstReader) == null) {
             break;
           }

           output += arc.output().intValue();

           // Optimization: for known words that are too-long
           // (compound), we should pre-compute the 2nd
           // best segmentation and store it in the
           // dictionary instead of recomputing it each time a
           // match is found.

           if (arc.isFinal()) {
             dictionary.lookupWordIds(output + arc.nextFinalOutput().intValue(), wordIdRef);
             if (VERBOSE) {
               System.out.println(
                   "    KNOWN word "
                       + new String(buffer.get(pos, posAhead - pos + 1))
                       + " toPos="
                       + (posAhead + 1)
                       + " "
                       + wordIdRef.length
                       + " wordIDs");
             }
             for (int ofs = 0; ofs < wordIdRef.length; ofs++) {
               add(
                   dictionary,
                   posData,
                   pos,
                   posAhead + 1,
                   wordIdRef.ints[wordIdRef.offset + ofs],
                   Type.KNOWN);
               anyMatches = true;
             }
           }
         }
       }

       if (unknownWordEndIndex > posData.pos) {
         pos++;
         continue;
       }

       final char firstCharacter = (char) buffer.get(pos);
       if (!anyMatches || characterDefinition.isInvoke(firstCharacter)) {

         // Find unknown match:
         int characterId = characterDefinition.getCharacterClass(firstCharacter);
         // NOTE: copied from UnknownDictionary.lookup:
         int unknownWordLength;
         if (!characterDefinition.isGroup(firstCharacter)) {
           unknownWordLength = 1;
         } else {
           // Extract unknown word. Characters with the same script are considered to be part of
           // unknown word
           unknownWordLength = 1;
           UnicodeScript scriptCode = UnicodeScript.of(firstCharacter);
           final boolean isPunct = isPunctuation(firstCharacter);
           final boolean isDigit = Character.isDigit(firstCharacter);
           for (int posAhead = pos + 1; unknownWordLength < MAX_UNKNOWN_WORD_LENGTH; posAhead++) {
             int next = buffer.get(posAhead);
             if (next == -1) {
               break;
             }
             char ch = (char) next;
             int chType = Character.getType(ch);
             UnicodeScript sc = UnicodeScript.of(next);
             boolean sameScript =
                 isSameScript(scriptCode, sc)
                     // Non-spacing marks inherit the script of their base character,
                     // following recommendations from UTR #24.
                     || chType == Character.NON_SPACING_MARK;

             if (sameScript
                 // split on punctuation
                 && isPunctuation(ch, chType) == isPunct
                 // split on digit
                 && Character.isDigit(ch) == isDigit
                 && characterDefinition.isGroup(ch)) {
               unknownWordLength++;
             } else {
               break;
             }
             // Update the script code and character class if the original script
             // is Inherited or Common.
             if (isCommonOrInherited(scriptCode) && isCommonOrInherited(sc) == false) {
               scriptCode = sc;
               characterId = characterDefinition.getCharacterClass(ch);
             }
           }
         }

         unkDictionary.lookupWordIds(
             characterId, wordIdRef); // characters in input text are supposed to be the same
         if (VERBOSE) {
           System.out.println(
               "    UNKNOWN word len=" + unknownWordLength + " " + wordIdRef.length + " wordIDs");
         }
         for (int ofs = 0; ofs < wordIdRef.length; ofs++) {
           add(
               unkDictionary,
               posData,
               pos,
               pos + unknownWordLength,
               wordIdRef.ints[wordIdRef.offset + ofs],
               Type.UNKNOWN);
         }
       }

       pos++;
     }

     end = true;

     if (pos > 0) {

       final Position endPosData = positions.get(pos);
       int leastCost = Integer.MAX_VALUE;
       int leastIDX = -1;
       if (VERBOSE) {
         System.out.println("  end: " + endPosData.count + " nodes");
       }
       for (int idx = 0; idx < endPosData.count; idx++) {
         // Add EOS cost:
         final int cost = endPosData.costs[idx] + costs.get(endPosData.lastRightID[idx], 0);
         // System.out.println("    idx=" + idx + " cost=" + cost + " (pathCost=" +
         // endPosData.costs[idx] + " bgCost=" + costs.get(endPosData.lastRightID[idx], 0) + ")
         // backPos=" + endPosData.backPos[idx]);
         if (cost < leastCost) {
           leastCost = cost;
           leastIDX = idx;
         }
       }

       backtrace(endPosData, leastIDX);
     } else {
       // No characters in the input string; return no tokens!
     }
   }

   // the pending list.  The pending list is then in-reverse
   // (last token should be returned first).
   private void backtrace(final Position endPosData, final int fromIDX) {
     final int endPos = endPosData.pos;

     if (VERBOSE) {
       System.out.println(
           "\n  backtrace: endPos="
               + endPos
               + " pos="
               + pos
               + "; "
               + (pos - lastBackTracePos)
               + " characters; last="
               + lastBackTracePos
               + " cost="
               + endPosData.costs[fromIDX]);
     }

     final char[] fragment = buffer.get(lastBackTracePos, endPos - lastBackTracePos);

     if (dotOut != null) {
       dotOut.onBacktrace(this, positions, lastBackTracePos, endPosData, fromIDX, fragment, end);
     }

     int pos = endPos;
     int bestIDX = fromIDX;

     // TODO: sort of silly to make Token instances here; the
     // back trace has all info needed to generate the
     // token.  So, we could just directly set the attrs,
     // from the backtrace, in incrementToken w/o ever
     // creating Token; we'd have to defer calling freeBefore
     // until after the backtrace was fully "consumed" by
     // incrementToken.

     while (pos > lastBackTracePos) {
       // System.out.println("BT: back pos=" + pos + " bestIDX=" + bestIDX);
       final Position posData = positions.get(pos);
       assert bestIDX < posData.count;

       int backPos = posData.backPos[bestIDX];
       int backWordPos = posData.backWordPos[bestIDX];
       assert backPos >= lastBackTracePos
           : "backPos=" + backPos + " vs lastBackTracePos=" + lastBackTracePos;
       // the length of the word without the whitespaces at the beginning.
       int length = pos - backWordPos;
       Type backType = posData.backType[bestIDX];
       int backID = posData.backID[bestIDX];
       int nextBestIDX = posData.backIndex[bestIDX];
       // the start of the word after the whitespace at the beginning.
       final int fragmentOffset = backWordPos - lastBackTracePos;
       assert fragmentOffset >= 0;

       final Dictionary dict = getDict(backType);

       if (outputUnknownUnigrams && backType == Type.UNKNOWN) {
         // outputUnknownUnigrams converts unknown word into unigrams:
         for (int i = length - 1; i >= 0; i--) {
           int charLen = 1;
           if (i > 0 && Character.isLowSurrogate(fragment[fragmentOffset + i])) {
             i--;
             charLen = 2;
           }
           final DictionaryToken token =
               new DictionaryToken(
                   Type.UNKNOWN,
                   unkDictionary,
                   CharacterDefinition.NGRAM,
                   fragment,
                   fragmentOffset + i,
                   charLen,
                   backWordPos + i,
                   backWordPos + i + charLen);
           pending.add(token);
           if (VERBOSE) {
             System.out.println("    add token=" + pending.get(pending.size() - 1));
           }
         }
       } else {
         final DictionaryToken token =
             new DictionaryToken(
                 backType,
                 dict,
                 backID,
                 fragment,
                 fragmentOffset,
                 length,
                 backWordPos,
                 backWordPos + length);
         if (token.getPOSType() == POS.Type.MORPHEME || mode == DecompoundMode.NONE) {
           if (shouldFilterToken(token) == false) {
             pending.add(token);
             if (VERBOSE) {
               System.out.println("    add token=" + pending.get(pending.size() - 1));
             }
           }
         } else {
           Dictionary.Morpheme[] morphemes = token.getMorphemes();
           if (morphemes == null) {
             pending.add(token);
             if (VERBOSE) {
               System.out.println("    add token=" + pending.get(pending.size() - 1));
             }
           } else {
             int endOffset = backWordPos + length;
             int posLen = 0;
             // decompose the compound
             for (int i = morphemes.length - 1; i >= 0; i--) {
               final Dictionary.Morpheme morpheme = morphemes[i];
               final Token compoundToken;
               if (token.getPOSType() == POS.Type.COMPOUND) {
                 assert endOffset - morpheme.surfaceForm.length() >= 0;
                 compoundToken =
                     new DecompoundToken(
                         morpheme.posTag,
                         morpheme.surfaceForm,
                         endOffset - morpheme.surfaceForm.length(),
                         endOffset);
               } else {
                 compoundToken =
                     new DecompoundToken(
                         morpheme.posTag,
                         morpheme.surfaceForm,
                         token.getStartOffset(),
                         token.getEndOffset());
               }
               if (i == 0 && mode == DecompoundMode.MIXED) {
                 compoundToken.setPositionIncrement(0);
               }
               ++posLen;
               endOffset -= morpheme.surfaceForm.length();
               pending.add(compoundToken);
               if (VERBOSE) {
                 System.out.println("    add token=" + pending.get(pending.size() - 1));
               }
             }
             if (mode == DecompoundMode.MIXED) {
               token.setPositionLength(Math.max(1, posLen));
               pending.add(token);
               if (VERBOSE) {
                 System.out.println("    add token=" + pending.get(pending.size() - 1));
               }
             }
           }
         }
       }
       if (discardPunctuation == false && backWordPos != backPos) {
         // Add a token for whitespaces between terms
         int offset = backPos - lastBackTracePos;
         int len = backWordPos - backPos;
         // System.out.println(offset + " " + fragmentOffset + " " + len + " " + backWordPos + " " +
         // backPos);
         unkDictionary.lookupWordIds(characterDefinition.getCharacterClass(' '), wordIdRef);
         DictionaryToken spaceToken =
             new DictionaryToken(
                 Type.UNKNOWN,
                 unkDictionary,
                 wordIdRef.ints[wordIdRef.offset],
                 fragment,
                 offset,
                 len,
                 backPos,
                 backPos + len);
         pending.add(spaceToken);
       }

       pos = backPos;
       bestIDX = nextBestIDX;
     }

     lastBackTracePos = endPos;

     if (VERBOSE) {
       System.out.println("  freeBefore pos=" + endPos);
     }
     // Notify the circular buffers that we are done with
     // these positions:
     buffer.freeBefore(endPos);
     positions.freeBefore(endPos);
   }

   Dictionary getDict(Type type) {
     return dictionaryMap.get(type);
   }

   private boolean shouldFilterToken(Token token) {
     return discardPunctuation && isPunctuation(token.getSurfaceForm()[token.getOffset()]);
   }

   private static boolean isPunctuation(char ch) {
     return isPunctuation(ch, Character.getType(ch));
   }

   private static boolean isPunctuation(char ch, int cid) {
     // special case for Hangul Letter Araea (interpunct)
     if (ch == 0x318D) {
       return true;
     }
     switch (cid) {
       case Character.SPACE_SEPARATOR:
       case Character.LINE_SEPARATOR:
       case Character.PARAGRAPH_SEPARATOR:
       case Character.CONTROL:
       case Character.FORMAT:
       case Character.DASH_PUNCTUATION:
       case Character.START_PUNCTUATION:
       case Character.END_PUNCTUATION:
       case Character.CONNECTOR_PUNCTUATION:
       case Character.OTHER_PUNCTUATION:
       case Character.MATH_SYMBOL:
       case Character.CURRENCY_SYMBOL:
       case Character.MODIFIER_SYMBOL:
       case Character.OTHER_SYMBOL:
       case Character.INITIAL_QUOTE_PUNCTUATION:
       case Character.FINAL_QUOTE_PUNCTUATION:
         return true;
       default:
         return false;
     }
   }

   private static boolean isCommonOrInherited(UnicodeScript script) {
     return script == UnicodeScript.INHERITED || script == UnicodeScript.COMMON;
   }

   /** Determine if two scripts are compatible. */
   private static boolean isSameScript(UnicodeScript scriptOne, UnicodeScript scriptTwo) {
     return scriptOne == scriptTwo
         || isCommonOrInherited(scriptOne)
         || isCommonOrInherited(scriptTwo);
   }
 }