/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ko;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.EnumMap;
import java.util.List;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ko.dict.CharacterDefinition;
import org.apache.lucene.analysis.ko.dict.ConnectionCosts;
import org.apache.lucene.analysis.ko.dict.Dictionary;
import org.apache.lucene.analysis.ko.dict.TokenInfoDictionary;
import org.apache.lucene.analysis.ko.dict.TokenInfoFST;
import org.apache.lucene.analysis.ko.dict.UnknownDictionary;
import org.apache.lucene.analysis.ko.dict.UserDictionary;
import org.apache.lucene.analysis.ko.tokenattributes.PartOfSpeechAttribute;
import org.apache.lucene.analysis.ko.tokenattributes.ReadingAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.util.RollingCharBuffer;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.AttributeFactory;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.fst.FST;
import static java.lang.Character.UnicodeScript;
/**
* Tokenizer for Korean that uses morphological analysis.
* <p>
* This tokenizer sets a number of additional attributes:
* <ul>
* <li>{@link PartOfSpeechAttribute} containing part-of-speech.
* <li>{@link ReadingAttribute} containing reading.
* </ul>
* <p>
* This tokenizer uses a rolling Viterbi search to find the
* least cost segmentation (path) of the incoming characters.
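* <p>
* A minimal usage sketch (illustrative): consume tokens through the standard
* TokenStream contract:
* <pre>
* try (KoreanTokenizer tokenizer = new KoreanTokenizer()) {
*   tokenizer.setReader(new StringReader("..."));
*   CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
*   PartOfSpeechAttribute posAtt = tokenizer.addAttribute(PartOfSpeechAttribute.class);
*   tokenizer.reset();
*   while (tokenizer.incrementToken()) {
*     System.out.println(termAtt + " " + posAtt.getPOSType());
*   }
*   tokenizer.end();
* }
* </pre>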
* @lucene.experimental
*/
public final class KoreanTokenizer extends Tokenizer {
/**
* Token type reflecting the original source of this token
*/
public enum Type {
/**
* Known words from the system dictionary.
*/
KNOWN,
/**
* Unknown words (heuristically segmented).
*/
UNKNOWN,
/**
* Known words from the user dictionary.
*/
USER
}
/**
* Decompound mode: this determines how the tokenizer handles
* {@link POS.Type#COMPOUND}, {@link POS.Type#INFLECT} and {@link POS.Type#PREANALYSIS} tokens.
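* <p>
* For example, given a compound such as 가곡역 ("Gagok Station"), {@code NONE}
* keeps 가곡역 as a single token, {@code DISCARD} outputs 가곡 + 역, and
* {@code MIXED} outputs 가곡역 + 가곡 + 역 (illustrative; the exact segmentation
* depends on the dictionary).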
*/
public enum DecompoundMode {
/**
* No decomposition for compound.
*/
NONE,
/**
* Decompose compounds and discard the original form (default).
*/
DISCARD,
/**
* Decompose compounds and keep the original form.
*/
MIXED
}
/**
* Default decompound mode ({@link DecompoundMode#DISCARD}).
*/
public static final DecompoundMode DEFAULT_DECOMPOUND = DecompoundMode.DISCARD;
private static final boolean VERBOSE = false;
// For safety:
private static final int MAX_UNKNOWN_WORD_LENGTH = 1024;
private static final int MAX_BACKTRACE_GAP = 1024;
private final EnumMap<Type, Dictionary> dictionaryMap = new EnumMap<>(Type.class);
private final TokenInfoFST fst;
private final TokenInfoDictionary dictionary;
private final UnknownDictionary unkDictionary;
private final ConnectionCosts costs;
private final UserDictionary userDictionary;
private final CharacterDefinition characterDefinition;
private final FST.Arc<Long> arc = new FST.Arc<>();
private final FST.BytesReader fstReader;
private final IntsRef wordIdRef = new IntsRef();
private final FST.BytesReader userFSTReader;
private final TokenInfoFST userFST;
private final boolean discardPunctuation;
private final DecompoundMode mode;
private final boolean outputUnknownUnigrams;
private final RollingCharBuffer buffer = new RollingCharBuffer();
private final WrappedPositionArray positions = new WrappedPositionArray();
// True once we've hit the EOF from the input reader:
private boolean end;
// Last absolute position we backtraced from:
private int lastBackTracePos;
// Next absolute position to process:
private int pos;
// Tokens that are already parsed but not yet passed to the caller:
private final List<Token> pending = new ArrayList<>();
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
private final PositionLengthAttribute posLengthAtt = addAttribute(PositionLengthAttribute.class);
private final PartOfSpeechAttribute posAtt = addAttribute(PartOfSpeechAttribute.class);
private final ReadingAttribute readingAtt = addAttribute(ReadingAttribute.class);
/**
* Creates a new KoreanTokenizer with default parameters.
* <p>
* Uses the default AttributeFactory.
*/
public KoreanTokenizer() {
this(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, null, DEFAULT_DECOMPOUND, false, true);
}
/**
* Create a new KoreanTokenizer using the system and unknown dictionaries shipped with Lucene.
*
* @param factory the AttributeFactory to use
* @param userDictionary optional user dictionary; may be null.
* @param mode the decompound mode.
* @param outputUnknownUnigrams if true, output unigrams for unknown words.
*/
public KoreanTokenizer(AttributeFactory factory, UserDictionary userDictionary, DecompoundMode mode, boolean outputUnknownUnigrams) {
this(factory, userDictionary, mode, outputUnknownUnigrams, true);
}
/**
* Create a new KoreanTokenizer using the system and unknown dictionaries shipped with Lucene.
*
* @param factory the AttributeFactory to use
* @param userDictionary optional user dictionary; may be null.
* @param mode the decompound mode.
* @param outputUnknownUnigrams if true, output unigrams for unknown words.
* @param discardPunctuation true if punctuation tokens should be dropped from the output.
*/
public KoreanTokenizer(AttributeFactory factory, UserDictionary userDictionary, DecompoundMode mode, boolean outputUnknownUnigrams, boolean discardPunctuation) {
this(factory,
TokenInfoDictionary.getInstance(),
UnknownDictionary.getInstance(),
ConnectionCosts.getInstance(),
userDictionary, mode, outputUnknownUnigrams, discardPunctuation);
}
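// Illustrative sketch (not part of this file): supplying a user dictionary.
// UserDictionary.open(Reader) reads entries whose first token is the surface
// form, optionally followed by its segmentation, e.g. "세종시 세종 시":
//
//   UserDictionary userDict = UserDictionary.open(new StringReader("세종시 세종 시"));
//   KoreanTokenizer tokenizer = new KoreanTokenizer(
//       TokenStream.DEFAULT_TOKEN_ATTRIBUTE_FACTORY, userDict,
//       DecompoundMode.DISCARD, false);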
/**
* <p>Create a new KoreanTokenizer supplying a custom system dictionary and unknown dictionary.
* This constructor provides an entry point for users who want to plug in custom dictionaries,
* such as ones built with {@link org.apache.lucene.analysis.ko.util.DictionaryBuilder}.</p>
*
* @param factory the AttributeFactory to use
* @param systemDictionary a custom known token dictionary
* @param unkDictionary a custom unknown token dictionary
* @param connectionCosts custom token transition costs
* @param userDictionary optional user dictionary; may be null.
* @param mode the decompound mode.
* @param outputUnknownUnigrams if true, output unigrams for unknown words.
* @param discardPunctuation true if punctuation tokens should be dropped from the output.
* @lucene.experimental
*/
public KoreanTokenizer(AttributeFactory factory,
TokenInfoDictionary systemDictionary,
UnknownDictionary unkDictionary,
ConnectionCosts connectionCosts,
UserDictionary userDictionary,
DecompoundMode mode,
boolean outputUnknownUnigrams,
boolean discardPunctuation) {
super(factory);
this.dictionary = systemDictionary;
this.fst = dictionary.getFST();
this.unkDictionary = unkDictionary;
this.characterDefinition = unkDictionary.getCharacterDefinition();
this.costs = connectionCosts;
this.userDictionary = userDictionary;
fstReader = fst.getBytesReader();
if (userDictionary != null) {
userFST = userDictionary.getFST();
userFSTReader = userFST.getBytesReader();
} else {
userFST = null;
userFSTReader = null;
}
this.mode = mode;
this.outputUnknownUnigrams = outputUnknownUnigrams;
this.discardPunctuation = discardPunctuation;
buffer.reset(this.input);
resetState();
dictionaryMap.put(Type.KNOWN, dictionary);
dictionaryMap.put(Type.UNKNOWN, unkDictionary);
dictionaryMap.put(Type.USER, userDictionary);
}
private GraphvizFormatter dotOut;
/** Expert: set this to produce graphviz (dot) output of
* the Viterbi lattice. */
public void setGraphvizFormatter(GraphvizFormatter dotOut) {
this.dotOut = dotOut;
}
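// Illustrative sketch (not part of this file): dumping the lattice for debugging.
// Assumes GraphvizFormatter's ConnectionCosts constructor and finish() accessor:
//
//   GraphvizFormatter dot = new GraphvizFormatter(ConnectionCosts.getInstance());
//   tokenizer.setGraphvizFormatter(dot);
//   // ... run the tokenizer over some input ...
//   String dotSource = dot.finish(); // render with the graphviz "dot" tool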
@Override
public void close() throws IOException {
super.close();
buffer.reset(input);
}
@Override
public void reset() throws IOException {
super.reset();
buffer.reset(input);
resetState();
}
private void resetState() {
positions.reset();
pos = 0;
end = false;
lastBackTracePos = 0;
pending.clear();
// Add BOS:
positions.get(0).add(0, 0, -1, -1, -1, -1, Type.KNOWN);
}
@Override
public void end() throws IOException {
super.end();
// Set final offset
int finalOffset = correctOffset(pos);
offsetAtt.setOffset(finalOffset, finalOffset);
}
// Holds all back pointers arriving at this position:
final static class Position {
int pos;
int count;
// maybe single int array * 5?
int[] costs = new int[8];
int[] lastRightID = new int[8];
int[] backPos = new int[8];
int[] backWordPos = new int[8];
int[] backIndex = new int[8];
int[] backID = new int[8];
Type[] backType = new Type[8];
public void grow() {
costs = ArrayUtil.grow(costs, 1+count);
lastRightID = ArrayUtil.grow(lastRightID, 1+count);
backPos = ArrayUtil.grow(backPos, 1+count);
backWordPos = ArrayUtil.grow(backWordPos, 1+count);
backIndex = ArrayUtil.grow(backIndex, 1+count);
backID = ArrayUtil.grow(backID, 1+count);
// NOTE: sneaky: grow separately because
// ArrayUtil.grow will otherwise pick a different
// length than the int[]s we just grew:
final Type[] newBackType = new Type[backID.length];
System.arraycopy(backType, 0, newBackType, 0, backType.length);
backType = newBackType;
}
public void add(int cost, int lastRightID, int backPos, int backRPos, int backIndex, int backID, Type backType) {
// NOTE: this isn't quite a true Viterbi search,
// because we should check if lastRightID is
// already present here, and only update if the new
// cost is less than the current cost, instead of
// simply appending. However, that will likely hurt
// performance (usually we add a lastRightID only once),
// and it means we actually create the full graph
// intersection instead of a "normal" Viterbi lattice:
if (count == costs.length) {
grow();
}
this.costs[count] = cost;
this.lastRightID[count] = lastRightID;
this.backPos[count] = backPos;
this.backWordPos[count] = backRPos;
this.backIndex[count] = backIndex;
this.backID[count] = backID;
this.backType[count] = backType;
count++;
}
public void reset() {
count = 0;
}
}
/**
* Returns the space penalty associated with the provided {@link POS.Tag}.
*
* @param leftPOS the left part of speech of the current token.
* @param numSpaces the number of spaces before the current token.
* @return the penalty added to the path cost when the token follows one or more spaces.
*/
private int computeSpacePenalty(POS.Tag leftPOS, int numSpaces) {
int spacePenalty = 0;
if (numSpaces > 0) {
// TODO we should extract the penalty (left-space-penalty-factor) from the dicrc file.
switch (leftPOS) {
case E: // verbal endings
case J: // particles
case VCP: // positive designator (copula)
case XSA: // adjective suffixes
case XSN: // noun suffixes
case XSV: // verb suffixes
spacePenalty = 3000;
break;
default:
break;
}
}
return spacePenalty;
}
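// Adds the candidate token wordID (from dict), spanning wordPos..endPos, to the
// lattice: picks the cheapest incoming path among all arcs arriving at
// fromPosData (path cost + bigram connection cost + space penalty), then records
// the extended path, with the word cost added, at endPos.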
private void add(Dictionary dict, Position fromPosData, int wordPos, int endPos, int wordID, Type type) {
final POS.Tag leftPOS = dict.getLeftPOS(wordID);
final int wordCost = dict.getWordCost(wordID);
final int leftID = dict.getLeftId(wordID);
int leastCost = Integer.MAX_VALUE;
int leastIDX = -1;
assert fromPosData.count > 0;
for(int idx=0;idx<fromPosData.count;idx++) {
// The number of spaces before the term
int numSpaces = wordPos - fromPosData.pos;
// Cost is path cost so far, plus word cost (added at
// end of loop), plus bigram cost and space penalty cost.
final int cost = fromPosData.costs[idx] + costs.get(fromPosData.lastRightID[idx], leftID) + computeSpacePenalty(leftPOS, numSpaces);
if (VERBOSE) {
System.out.println(" fromIDX=" + idx + ": cost=" + cost + " (prevCost=" + fromPosData.costs[idx] + " wordCost=" + wordCost + " bgCost=" + costs.get(fromPosData.lastRightID[idx], leftID) +
" spacePenalty=" + computeSpacePenalty(leftPOS, numSpaces) + ") leftID=" + leftID + " leftPOS=" + leftPOS.name() + ")");
}
if (cost < leastCost) {
leastCost = cost;
leastIDX = idx;
if (VERBOSE) {
System.out.println(" **");
}
}
}
leastCost += wordCost;
if (VERBOSE) {
System.out.println(" + cost=" + leastCost + " wordID=" + wordID + " leftID=" + leftID + " leastIDX=" + leastIDX + " toPos=" + endPos + " toPos.idx=" + positions.get(endPos).count);
}
positions.get(endPos).add(leastCost, dict.getRightId(wordID), fromPosData.pos, wordPos, leastIDX, wordID, type);
}
@Override
public boolean incrementToken() throws IOException {
// parse() is able to return without producing any new
// tokens when the tokens it produced were entirely
// punctuation. So we loop here until we get a real
// token or we hit the end:
while (pending.size() == 0) {
if (end) {
return false;
}
// Push Viterbi forward some more:
parse();
}
final Token token = pending.remove(pending.size()-1);
int length = token.getLength();
clearAttributes();
assert length > 0;
//System.out.println("off=" + token.getOffset() + " len=" + length + " vs " + token.getSurfaceForm().length);
termAtt.copyBuffer(token.getSurfaceForm(), token.getOffset(), length);
offsetAtt.setOffset(correctOffset(token.getStartOffset()), correctOffset(token.getEndOffset()));
posAtt.setToken(token);
readingAtt.setToken(token);
posIncAtt.setPositionIncrement(token.getPositionIncrement());
posLengthAtt.setPositionLength(token.getPositionLength());
if (VERBOSE) {
System.out.println(Thread.currentThread().getName() + ": incToken: return token=" + token);
}
return true;
}
// TODO: make generic'd version of this "circular array"?
// It's a bit tricky because we do things to the Position
// (eg, set .pos = N on reuse)...
static final class WrappedPositionArray {
private Position[] positions = new Position[8];
public WrappedPositionArray() {
for(int i=0;i<positions.length;i++) {
positions[i] = new Position();
}
}
// Next array index to write to in positions:
private int nextWrite;
// Next position to write:
private int nextPos;
// How many valid Position instances are held in the
// positions array:
private int count;
public void reset() {
nextWrite--;
while(count > 0) {
if (nextWrite == -1) {
nextWrite = positions.length - 1;
}
positions[nextWrite--].reset();
count--;
}
nextWrite = 0;
nextPos = 0;
count = 0;
}
/** Get Position instance for this absolute position;
* this is allowed to be arbitrarily far "in the
* future" but cannot be before the last freeBefore. */
public Position get(int pos) {
while(pos >= nextPos) {
//System.out.println("count=" + count + " vs len=" + positions.length);
if (count == positions.length) {
Position[] newPositions = new Position[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
//System.out.println("grow positions " + newPositions.length);
System.arraycopy(positions, nextWrite, newPositions, 0, positions.length-nextWrite);
System.arraycopy(positions, 0, newPositions, positions.length-nextWrite, nextWrite);
for(int i=positions.length;i<newPositions.length;i++) {
newPositions[i] = new Position();
}
nextWrite = positions.length;
positions = newPositions;
}
if (nextWrite == positions.length) {
nextWrite = 0;
}
// Should have already been reset:
assert positions[nextWrite].count == 0;
positions[nextWrite++].pos = nextPos++;
count++;
}
assert inBounds(pos);
final int index = getIndex(pos);
assert positions[index].pos == pos;
return positions[index];
}
public int getNextPos() {
return nextPos;
}
// For assert:
private boolean inBounds(int pos) {
return pos < nextPos && pos >= nextPos - count;
}
private int getIndex(int pos) {
int index = nextWrite - (nextPos - pos);
if (index < 0) {
index += positions.length;
}
return index;
}
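// Frees (resets) all Position instances before the given absolute position so
// their slots in the circular array can be reused.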
public void freeBefore(int pos) {
final int toFree = count - (nextPos - pos);
assert toFree >= 0;
assert toFree <= count;
int index = nextWrite - count;
if (index < 0) {
index += positions.length;
}
for(int i=0;i<toFree;i++) {
if (index == positions.length) {
index = 0;
}
//System.out.println(" fb idx=" + index);
positions[index].reset();
index++;
}
count -= toFree;
}
}
/* Incrementally parse some more characters. This runs
* the Viterbi search forwards "enough" so that we
* generate some more tokens. How far forward depends on
* the chars coming in, since some chars could cause
* longer-lasting ambiguity in the parsing. Once the
* ambiguity is resolved, we back trace, produce
* the pending tokens, and return. */
private void parse() throws IOException {
if (VERBOSE) {
System.out.println("\nPARSE");
}
// Index of the last character of the unknown word:
int unknownWordEndIndex = -1;
// Maximum posAhead of any user word in the entire input:
int userWordMaxPosAhead = -1;
// Advances over each position (character):
while (buffer.get(pos) != -1) {
final Position posData = positions.get(pos);
final boolean isFrontier = positions.getNextPos() == pos + 1;
if (posData.count == 0) {
// No arcs arrive here; move to next position:
if (VERBOSE) {
System.out.println(" no arcs in; skip pos=" + pos);
}
pos++;
continue;
}
if (pos > lastBackTracePos && posData.count == 1 && isFrontier) {
// We are at a "frontier", and only one node is
// alive, so whatever the eventual best path is must
// come through this node. So we can safely commit
// to the prefix of the best path at this point:
backtrace(posData, 0);
// Re-base cost so we don't risk int overflow:
posData.costs[0] = 0;
if (pending.size() > 0) {
return;
} else {
// This means the backtrace only produced
// punctuation tokens, so we must keep parsing.
}
}
if (pos - lastBackTracePos >= MAX_BACKTRACE_GAP) {
// Safety: if we've buffered too much, force a
// backtrace now. We find the least-cost partial
// path, across all paths, backtrace from it, and
// then prune all others. Note that this, in
// general, can produce the wrong result, if the
// total best path did not in fact back trace
// through this partial best path. But it's the
// best we can do... (short of not having a
// safety!).
// First pass: find least cost partial path so far,
// including ending at future positions:
int leastIDX = -1;
int leastCost = Integer.MAX_VALUE;
Position leastPosData = null;
for (int pos2 = pos; pos2 < positions.getNextPos(); pos2++) {
final Position posData2 = positions.get(pos2);
for (int idx = 0; idx < posData2.count; idx++) {
//System.out.println(" idx=" + idx + " cost=" + cost);
final int cost = posData2.costs[idx];
if (cost < leastCost) {
leastCost = cost;
leastIDX = idx;
leastPosData = posData2;
}
}
}
// We will always have at least one live path:
assert leastIDX != -1;
// Second pass: prune all but the best path:
for (int pos2 = pos; pos2 < positions.getNextPos(); pos2++) {
final Position posData2 = positions.get(pos2);
if (posData2 != leastPosData) {
posData2.reset();
} else {
if (leastIDX != 0) {
posData2.costs[0] = posData2.costs[leastIDX];
posData2.lastRightID[0] = posData2.lastRightID[leastIDX];
posData2.backPos[0] = posData2.backPos[leastIDX];
posData2.backWordPos[0] = posData2.backWordPos[leastIDX];
posData2.backIndex[0] = posData2.backIndex[leastIDX];
posData2.backID[0] = posData2.backID[leastIDX];
posData2.backType[0] = posData2.backType[leastIDX];
}
posData2.count = 1;
}
}
backtrace(leastPosData, 0);
// Re-base cost so we don't risk int overflow:
Arrays.fill(leastPosData.costs, 0, leastPosData.count, 0);
if (pos != leastPosData.pos) {
// We jumped into a future position:
assert pos < leastPosData.pos;
pos = leastPosData.pos;
}
if (pending.size() > 0) {
return;
} else {
// This means the backtrace only produced
// punctuation tokens, so we must keep parsing.
continue;
}
}
if (VERBOSE) {
System.out.println("\n extend @ pos=" + pos + " char=" + (char) buffer.get(pos) + " hex=" + Integer.toHexString(buffer.get(pos)));
}
if (VERBOSE) {
System.out.println(" " + posData.count + " arcs in");
}
// We add a single space separator as a prefix of the terms that we extract.
// This information is needed to compute the space penalty factor of each term.
// These whitespace prefixes are removed when the final tokens are generated, or
// added as separate tokens when discardPunctuation is unset.
if (Character.getType(buffer.get(pos)) == Character.SPACE_SEPARATOR) {
if (buffer.get(++pos) == -1) {
pos = posData.pos;
}
}
boolean anyMatches = false;
// First try user dict:
if (userFST != null) {
userFST.getFirstArc(arc);
int output = 0;
int maxPosAhead = 0;
int outputMaxPosAhead = 0;
int arcFinalOutMaxPosAhead = 0;
for (int posAhead = pos; ; posAhead++) {
final int ch = buffer.get(posAhead);
if (ch == -1) {
break;
}
if (userFST.findTargetArc(ch, arc, arc, posAhead == pos, userFSTReader) == null) {
break;
}
output += arc.output().intValue();
if (arc.isFinal()) {
maxPosAhead = posAhead;
outputMaxPosAhead = output;
arcFinalOutMaxPosAhead = arc.nextFinalOutput().intValue();
anyMatches = true;
}
}
// Longest matching for user words:
if (anyMatches && maxPosAhead > userWordMaxPosAhead) {
if (VERBOSE) {
System.out.println(" USER word " + new String(buffer.get(pos, maxPosAhead + 1)) + " toPos=" + (maxPosAhead + 1));
}
add(userDictionary, posData, pos, maxPosAhead + 1, outputMaxPosAhead + arcFinalOutMaxPosAhead, Type.USER);
userWordMaxPosAhead = Math.max(userWordMaxPosAhead, maxPosAhead);
}
}
// TODO: we can be more aggressive about user
// matches? if we are "under" a user match then don't
// extend KNOWN/UNKNOWN paths?
if (!anyMatches) {
// Next, try known dictionary matches
fst.getFirstArc(arc);
int output = 0;
for (int posAhead = pos; ; posAhead++) {
final int ch = buffer.get(posAhead);
if (ch == -1) {
break;
}
//System.out.println(" match " + (char) ch + " posAhead=" + posAhead);
if (fst.findTargetArc(ch, arc, arc, posAhead == pos, fstReader) == null) {
break;
}
output += arc.output().intValue();
// Optimization: for known words that are too long
// (compound), we should pre-compute the 2nd
// best segmentation and store it in the
// dictionary instead of recomputing it each time a
// match is found.
if (arc.isFinal()) {
dictionary.lookupWordIds(output + arc.nextFinalOutput().intValue(), wordIdRef);
if (VERBOSE) {
System.out.println(" KNOWN word " + new String(buffer.get(pos, posAhead - pos + 1)) + " toPos=" + (posAhead + 1) + " " + wordIdRef.length + " wordIDs");
}
for (int ofs = 0; ofs < wordIdRef.length; ofs++) {
add(dictionary, posData, pos, posAhead + 1, wordIdRef.ints[wordIdRef.offset + ofs], Type.KNOWN);
anyMatches = true;
}
}
}
}
if (unknownWordEndIndex > posData.pos) {
pos++;
continue;
}
final char firstCharacter = (char) buffer.get(pos);
if (!anyMatches || characterDefinition.isInvoke(firstCharacter)) {
// Find unknown match:
int characterId = characterDefinition.getCharacterClass(firstCharacter);
// NOTE: copied from UnknownDictionary.lookup:
int unknownWordLength;
if (!characterDefinition.isGroup(firstCharacter)) {
unknownWordLength = 1;
} else {
// Extract the unknown word. Characters with the same script are considered part of the unknown word.
unknownWordLength = 1;
UnicodeScript scriptCode = UnicodeScript.of(firstCharacter);
final boolean isPunct = isPunctuation(firstCharacter);
final boolean isDigit = Character.isDigit(firstCharacter);
for (int posAhead = pos + 1; unknownWordLength < MAX_UNKNOWN_WORD_LENGTH; posAhead++) {
int next = buffer.get(posAhead);
if (next == -1) {
break;
}
char ch = (char) next;
int chType = Character.getType(ch);
UnicodeScript sc = UnicodeScript.of(next);
boolean sameScript = isSameScript(scriptCode, sc)
// Non-spacing marks inherit the script of their base character,
// following recommendations from UTR #24.
|| chType == Character.NON_SPACING_MARK;
if (sameScript
// split on punctuation
&& isPunctuation(ch, chType) == isPunct
// split on digit
&& Character.isDigit(ch) == isDigit
&& characterDefinition.isGroup(ch)) {
unknownWordLength++;
} else {
break;
}
// Update the script code and character class if the original script
// is Inherited or Common.
if (isCommonOrInherited(scriptCode) && isCommonOrInherited(sc) == false) {
scriptCode = sc;
characterId = characterDefinition.getCharacterClass(ch);
}
}
}
unkDictionary.lookupWordIds(characterId, wordIdRef); // all characters of the unknown word are assumed to share this character class
if (VERBOSE) {
System.out.println(" UNKNOWN word len=" + unknownWordLength + " " + wordIdRef.length + " wordIDs");
}
for (int ofs = 0; ofs < wordIdRef.length; ofs++) {
add(unkDictionary, posData, pos, pos + unknownWordLength, wordIdRef.ints[wordIdRef.offset + ofs], Type.UNKNOWN);
}
}
pos++;
}
end = true;
if (pos > 0) {
final Position endPosData = positions.get(pos);
int leastCost = Integer.MAX_VALUE;
int leastIDX = -1;
if (VERBOSE) {
System.out.println(" end: " + endPosData.count + " nodes");
}
for(int idx=0;idx<endPosData.count;idx++) {
// Add EOS cost:
final int cost = endPosData.costs[idx] + costs.get(endPosData.lastRightID[idx], 0);
//System.out.println(" idx=" + idx + " cost=" + cost + " (pathCost=" + endPosData.costs[idx] + " bgCost=" + costs.get(endPosData.lastRightID[idx], 0) + ") backPos=" + endPosData.backPos[idx]);
if (cost < leastCost) {
leastCost = cost;
leastIDX = idx;
}
}
backtrace(endPosData, leastIDX);
} else {
// No characters in the input string; return no tokens!
}
}
// Backtrace from the provided position, back to the last
// time we back-traced, accumulating the resulting tokens to
// the pending list. The pending list is then in-reverse
// (last token should be returned first).
private void backtrace(final Position endPosData, final int fromIDX) {
final int endPos = endPosData.pos;
if (VERBOSE) {
System.out.println("\n backtrace: endPos=" + endPos + " pos=" + pos + "; " + (pos - lastBackTracePos) + " characters; last=" + lastBackTracePos + " cost=" + endPosData.costs[fromIDX]);
}
final char[] fragment = buffer.get(lastBackTracePos, endPos-lastBackTracePos);
if (dotOut != null) {
dotOut.onBacktrace(this, positions, lastBackTracePos, endPosData, fromIDX, fragment, end);
}
int pos = endPos;
int bestIDX = fromIDX;
// TODO: sort of silly to make Token instances here; the
// back trace has all info needed to generate the
// token. So, we could just directly set the attrs,
// from the backtrace, in incrementToken w/o ever
// creating Token; we'd have to defer calling freeBefore
// until after the backtrace was fully "consumed" by
// incrementToken.
while (pos > lastBackTracePos) {
//System.out.println("BT: back pos=" + pos + " bestIDX=" + bestIDX);
final Position posData = positions.get(pos);
assert bestIDX < posData.count;
int backPos = posData.backPos[bestIDX];
int backWordPos = posData.backWordPos[bestIDX];
assert backPos >= lastBackTracePos: "backPos=" + backPos + " vs lastBackTracePos=" + lastBackTracePos;
// the length of the word without the whitespace at the beginning.
int length = pos - backWordPos;
Type backType = posData.backType[bestIDX];
int backID = posData.backID[bestIDX];
int nextBestIDX = posData.backIndex[bestIDX];
// the start of the word after the whitespace at the beginning.
final int fragmentOffset = backWordPos - lastBackTracePos;
assert fragmentOffset >= 0;
final Dictionary dict = getDict(backType);
if (outputUnknownUnigrams && backType == Type.UNKNOWN) {
// outputUnknownUnigrams converts unknown word into unigrams:
for (int i = length - 1; i >= 0; i--) {
int charLen = 1;
if (i > 0 && Character.isLowSurrogate(fragment[fragmentOffset + i])) {
i--;
charLen = 2;
}
final DictionaryToken token = new DictionaryToken(Type.UNKNOWN,
unkDictionary,
CharacterDefinition.NGRAM,
fragment,
fragmentOffset+i,
charLen,
backWordPos+i,
backWordPos+i+charLen
);
pending.add(token);
if (VERBOSE) {
System.out.println(" add token=" + pending.get(pending.size() - 1));
}
}
} else {
final DictionaryToken token = new DictionaryToken(backType,
dict,
backID,
fragment,
fragmentOffset,
length,
backWordPos,
backWordPos + length
);
if (token.getPOSType() == POS.Type.MORPHEME || mode == DecompoundMode.NONE) {
if (shouldFilterToken(token) == false) {
pending.add(token);
if (VERBOSE) {
System.out.println(" add token=" + pending.get(pending.size() - 1));
}
}
} else {
Dictionary.Morpheme[] morphemes = token.getMorphemes();
if (morphemes == null) {
pending.add(token);
if (VERBOSE) {
System.out.println(" add token=" + pending.get(pending.size() - 1));
}
} else {
int endOffset = backWordPos + length;
int posLen = 0;
// decompose the compound
for (int i = morphemes.length - 1; i >= 0; i--) {
final Dictionary.Morpheme morpheme = morphemes[i];
final Token compoundToken;
if (token.getPOSType() == POS.Type.COMPOUND) {
assert endOffset - morpheme.surfaceForm.length() >= 0;
compoundToken = new DecompoundToken(morpheme.posTag, morpheme.surfaceForm,
endOffset - morpheme.surfaceForm.length(), endOffset);
} else {
compoundToken = new DecompoundToken(morpheme.posTag, morpheme.surfaceForm, token.getStartOffset(), token.getEndOffset());
}
if (i == 0 && mode == DecompoundMode.MIXED) {
compoundToken.setPositionIncrement(0);
}
++ posLen;
endOffset -= morpheme.surfaceForm.length();
pending.add(compoundToken);
if (VERBOSE) {
System.out.println(" add token=" + pending.get(pending.size() - 1));
}
}
if (mode == DecompoundMode.MIXED) {
token.setPositionLength(Math.max(1, posLen));
pending.add(token);
if (VERBOSE) {
System.out.println(" add token=" + pending.get(pending.size() - 1));
}
}
}
}
}
if (discardPunctuation == false && backWordPos != backPos) {
// Add a token for whitespaces between terms
int offset = backPos - lastBackTracePos;
int len = backWordPos - backPos;
//System.out.println(offset + " " + fragmentOffset + " " + len + " " + backWordPos + " " + backPos);
unkDictionary.lookupWordIds(characterDefinition.getCharacterClass(' '), wordIdRef);
DictionaryToken spaceToken = new DictionaryToken(Type.UNKNOWN, unkDictionary,
wordIdRef.ints[wordIdRef.offset], fragment, offset, len, backPos, backPos+len);
pending.add(spaceToken);
}
pos = backPos;
bestIDX = nextBestIDX;
}
lastBackTracePos = endPos;
if (VERBOSE) {
System.out.println(" freeBefore pos=" + endPos);
}
// Notify the circular buffers that we are done with
// these positions:
buffer.freeBefore(endPos);
positions.freeBefore(endPos);
}
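// Maps a token Type back to the dictionary that produced it.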
Dictionary getDict(Type type) {
return dictionaryMap.get(type);
}
private boolean shouldFilterToken(Token token) {
return discardPunctuation && isPunctuation(token.getSurfaceForm()[token.getOffset()]);
}
private static boolean isPunctuation(char ch) {
return isPunctuation(ch, Character.getType(ch));
}
private static boolean isPunctuation(char ch, int cid) {
// special case for Hangul Letter Araea (interpunct)
if (ch == 0x318D) {
return true;
}
switch(cid) {
case Character.SPACE_SEPARATOR:
case Character.LINE_SEPARATOR:
case Character.PARAGRAPH_SEPARATOR:
case Character.CONTROL:
case Character.FORMAT:
case Character.DASH_PUNCTUATION:
case Character.START_PUNCTUATION:
case Character.END_PUNCTUATION:
case Character.CONNECTOR_PUNCTUATION:
case Character.OTHER_PUNCTUATION:
case Character.MATH_SYMBOL:
case Character.CURRENCY_SYMBOL:
case Character.MODIFIER_SYMBOL:
case Character.OTHER_SYMBOL:
case Character.INITIAL_QUOTE_PUNCTUATION:
case Character.FINAL_QUOTE_PUNCTUATION:
return true;
default:
return false;
}
}
private static boolean isCommonOrInherited(UnicodeScript script) {
return script == UnicodeScript.INHERITED ||
script == UnicodeScript.COMMON;
}
/** Determine if two scripts are compatible. */
private static boolean isSameScript(UnicodeScript scriptOne, UnicodeScript scriptTwo) {
return scriptOne == scriptTwo
|| isCommonOrInherited(scriptOne)
|| isCommonOrInherited(scriptTwo);
}
}