lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.analysis.synonym;

 import java.io.IOException;

 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.core.FlattenGraphFilter;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.apache.lucene.store.ByteArrayDataInput;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.AttributeSource;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.CharsRefBuilder;
 import org.apache.lucene.util.fst.FST;

 /**
  * Matches single or multi word synonyms in a token stream.
  * This token stream cannot properly handle position
  * increments != 1, ie, you should place this filter before
  * filtering out stop words.
  *
  * <p>Note that with the current implementation, parsing is
  * greedy, so whenever multiple parses would apply, the rule
  * starting the earliest and parsing the most tokens wins.
  * For example if you have these rules:
  *
  * <pre>
  *   a -&gt; x
  *   a b -&gt; y
  *   b c d -&gt; z
  * </pre>
  *
  * Then input <code>a b c d e</code> parses to <code>y b c
  * d</code>, ie the 2nd rule "wins" because it started
  * earliest and matched the most input tokens of other rules
  * starting at that point.
  *
  * <p>A future improvement to this filter could allow
  * non-greedy parsing, such that the 3rd rule would win, and
  * also separately allow multiple parses, such that all 3
  * rules would match, perhaps even on a rule by rule
  * basis.</p>
  *
  * <p><b>NOTE</b>: when a match occurs, the output tokens
  * associated with the matching rule are "stacked" on top of
  * the input stream (if the rule had
  * <code>keepOrig=true</code>) and also on top of another
  * matched rule's output tokens.  This is not a correct
  * solution, as really the output should be an arbitrary
  * graph/lattice.  For example, with the above match, you
  * would expect an exact <code>PhraseQuery</code> <code>"y b
  * c"</code> to match the parsed tokens, but it will fail to
  * do so.  This limitation is necessary because Lucene's
  * TokenStream (and index) cannot yet represent an arbitrary
  * graph.</p>
  *
  * <p><b>NOTE</b>: If multiple incoming tokens arrive on the
  * same position, only the first token at that position is
  * used for parsing.  Subsequent tokens simply pass through
  * and are not parsed.  A future improvement would be to
  * allow these tokens to also be matched.</p>
  *
  * @deprecated Use {@link SynonymGraphFilter} instead, but be sure to also
  * use {@link FlattenGraphFilter} at index time (not at search time) as well.
  */

 // TODO: maybe we should resolve token -> wordID then run
 // FST on wordIDs, for better perf?

 // TODO: a more efficient approach would be Aho/Corasick's
 // algorithm
 // http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm
 // It improves over the current approach here
 // because it does not fully re-start matching at every
 // token.  For example if one pattern is "a b c x"
 // and another is "b c d" and the input is "a b c d", on
 // trying to parse "a b c x" but failing when you got to x,
 // rather than starting over again your really should
 // immediately recognize that "b c d" matches at the next
 // input.  I suspect this won't matter that much in
 // practice, but it's possible on some set of synonyms it
 // will.  We'd have to modify Aho/Corasick to enforce our
 // conflict resolving (eg greedy matching) because that algo
 // finds all matches.  This really amounts to adding a .*
 // closure to the FST and then determinizing it.
 //
 // Another possible solution is described at http://www.cis.uni-muenchen.de/people/Schulz/Pub/dictle5.ps

 @Deprecated
 public final class SynonymFilter extends TokenFilter {

   public static final String TYPE_SYNONYM = "SYNONYM";

   private final SynonymMap synonyms;

   private final boolean ignoreCase;
   private final int rollBufferSize;

   private int captureCount;

   // TODO: we should set PositionLengthAttr too...

   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
   private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
   private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
   private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

   // How many future input tokens have already been matched
   // to a synonym; because the matching is "greedy" we don't
   // try to do any more matching for such tokens:
   private int inputSkipCount;

   // Hold all buffered (read ahead) stacked input tokens for
   // a future position.  When multiple tokens are at the
   // same position, we only store (and match against) the
   // term for the first token at the position, but capture
   // state for (and enumerate) all other tokens at this
   // position:
   private static class PendingInput {
     final CharsRefBuilder term = new CharsRefBuilder();
     AttributeSource.State state;
     boolean keepOrig;
     boolean matched;
     boolean consumed = true;
     int startOffset;
     int endOffset;

     public void reset() {
       state = null;
       consumed = true;
       keepOrig = false;
       matched = false;
     }
   };

   // Rolling buffer, holding pending input tokens we had to
   // clone because we needed to look ahead, indexed by
   // position:
   private final PendingInput[] futureInputs;

   // Holds pending output synonyms for one future position:
   private static class PendingOutputs {
     CharsRefBuilder[] outputs;
     int[] endOffsets;
     int[] posLengths;
     int upto;
     int count;
     int posIncr = 1;
     int lastEndOffset;
     int lastPosLength;

     public PendingOutputs() {
       outputs = new CharsRefBuilder[1];
       endOffsets = new int[1];
       posLengths = new int[1];
     }

     public void reset() {
       upto = count = 0;
       posIncr = 1;
     }

     public CharsRef pullNext() {
       assert upto < count;
       lastEndOffset = endOffsets[upto];
       lastPosLength = posLengths[upto];
       final CharsRefBuilder result = outputs[upto++];
       posIncr = 0;
       if (upto == count) {
         reset();
       }
       return result.get();
     }

     public int getLastEndOffset() {
       return lastEndOffset;
     }

     public int getLastPosLength() {
       return lastPosLength;
     }

     public void add(char[] output, int offset, int len, int endOffset, int posLength) {
       if (count == outputs.length) {
         outputs = ArrayUtil.grow(outputs, count+1);
       }
       if (count == endOffsets.length) {
         final int[] next = new int[ArrayUtil.oversize(1+count, Integer.BYTES)];
         System.arraycopy(endOffsets, 0, next, 0, count);
         endOffsets = next;
       }
       if (count == posLengths.length) {
         final int[] next = new int[ArrayUtil.oversize(1+count, Integer.BYTES)];
         System.arraycopy(posLengths, 0, next, 0, count);
         posLengths = next;
       }
       if (outputs[count] == null) {
         outputs[count] = new CharsRefBuilder();
       }
       outputs[count].copyChars(output, offset, len);
       // endOffset can be -1, in which case we should simply
       // use the endOffset of the input token, or X >= 0, in
       // which case we use X as the endOffset for this output
       endOffsets[count] = endOffset;
       posLengths[count] = posLength;
       count++;
     }
   };

   private final ByteArrayDataInput bytesReader = new ByteArrayDataInput();

   // Rolling buffer, holding stack of pending synonym
   // outputs, indexed by position:
   private final PendingOutputs[] futureOutputs;

   // Where (in rolling buffers) to write next input saved state:
   private int nextWrite;

   // Where (in rolling buffers) to read next input saved state:
   private int nextRead;

   // True once we've read last token
   private boolean finished;

   private final FST.Arc<BytesRef> scratchArc;

   private final FST<BytesRef> fst;

   private final FST.BytesReader fstReader;


   private final BytesRef scratchBytes = new BytesRef();
   private final CharsRefBuilder scratchChars = new CharsRefBuilder();

   /**
    * @param input input tokenstream
    * @param synonyms synonym map
    * @param ignoreCase case-folds input for matching with {@link Character#toLowerCase(int)}.
    *                   Note, if you set this to true, it's your responsibility to lowercase
    *                   the input entries when you create the {@link SynonymMap}
    */
   public SynonymFilter(TokenStream input, SynonymMap synonyms, boolean ignoreCase) {
     super(input);
     this.synonyms = synonyms;
     this.ignoreCase = ignoreCase;
     this.fst = synonyms.fst;
     if (fst == null) {
       throw new IllegalArgumentException("fst must be non-null");
     }
     this.fstReader = fst.getBytesReader();

     // Must be 1+ so that when roll buffer is at full
     // lookahead we can distinguish this full buffer from
     // the empty buffer:
     rollBufferSize = 1+synonyms.maxHorizontalContext;

     futureInputs = new PendingInput[rollBufferSize];
     futureOutputs = new PendingOutputs[rollBufferSize];
     for(int pos=0;pos<rollBufferSize;pos++) {
       futureInputs[pos] = new PendingInput();
       futureOutputs[pos] = new PendingOutputs();
     }

     //System.out.println("FSTFilt maxH=" + synonyms.maxHorizontalContext);

     scratchArc = new FST.Arc<>();
   }

   private void capture() {
     captureCount++;
     //System.out.println("  capture slot=" + nextWrite);
     final PendingInput input = futureInputs[nextWrite];

     input.state = captureState();
     input.consumed = false;
     input.term.copyChars(termAtt.buffer(), 0, termAtt.length());

     nextWrite = rollIncr(nextWrite);

     // Buffer head should never catch up to tail:
     assert nextWrite != nextRead;
   }

   /*
    This is the core of this TokenFilter: it locates the
    synonym matches and buffers up the results into
    futureInputs/Outputs.

    NOTE: this calls input.incrementToken and does not
    capture the state if no further tokens were checked.  So
    caller must then forward state to our caller, or capture:
   */
   private int lastStartOffset;
   private int lastEndOffset;

   private void parse() throws IOException {
     //System.out.println("\nS: parse");

     assert inputSkipCount == 0;

     int curNextRead = nextRead;

     // Holds the longest match we've seen so far:
     BytesRef matchOutput = null;
     int matchInputLength = 0;
     int matchEndOffset = -1;

     BytesRef pendingOutput = fst.outputs.getNoOutput();
     fst.getFirstArc(scratchArc);

     assert scratchArc.output() == fst.outputs.getNoOutput();

     int tokenCount = 0;

     byToken:
     while(true) {

       // Pull next token's chars:
       final char[] buffer;
       final int bufferLen;
       //System.out.println("  cycle nextRead=" + curNextRead + " nextWrite=" + nextWrite);

       int inputEndOffset = 0;

       if (curNextRead == nextWrite) {

         // We used up our lookahead buffer of input tokens
         // -- pull next real input token:

         if (finished) {
           break;
         } else  {
           //System.out.println("  input.incrToken");
           assert futureInputs[nextWrite].consumed;
           // Not correct: a syn match whose output is longer
           // than its input can set future inputs keepOrig
           // to true:
           //assert !futureInputs[nextWrite].keepOrig;
           if (input.incrementToken()) {
             buffer = termAtt.buffer();
             bufferLen = termAtt.length();
             final PendingInput input = futureInputs[nextWrite];
             lastStartOffset = input.startOffset = offsetAtt.startOffset();
             lastEndOffset = input.endOffset = offsetAtt.endOffset();
             inputEndOffset = input.endOffset;
             //System.out.println("  new token=" + new String(buffer, 0, bufferLen));
             if (nextRead != nextWrite) {
               capture();
             } else {
               input.consumed = false;
             }

           } else {
             // No more input tokens
             //System.out.println("      set end");
             finished = true;
             break;
           }
         }
       } else {
         // Still in our lookahead
         buffer = futureInputs[curNextRead].term.chars();
         bufferLen = futureInputs[curNextRead].term.length();
         inputEndOffset = futureInputs[curNextRead].endOffset;
         //System.out.println("  old token=" + new String(buffer, 0, bufferLen));
       }

       tokenCount++;

       // Run each char in this token through the FST:
       int bufUpto = 0;
       while(bufUpto < bufferLen) {
         final int codePoint = Character.codePointAt(buffer, bufUpto, bufferLen);
         if (fst.findTargetArc(ignoreCase ? Character.toLowerCase(codePoint) : codePoint, scratchArc, scratchArc, fstReader) == null) {
           //System.out.println("    stop");
           break byToken;
         }

         // Accum the output
         pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output());
         //System.out.println("    char=" + buffer[bufUpto] + " output=" + pendingOutput + " arc.output=" + scratchArc.output);
         bufUpto += Character.charCount(codePoint);
       }

       // OK, entire token matched; now see if this is a final
       // state:
       if (scratchArc.isFinal()) {
         matchOutput = fst.outputs.add(pendingOutput, scratchArc.nextFinalOutput());
         matchInputLength = tokenCount;
         matchEndOffset = inputEndOffset;
         //System.out.println("  found matchLength=" + matchInputLength + " output=" + matchOutput);
       }

       // See if the FST wants to continue matching (ie, needs to
       // see the next input token):
       if (fst.findTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, scratchArc, fstReader) == null) {
         // No further rules can match here; we're done
         // searching for matching rules starting at the
         // current input position.
         break;
       } else {
         // More matching is possible -- accum the output (if
         // any) of the WORD_SEP arc:
         pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output());
         if (nextRead == nextWrite) {
           capture();
         }
       }

       curNextRead = rollIncr(curNextRead);
     }

     if (nextRead == nextWrite && !finished) {
       //System.out.println("  skip write slot=" + nextWrite);
       nextWrite = rollIncr(nextWrite);
     }

     if (matchOutput != null) {
       //System.out.println("  add matchLength=" + matchInputLength + " output=" + matchOutput);
       inputSkipCount = matchInputLength;
       addOutput(matchOutput, matchInputLength, matchEndOffset);
     } else if (nextRead != nextWrite) {
       // Even though we had no match here, we set to 1
       // because we need to skip current input token before
       // trying to match again:
       inputSkipCount = 1;
     } else {
       assert finished;
     }

     //System.out.println("  parse done inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead + " nextWrite=" + nextWrite);
   }

   // Interleaves all output tokens onto the futureOutputs:
   private void addOutput(BytesRef bytes, int matchInputLength, int matchEndOffset) {
     bytesReader.reset(bytes.bytes, bytes.offset, bytes.length);

     final int code = bytesReader.readVInt();
     final boolean keepOrig = (code & 0x1) == 0;
     final int count = code >>> 1;
     //System.out.println("  addOutput count=" + count + " keepOrig=" + keepOrig);
     for(int outputIDX=0;outputIDX<count;outputIDX++) {
       synonyms.words.get(bytesReader.readVInt(),
                          scratchBytes);
       //System.out.println("    outIDX=" + outputIDX + " bytes=" + scratchBytes.length);
       scratchChars.copyUTF8Bytes(scratchBytes);
       int lastStart = 0;
       final int chEnd = lastStart + scratchChars.length();
       int outputUpto = nextRead;
       for(int chIDX=lastStart;chIDX<=chEnd;chIDX++) {
         if (chIDX == chEnd || scratchChars.charAt(chIDX) == SynonymMap.WORD_SEPARATOR) {
           final int outputLen = chIDX - lastStart;
           // Caller is not allowed to have empty string in
           // the output:
           assert outputLen > 0: "output contains empty string: " + scratchChars;
           final int endOffset;
           final int posLen;
           if (chIDX == chEnd && lastStart == 0) {
             // This rule had a single output token, so, we set
             // this output's endOffset to the current
             // endOffset (ie, endOffset of the last input
             // token it matched):
             endOffset = matchEndOffset;
             posLen = keepOrig ? matchInputLength : 1;
           } else {
             // This rule has more than one output token; we
             // can't pick any particular endOffset for this
             // case, so, we inherit the endOffset for the
             // input token which this output overlaps:
             endOffset = -1;
             posLen = 1;
           }
           futureOutputs[outputUpto].add(scratchChars.chars(), lastStart, outputLen, endOffset, posLen);
           //System.out.println("      " + new String(scratchChars.chars, lastStart, outputLen) + " outputUpto=" + outputUpto);
           lastStart = 1+chIDX;
           //System.out.println("  slot=" + outputUpto + " keepOrig=" + keepOrig);
           outputUpto = rollIncr(outputUpto);
           assert futureOutputs[outputUpto].posIncr == 1: "outputUpto=" + outputUpto + " vs nextWrite=" + nextWrite;
         }
       }
     }

     int upto = nextRead;
     for(int idx=0;idx<matchInputLength;idx++) {
       futureInputs[upto].keepOrig |= keepOrig;
       futureInputs[upto].matched = true;
       upto = rollIncr(upto);
     }
   }

   // ++ mod rollBufferSize
   private int rollIncr(int count) {
     count++;
     if (count == rollBufferSize) {
       return 0;
     } else {
       return count;
     }
   }

   // for testing
   int getCaptureCount() {
     return captureCount;
   }

   @Override
   public boolean incrementToken() throws IOException {

     //System.out.println("\nS: incrToken inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead + " nextWrite=" + nextWrite);

     while(true) {

       // First play back any buffered future inputs/outputs
       // w/o running parsing again:
       while (inputSkipCount != 0) {

         // At each position, we first output the original
         // token

         // TODO: maybe just a PendingState class, holding
         // both input & outputs?
         final PendingInput input = futureInputs[nextRead];
         final PendingOutputs outputs = futureOutputs[nextRead];

         //System.out.println("  cycle nextRead=" + nextRead + " nextWrite=" + nextWrite + " inputSkipCount="+ inputSkipCount + " input.keepOrig=" + input.keepOrig + " input.consumed=" + input.consumed + " input.state=" + input.state);

         if (!input.consumed && (input.keepOrig || !input.matched)) {
           if (input.state != null) {
             // Return a previously saved token (because we
             // had to lookahead):
             restoreState(input.state);
           } else {
             // Pass-through case: return token we just pulled
             // but didn't capture:
             assert inputSkipCount == 1: "inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead;
           }
           input.reset();
           if (outputs.count > 0) {
             outputs.posIncr = 0;
           } else {
             nextRead = rollIncr(nextRead);
             inputSkipCount--;
           }
           //System.out.println("  return token=" + termAtt.toString());
           return true;
         } else if (outputs.upto < outputs.count) {
           // Still have pending outputs to replay at this
           // position
           input.reset();
           final int posIncr = outputs.posIncr;
           final CharsRef output = outputs.pullNext();
           clearAttributes();
           termAtt.copyBuffer(output.chars, output.offset, output.length);
           typeAtt.setType(TYPE_SYNONYM);
           int endOffset = outputs.getLastEndOffset();
           if (endOffset == -1) {
             endOffset = input.endOffset;
           }
           offsetAtt.setOffset(input.startOffset, endOffset);
           posIncrAtt.setPositionIncrement(posIncr);
           posLenAtt.setPositionLength(outputs.getLastPosLength());
           if (outputs.count == 0) {
             // Done with the buffered input and all outputs at
             // this position
             nextRead = rollIncr(nextRead);
             inputSkipCount--;
           }
           //System.out.println("  return token=" + termAtt.toString());
           return true;
         } else {
           // Done with the buffered input and all outputs at
           // this position
           input.reset();
           nextRead = rollIncr(nextRead);
           inputSkipCount--;
         }
       }

       if (finished && nextRead == nextWrite) {
         // End case: if any output syns went beyond end of
         // input stream, enumerate them now:
         final PendingOutputs outputs = futureOutputs[nextRead];
         if (outputs.upto < outputs.count) {
           final int posIncr = outputs.posIncr;
           final CharsRef output = outputs.pullNext();
           futureInputs[nextRead].reset();
           if (outputs.count == 0) {
             nextWrite = nextRead = rollIncr(nextRead);
           }
           clearAttributes();
           // Keep offset from last input token:
           offsetAtt.setOffset(lastStartOffset, lastEndOffset);
           termAtt.copyBuffer(output.chars, output.offset, output.length);
           typeAtt.setType(TYPE_SYNONYM);
           //System.out.println("  set posIncr=" + outputs.posIncr + " outputs=" + outputs);
           posIncrAtt.setPositionIncrement(posIncr);
           //System.out.println("  return token=" + termAtt.toString());
           return true;
         } else {
           return false;
         }
       }

       // Find new synonym matches:
       parse();
     }
   }

   @Override
   public void reset() throws IOException {
     super.reset();
     captureCount = 0;
     finished = false;
     inputSkipCount = 0;
     nextRead = nextWrite = 0;

     // In normal usage these resets would not be needed,
     // since they reset-as-they-are-consumed, but the app
     // may not consume all input tokens (or we might hit an
     // exception), in which case we have leftover state
     // here:
     for (PendingInput input : futureInputs) {
       input.reset();
     }
     for (PendingOutputs output : futureOutputs) {
       output.reset();
     }
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.analysis.synonym;

	import java.io.IOException;

	import org.apache.lucene.analysis.TokenFilter;
	import org.apache.lucene.analysis.TokenStream;
	import org.apache.lucene.analysis.core.FlattenGraphFilter;
	import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
	import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
	import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
	import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
	import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
	import org.apache.lucene.store.ByteArrayDataInput;
	import org.apache.lucene.util.ArrayUtil;
	import org.apache.lucene.util.AttributeSource;
	import org.apache.lucene.util.BytesRef;
	import org.apache.lucene.util.CharsRef;
	import org.apache.lucene.util.CharsRefBuilder;
	import org.apache.lucene.util.fst.FST;

	/**
	* Matches single or multi word synonyms in a token stream.
	* This token stream cannot properly handle position
	* increments != 1, ie, you should place this filter before
	* filtering out stop words.
	*
	* <p>Note that with the current implementation, parsing is
	* greedy, so whenever multiple parses would apply, the rule
	* starting the earliest and parsing the most tokens wins.
	* For example if you have these rules:
	*
	* <pre>
	* a -> x
	* a b -> y
	* b c d -> z
	* </pre>
	*
	* Then input <code>a b c d e</code> parses to <code>y b c
	* d</code>, ie the 2nd rule "wins" because it started
	* earliest and matched the most input tokens of other rules
	* starting at that point.
	*
	* <p>A future improvement to this filter could allow
	* non-greedy parsing, such that the 3rd rule would win, and
	* also separately allow multiple parses, such that all 3
	* rules would match, perhaps even on a rule by rule
	* basis.</p>
	*
	* <p><b>NOTE</b>: when a match occurs, the output tokens
	* associated with the matching rule are "stacked" on top of
	* the input stream (if the rule had
	* <code>keepOrig=true</code>) and also on top of another
	* matched rule's output tokens. This is not a correct
	* solution, as really the output should be an arbitrary
	* graph/lattice. For example, with the above match, you
	* would expect an exact <code>PhraseQuery</code> <code>"y b
	* c"</code> to match the parsed tokens, but it will fail to
	* do so. This limitation is necessary because Lucene's
	* TokenStream (and index) cannot yet represent an arbitrary
	* graph.</p>
	*
	* <p><b>NOTE</b>: If multiple incoming tokens arrive on the
	* same position, only the first token at that position is
	* used for parsing. Subsequent tokens simply pass through
	* and are not parsed. A future improvement would be to
	* allow these tokens to also be matched.</p>
	*
	* @deprecated Use {@link SynonymGraphFilter} instead, but be sure to also
	* use {@link FlattenGraphFilter} at index time (not at search time) as well.
	*/

	// TODO: maybe we should resolve token -> wordID then run
	// FST on wordIDs, for better perf?

	// TODO: a more efficient approach would be Aho/Corasick's
	// algorithm
	// http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm
	// It improves over the current approach here
	// because it does not fully re-start matching at every
	// token. For example if one pattern is "a b c x"
	// and another is "b c d" and the input is "a b c d", on
	// trying to parse "a b c x" but failing when you got to x,
	// rather than starting over again your really should
	// immediately recognize that "b c d" matches at the next
	// input. I suspect this won't matter that much in
	// practice, but it's possible on some set of synonyms it
	// will. We'd have to modify Aho/Corasick to enforce our
	// conflict resolving (eg greedy matching) because that algo
	// finds all matches. This really amounts to adding a .*
	// closure to the FST and then determinizing it.
	//
	// Another possible solution is described at http://www.cis.uni-muenchen.de/people/Schulz/Pub/dictle5.ps

	@Deprecated
	public final class SynonymFilter extends TokenFilter {

	public static final String TYPE_SYNONYM = "SYNONYM";

	private final SynonymMap synonyms;

	private final boolean ignoreCase;
	private final int rollBufferSize;

	private int captureCount;

	// TODO: we should set PositionLengthAttr too...

	private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
	private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
	private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
	private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
	private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

	// How many future input tokens have already been matched
	// to a synonym; because the matching is "greedy" we don't
	// try to do any more matching for such tokens:
	private int inputSkipCount;

	// Hold all buffered (read ahead) stacked input tokens for
	// a future position. When multiple tokens are at the
	// same position, we only store (and match against) the
	// term for the first token at the position, but capture
	// state for (and enumerate) all other tokens at this
	// position:
	private static class PendingInput {
	final CharsRefBuilder term = new CharsRefBuilder();
	AttributeSource.State state;
	boolean keepOrig;
	boolean matched;
	boolean consumed = true;
	int startOffset;
	int endOffset;

	public void reset() {
	state = null;
	consumed = true;
	keepOrig = false;
	matched = false;
	}
	};

	// Rolling buffer, holding pending input tokens we had to
	// clone because we needed to look ahead, indexed by
	// position:
	private final PendingInput[] futureInputs;

	// Holds pending output synonyms for one future position:
	private static class PendingOutputs {
	CharsRefBuilder[] outputs;
	int[] endOffsets;
	int[] posLengths;
	int upto;
	int count;
	int posIncr = 1;
	int lastEndOffset;
	int lastPosLength;

	public PendingOutputs() {
	outputs = new CharsRefBuilder[1];
	endOffsets = new int[1];
	posLengths = new int[1];
	}

	public void reset() {
	upto = count = 0;
	posIncr = 1;
	}

	public CharsRef pullNext() {
	assert upto < count;
	lastEndOffset = endOffsets[upto];
	lastPosLength = posLengths[upto];
	final CharsRefBuilder result = outputs[upto++];
	posIncr = 0;
	if (upto == count) {
	reset();
	}
	return result.get();
	}

	public int getLastEndOffset() {
	return lastEndOffset;
	}

	public int getLastPosLength() {
	return lastPosLength;
	}

	public void add(char[] output, int offset, int len, int endOffset, int posLength) {
	if (count == outputs.length) {
	outputs = ArrayUtil.grow(outputs, count+1);
	}
	if (count == endOffsets.length) {
	final int[] next = new int[ArrayUtil.oversize(1+count, Integer.BYTES)];
	System.arraycopy(endOffsets, 0, next, 0, count);
	endOffsets = next;
	}
	if (count == posLengths.length) {
	final int[] next = new int[ArrayUtil.oversize(1+count, Integer.BYTES)];
	System.arraycopy(posLengths, 0, next, 0, count);
	posLengths = next;
	}
	if (outputs[count] == null) {
	outputs[count] = new CharsRefBuilder();
	}
	outputs[count].copyChars(output, offset, len);
	// endOffset can be -1, in which case we should simply
	// use the endOffset of the input token, or X >= 0, in
	// which case we use X as the endOffset for this output
	endOffsets[count] = endOffset;
	posLengths[count] = posLength;
	count++;
	}
	};

	private final ByteArrayDataInput bytesReader = new ByteArrayDataInput();

	// Rolling buffer, holding stack of pending synonym
	// outputs, indexed by position:
	private final PendingOutputs[] futureOutputs;

	// Where (in rolling buffers) to write next input saved state:
	private int nextWrite;

	// Where (in rolling buffers) to read next input saved state:
	private int nextRead;

	// True once we've read last token
	private boolean finished;

	private final FST.Arc<BytesRef> scratchArc;

	private final FST<BytesRef> fst;

	private final FST.BytesReader fstReader;


	private final BytesRef scratchBytes = new BytesRef();
	private final CharsRefBuilder scratchChars = new CharsRefBuilder();

	/**
	* @param input input tokenstream
	* @param synonyms synonym map
	* @param ignoreCase case-folds input for matching with {@link Character#toLowerCase(int)}.
	* Note, if you set this to true, it's your responsibility to lowercase
	* the input entries when you create the {@link SynonymMap}
	*/
	public SynonymFilter(TokenStream input, SynonymMap synonyms, boolean ignoreCase) {
	super(input);
	this.synonyms = synonyms;
	this.ignoreCase = ignoreCase;
	this.fst = synonyms.fst;
	if (fst == null) {
	throw new IllegalArgumentException("fst must be non-null");
	}
	this.fstReader = fst.getBytesReader();

	// Must be 1+ so that when roll buffer is at full
	// lookahead we can distinguish this full buffer from
	// the empty buffer:
	rollBufferSize = 1+synonyms.maxHorizontalContext;

	futureInputs = new PendingInput[rollBufferSize];
	futureOutputs = new PendingOutputs[rollBufferSize];
	for(int pos=0;pos<rollBufferSize;pos++) {
	futureInputs[pos] = new PendingInput();
	futureOutputs[pos] = new PendingOutputs();
	}

	//System.out.println("FSTFilt maxH=" + synonyms.maxHorizontalContext);

	scratchArc = new FST.Arc<>();
	}

	private void capture() {
	captureCount++;
	//System.out.println(" capture slot=" + nextWrite);
	final PendingInput input = futureInputs[nextWrite];

	input.state = captureState();
	input.consumed = false;
	input.term.copyChars(termAtt.buffer(), 0, termAtt.length());

	nextWrite = rollIncr(nextWrite);

	// Buffer head should never catch up to tail:
	assert nextWrite != nextRead;
	}

	/*
	This is the core of this TokenFilter: it locates the
	synonym matches and buffers up the results into
	futureInputs/Outputs.

	NOTE: this calls input.incrementToken and does not
	capture the state if no further tokens were checked. So
	caller must then forward state to our caller, or capture:
	*/
	private int lastStartOffset;
	private int lastEndOffset;

	private void parse() throws IOException {
	//System.out.println("\nS: parse");

	assert inputSkipCount == 0;

	int curNextRead = nextRead;

	// Holds the longest match we've seen so far:
	BytesRef matchOutput = null;
	int matchInputLength = 0;
	int matchEndOffset = -1;

	BytesRef pendingOutput = fst.outputs.getNoOutput();
	fst.getFirstArc(scratchArc);

	assert scratchArc.output() == fst.outputs.getNoOutput();

	int tokenCount = 0;

	byToken:
	while(true) {

	// Pull next token's chars:
	final char[] buffer;
	final int bufferLen;
	//System.out.println(" cycle nextRead=" + curNextRead + " nextWrite=" + nextWrite);

	int inputEndOffset = 0;

	if (curNextRead == nextWrite) {

	// We used up our lookahead buffer of input tokens
	// -- pull next real input token:

	if (finished) {
	break;
	} else {
	//System.out.println(" input.incrToken");
	assert futureInputs[nextWrite].consumed;
	// Not correct: a syn match whose output is longer
	// than its input can set future inputs keepOrig
	// to true:
	//assert !futureInputs[nextWrite].keepOrig;
	if (input.incrementToken()) {
	buffer = termAtt.buffer();
	bufferLen = termAtt.length();
	final PendingInput input = futureInputs[nextWrite];
	lastStartOffset = input.startOffset = offsetAtt.startOffset();
	lastEndOffset = input.endOffset = offsetAtt.endOffset();
	inputEndOffset = input.endOffset;
	//System.out.println(" new token=" + new String(buffer, 0, bufferLen));
	if (nextRead != nextWrite) {
	capture();
	} else {
	input.consumed = false;
	}

	} else {
	// No more input tokens
	//System.out.println(" set end");
	finished = true;
	break;
	}
	}
	} else {
	// Still in our lookahead
	buffer = futureInputs[curNextRead].term.chars();
	bufferLen = futureInputs[curNextRead].term.length();
	inputEndOffset = futureInputs[curNextRead].endOffset;
	//System.out.println(" old token=" + new String(buffer, 0, bufferLen));
	}

	tokenCount++;

	// Run each char in this token through the FST:
	int bufUpto = 0;
	while(bufUpto < bufferLen) {
	final int codePoint = Character.codePointAt(buffer, bufUpto, bufferLen);
	if (fst.findTargetArc(ignoreCase ? Character.toLowerCase(codePoint) : codePoint, scratchArc, scratchArc, fstReader) == null) {
	//System.out.println(" stop");
	break byToken;
	}

	// Accum the output
	pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output());
	//System.out.println(" char=" + buffer[bufUpto] + " output=" + pendingOutput + " arc.output=" + scratchArc.output);
	bufUpto += Character.charCount(codePoint);
	}

	// OK, entire token matched; now see if this is a final
	// state:
	if (scratchArc.isFinal()) {
	matchOutput = fst.outputs.add(pendingOutput, scratchArc.nextFinalOutput());
	matchInputLength = tokenCount;
	matchEndOffset = inputEndOffset;
	//System.out.println(" found matchLength=" + matchInputLength + " output=" + matchOutput);
	}

	// See if the FST wants to continue matching (ie, needs to
	// see the next input token):
	if (fst.findTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, scratchArc, fstReader) == null) {
	// No further rules can match here; we're done
	// searching for matching rules starting at the
	// current input position.
	break;
	} else {
	// More matching is possible -- accum the output (if
	// any) of the WORD_SEP arc:
	pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output());
	if (nextRead == nextWrite) {
	capture();
	}
	}

	curNextRead = rollIncr(curNextRead);
	}

	if (nextRead == nextWrite && !finished) {
	//System.out.println(" skip write slot=" + nextWrite);
	nextWrite = rollIncr(nextWrite);
	}

	if (matchOutput != null) {
	//System.out.println(" add matchLength=" + matchInputLength + " output=" + matchOutput);
	inputSkipCount = matchInputLength;
	addOutput(matchOutput, matchInputLength, matchEndOffset);
	} else if (nextRead != nextWrite) {
	// Even though we had no match here, we set to 1
	// because we need to skip current input token before
	// trying to match again:
	inputSkipCount = 1;
	} else {
	assert finished;
	}

	//System.out.println(" parse done inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead + " nextWrite=" + nextWrite);
	}

	// Interleaves all output tokens onto the futureOutputs:
	private void addOutput(BytesRef bytes, int matchInputLength, int matchEndOffset) {
	bytesReader.reset(bytes.bytes, bytes.offset, bytes.length);

	final int code = bytesReader.readVInt();
	final boolean keepOrig = (code & 0x1) == 0;
	final int count = code >>> 1;
	//System.out.println(" addOutput count=" + count + " keepOrig=" + keepOrig);
	for(int outputIDX=0;outputIDX<count;outputIDX++) {
	synonyms.words.get(bytesReader.readVInt(),
	scratchBytes);
	//System.out.println(" outIDX=" + outputIDX + " bytes=" + scratchBytes.length);
	scratchChars.copyUTF8Bytes(scratchBytes);
	int lastStart = 0;
	final int chEnd = lastStart + scratchChars.length();
	int outputUpto = nextRead;
	for(int chIDX=lastStart;chIDX<=chEnd;chIDX++) {
	if (chIDX == chEnd \|\| scratchChars.charAt(chIDX) == SynonymMap.WORD_SEPARATOR) {
	final int outputLen = chIDX - lastStart;
	// Caller is not allowed to have empty string in
	// the output:
	assert outputLen > 0: "output contains empty string: " + scratchChars;
	final int endOffset;
	final int posLen;
	if (chIDX == chEnd && lastStart == 0) {
	// This rule had a single output token, so, we set
	// this output's endOffset to the current
	// endOffset (ie, endOffset of the last input
	// token it matched):
	endOffset = matchEndOffset;
	posLen = keepOrig ? matchInputLength : 1;
	} else {
	// This rule has more than one output token; we
	// can't pick any particular endOffset for this
	// case, so, we inherit the endOffset for the
	// input token which this output overlaps:
	endOffset = -1;
	posLen = 1;
	}
	futureOutputs[outputUpto].add(scratchChars.chars(), lastStart, outputLen, endOffset, posLen);
	//System.out.println(" " + new String(scratchChars.chars, lastStart, outputLen) + " outputUpto=" + outputUpto);
	lastStart = 1+chIDX;
	//System.out.println(" slot=" + outputUpto + " keepOrig=" + keepOrig);
	outputUpto = rollIncr(outputUpto);
	assert futureOutputs[outputUpto].posIncr == 1: "outputUpto=" + outputUpto + " vs nextWrite=" + nextWrite;
	}
	}
	}

	int upto = nextRead;
	for(int idx=0;idx<matchInputLength;idx++) {
	futureInputs[upto].keepOrig \|= keepOrig;
	futureInputs[upto].matched = true;
	upto = rollIncr(upto);
	}
	}

	// ++ mod rollBufferSize
	private int rollIncr(int count) {
	count++;
	if (count == rollBufferSize) {
	return 0;
	} else {
	return count;
	}
	}

	// for testing
	int getCaptureCount() {
	return captureCount;
	}

	@Override
	public boolean incrementToken() throws IOException {

	//System.out.println("\nS: incrToken inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead + " nextWrite=" + nextWrite);

	while(true) {

	// First play back any buffered future inputs/outputs
	// w/o running parsing again:
	while (inputSkipCount != 0) {

	// At each position, we first output the original
	// token

	// TODO: maybe just a PendingState class, holding
	// both input & outputs?
	final PendingInput input = futureInputs[nextRead];
	final PendingOutputs outputs = futureOutputs[nextRead];

	//System.out.println(" cycle nextRead=" + nextRead + " nextWrite=" + nextWrite + " inputSkipCount="+ inputSkipCount + " input.keepOrig=" + input.keepOrig + " input.consumed=" + input.consumed + " input.state=" + input.state);

	if (!input.consumed && (input.keepOrig \|\| !input.matched)) {
	if (input.state != null) {
	// Return a previously saved token (because we
	// had to lookahead):
	restoreState(input.state);
	} else {
	// Pass-through case: return token we just pulled
	// but didn't capture:
	assert inputSkipCount == 1: "inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead;
	}
	input.reset();
	if (outputs.count > 0) {
	outputs.posIncr = 0;
	} else {
	nextRead = rollIncr(nextRead);
	inputSkipCount--;
	}
	//System.out.println(" return token=" + termAtt.toString());
	return true;
	} else if (outputs.upto < outputs.count) {
	// Still have pending outputs to replay at this
	// position
	input.reset();
	final int posIncr = outputs.posIncr;
	final CharsRef output = outputs.pullNext();
	clearAttributes();
	termAtt.copyBuffer(output.chars, output.offset, output.length);
	typeAtt.setType(TYPE_SYNONYM);
	int endOffset = outputs.getLastEndOffset();
	if (endOffset == -1) {
	endOffset = input.endOffset;
	}
	offsetAtt.setOffset(input.startOffset, endOffset);
	posIncrAtt.setPositionIncrement(posIncr);
	posLenAtt.setPositionLength(outputs.getLastPosLength());
	if (outputs.count == 0) {
	// Done with the buffered input and all outputs at
	// this position
	nextRead = rollIncr(nextRead);
	inputSkipCount--;
	}
	//System.out.println(" return token=" + termAtt.toString());
	return true;
	} else {
	// Done with the buffered input and all outputs at
	// this position
	input.reset();
	nextRead = rollIncr(nextRead);
	inputSkipCount--;
	}
	}

	if (finished && nextRead == nextWrite) {
	// End case: if any output syns went beyond end of
	// input stream, enumerate them now:
	final PendingOutputs outputs = futureOutputs[nextRead];
	if (outputs.upto < outputs.count) {
	final int posIncr = outputs.posIncr;
	final CharsRef output = outputs.pullNext();
	futureInputs[nextRead].reset();
	if (outputs.count == 0) {
	nextWrite = nextRead = rollIncr(nextRead);
	}
	clearAttributes();
	// Keep offset from last input token:
	offsetAtt.setOffset(lastStartOffset, lastEndOffset);
	termAtt.copyBuffer(output.chars, output.offset, output.length);
	typeAtt.setType(TYPE_SYNONYM);
	//System.out.println(" set posIncr=" + outputs.posIncr + " outputs=" + outputs);
	posIncrAtt.setPositionIncrement(posIncr);
	//System.out.println(" return token=" + termAtt.toString());
	return true;
	} else {
	return false;
	}
	}

	// Find new synonym matches:
	parse();
	}
	}

	@Override
	public void reset() throws IOException {
	super.reset();
	captureCount = 0;
	finished = false;
	inputSkipCount = 0;
	nextRead = nextWrite = 0;

	// In normal usage these resets would not be needed,
	// since they reset-as-they-are-consumed, but the app
	// may not consume all input tokens (or we might hit an
	// exception), in which case we have leftover state
	// here:
	for (PendingInput input : futureInputs) {
	input.reset();
	}
	for (PendingOutputs output : futureOutputs) {
	output.reset();
	}
	}
	}