using J2N;
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Store;
using Lucene.Net.Util;
using Lucene.Net.Util.Fst;
using System;
using System.Diagnostics;
namespace Lucene.Net.Analysis.Synonym
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// Matches single- or multi-word synonyms in a token stream.
/// This token stream cannot properly handle position
/// increments != 1; i.e., you should place this filter before
/// filtering out stop words.
///
/// <para>Note that with the current implementation, parsing is
/// greedy, so whenever multiple parses would apply, the rule
/// starting the earliest and parsing the most tokens wins.
/// For example if you have these rules:
///
/// <code>
/// a -> x
/// a b -> y
/// b c d -> z
/// </code>
///
/// Then input <c>a b c d e</c> parses to <c>y b c
/// d</c>, i.e., the 2nd rule "wins" because it started
/// earliest and matched the most input tokens among the rules
/// starting at that point.</para>
///
/// <para>A future improvement to this filter could allow
/// non-greedy parsing, such that the 3rd rule would win, and
/// also separately allow multiple parses, such that all 3
/// rules would match, perhaps even on a rule by rule
/// basis.</para>
///
/// <para><b>NOTE</b>: when a match occurs, the output tokens
/// associated with the matching rule are "stacked" on top of
/// the input stream (if the rule had
/// <c>keepOrig=true</c>) and also on top of another
/// matched rule's output tokens. This is not a correct
/// solution, as the output should really be an arbitrary
/// graph/lattice. For example, with the above match, you
/// would expect an exact <see cref="Search.PhraseQuery"/> <c>"y b
/// c"</c> to match the parsed tokens, but it will fail to
/// do so. This limitation is necessary because Lucene's
/// <see cref="TokenStream"/> (and index) cannot yet represent an arbitrary
/// graph.</para>
///
/// <para><b>NOTE</b>: If multiple incoming tokens arrive on the
/// same position, only the first token at that position is
/// used for parsing. Subsequent tokens simply pass through
/// and are not parsed. A future improvement would be to
/// allow these tokens to also be matched.</para>
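///
/// <para>A minimal usage sketch (illustrative only; it assumes a
/// <c>tokenizer</c> created elsewhere, and uses the
/// <see cref="SynonymMap.Builder"/> API to register the rule
/// <c>a b -> y</c> with <c>keepOrig=true</c>):
/// <code>
/// SynonymMap.Builder builder = new SynonymMap.Builder(true); // true: dedup identical rules
/// builder.Add(SynonymMap.Builder.Join(new string[] { "a", "b" }, new CharsRef()),
///             SynonymMap.Builder.Join(new string[] { "y" }, new CharsRef()),
///             true); // true: keep the original tokens alongside the synonym
/// SynonymMap map = builder.Build();
/// TokenStream stream = new SynonymFilter(tokenizer, map, true);
/// </code>
/// </para>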
/// </summary>
// TODO: maybe we should resolve token -> wordID then run
// FST on wordIDs, for better perf?
// TODO: a more efficient approach would be Aho/Corasick's
// algorithm
// http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm
// It improves over the current approach here
// because it does not fully re-start matching at every
// token. For example if one pattern is "a b c x"
// and another is "b c d" and the input is "a b c d", on
// trying to parse "a b c x" but failing when you got to x,
// rather than starting over again your really should
// immediately recognize that "b c d" matches at the next
// input. I suspect this won't matter that much in
// practice, but it's possible on some set of synonyms it
// will. We'd have to modify Aho/Corasick to enforce our
// conflict resolution (e.g., greedy matching) because that algo
// finds all matches. This really amounts to adding a .*
// closure to the FST and then determinizing it.
public sealed class SynonymFilter : TokenFilter
{
public const string TYPE_SYNONYM = "SYNONYM";
private readonly SynonymMap synonyms;
private readonly bool ignoreCase;
private readonly int rollBufferSize;
private int captureCount;
// TODO: we should set PositionLengthAttr too...
private readonly ICharTermAttribute termAtt;
private readonly IPositionIncrementAttribute posIncrAtt;
private readonly IPositionLengthAttribute posLenAtt;
private readonly ITypeAttribute typeAtt;
private readonly IOffsetAttribute offsetAtt;
// How many future input tokens have already been matched
// to a synonym; because the matching is "greedy" we don't
// try to do any more matching for such tokens:
private int inputSkipCount;
// Hold all buffered (read ahead) stacked input tokens for
// a future position. When multiple tokens are at the
// same position, we only store (and match against) the
// term for the first token at the position, but capture
// state for (and enumerate) all other tokens at this
// position:
private class PendingInput
{
internal readonly CharsRef term = new CharsRef();
internal AttributeSource.State state;
internal bool keepOrig;
internal bool matched;
internal bool consumed = true;
internal int startOffset;
internal int endOffset;
public void Reset()
{
state = null;
consumed = true;
keepOrig = false;
matched = false;
}
}
// Rolling buffer, holding pending input tokens we had to
// clone because we needed to look ahead, indexed by
// position:
private readonly PendingInput[] futureInputs;
// Holds pending output synonyms for one future position:
private class PendingOutputs
{
internal CharsRef[] outputs;
internal int[] endOffsets;
internal int[] posLengths;
internal int upto;
internal int count;
internal int posIncr = 1;
internal int lastEndOffset;
internal int lastPosLength;
public PendingOutputs()
{
outputs = new CharsRef[1];
endOffsets = new int[1];
posLengths = new int[1];
}
public virtual void Reset()
{
upto = count = 0;
posIncr = 1;
}
public virtual CharsRef PullNext()
{
Debug.Assert(upto < count);
lastEndOffset = endOffsets[upto];
lastPosLength = posLengths[upto];
CharsRef result = outputs[upto++];
posIncr = 0;
if (upto == count)
{
Reset();
}
return result;
}
public virtual int LastEndOffset
{
get
{
return lastEndOffset;
}
}
public virtual int LastPosLength
{
get
{
return lastPosLength;
}
}
public virtual void Add(char[] output, int offset, int len, int endOffset, int posLength)
{
if (count == outputs.Length)
{
CharsRef[] next = new CharsRef[ArrayUtil.Oversize(1 + count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
Array.Copy(outputs, 0, next, 0, count);
outputs = next;
}
if (count == endOffsets.Length)
{
int[] next = new int[ArrayUtil.Oversize(1 + count, RamUsageEstimator.NUM_BYTES_INT32)];
Array.Copy(endOffsets, 0, next, 0, count);
endOffsets = next;
}
if (count == posLengths.Length)
{
int[] next = new int[ArrayUtil.Oversize(1 + count, RamUsageEstimator.NUM_BYTES_INT32)];
Array.Copy(posLengths, 0, next, 0, count);
posLengths = next;
}
if (outputs[count] == null)
{
outputs[count] = new CharsRef();
}
outputs[count].CopyChars(output, offset, len);
// endOffset can be -1, in which case we should simply
// use the endOffset of the input token, or X >= 0, in
// which case we use X as the endOffset for this output
endOffsets[count] = endOffset;
posLengths[count] = posLength;
count++;
}
}
private readonly ByteArrayDataInput bytesReader = new ByteArrayDataInput();
// Rolling buffer, holding stack of pending synonym
// outputs, indexed by position:
private readonly PendingOutputs[] futureOutputs;
// Where (in rolling buffers) to write next input saved state:
private int nextWrite;
// Where (in rolling buffers) to read next input saved state:
private int nextRead;
// True once we've read the last token
private bool finished;
private readonly FST.Arc<BytesRef> scratchArc;
private readonly FST<BytesRef> fst;
private readonly FST.BytesReader fstReader;
private readonly BytesRef scratchBytes = new BytesRef();
private readonly CharsRef scratchChars = new CharsRef();
/// <param name="input"> input tokenstream </param>
/// <param name="synonyms"> synonym map </param>
/// <param name="ignoreCase"> case-folds input for matching with <see cref="Character.ToLower(int)"/>.
/// Note, if you set this to true, it's your responsibility to lowercase
/// the input entries when you create the <see cref="SynonymMap"/> </param>
public SynonymFilter(TokenStream input, SynonymMap synonyms, bool ignoreCase)
: base(input)
{
termAtt = AddAttribute<ICharTermAttribute>();
posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
posLenAtt = AddAttribute<IPositionLengthAttribute>();
typeAtt = AddAttribute<ITypeAttribute>();
offsetAtt = AddAttribute<IOffsetAttribute>();
this.synonyms = synonyms;
this.ignoreCase = ignoreCase;
this.fst = synonyms.Fst;
if (fst == null)
{
throw new System.ArgumentException("fst must be non-null");
}
this.fstReader = fst.GetBytesReader();
// Must be 1+ so that when roll buffer is at full
// lookahead we can distinguish this full buffer from
// the empty buffer:
rollBufferSize = 1 + synonyms.MaxHorizontalContext;
futureInputs = new PendingInput[rollBufferSize];
futureOutputs = new PendingOutputs[rollBufferSize];
for (int pos = 0; pos < rollBufferSize; pos++)
{
futureInputs[pos] = new PendingInput();
futureOutputs[pos] = new PendingOutputs();
}
//System.out.println("FSTFilt maxH=" + synonyms.maxHorizontalContext);
scratchArc = new FST.Arc<BytesRef>();
}
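// Saves the current token (captured attribute state + term chars)
// into the lookahead ring buffer so it can be replayed later: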
private void Capture()
{
captureCount++;
//System.out.println(" capture slot=" + nextWrite);
PendingInput input = futureInputs[nextWrite];
input.state = CaptureState();
input.consumed = false;
input.term.CopyChars(termAtt.Buffer, 0, termAtt.Length);
nextWrite = RollIncr(nextWrite);
// Buffer head should never catch up to tail:
Debug.Assert(nextWrite != nextRead);
}
/*
This is the core of this TokenFilter: it locates the
synonym matches and buffers up the results into
futureInputs/Outputs.
NOTE: this calls m_input.IncrementToken and does not
capture the state if no further tokens were checked. So
the caller must then forward the state to our caller, or capture it:
*/
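// Offsets of the most recently read input token; used when
// synonym outputs extend past the end of the input stream: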
private int lastStartOffset;
private int lastEndOffset;
private void Parse()
{
//System.out.println("\nS: parse");
Debug.Assert(inputSkipCount == 0);
int curNextRead = nextRead;
// Holds the longest match we've seen so far:
BytesRef matchOutput = null;
int matchInputLength = 0;
int matchEndOffset = -1;
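// Output accumulated so far along the current FST path: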
BytesRef pendingOutput = fst.Outputs.NoOutput;
fst.GetFirstArc(scratchArc);
Debug.Assert(scratchArc.Output == fst.Outputs.NoOutput);
int tokenCount = 0;
while (true)
{
// Pull next token's chars:
char[] buffer;
int bufferLen;
//System.out.println(" cycle nextRead=" + curNextRead + " nextWrite=" + nextWrite);
int inputEndOffset = 0;
if (curNextRead == nextWrite)
{
// We used up our lookahead buffer of input tokens
// -- pull next real input token:
if (finished)
{
break;
}
else
{
//System.out.println(" input.incrToken");
Debug.Assert(futureInputs[nextWrite].consumed);
// Not correct: a syn match whose output is longer
// than its input can set future inputs keepOrig
// to true:
//assert !futureInputs[nextWrite].keepOrig;
if (m_input.IncrementToken())
{
buffer = termAtt.Buffer;
bufferLen = termAtt.Length;
PendingInput pendingInput = futureInputs[nextWrite];
lastStartOffset = pendingInput.startOffset = offsetAtt.StartOffset;
lastEndOffset = pendingInput.endOffset = offsetAtt.EndOffset;
inputEndOffset = pendingInput.endOffset;
//System.out.println(" new token=" + new String(buffer, 0, bufferLen));
if (nextRead != nextWrite)
{
Capture();
}
else
{
pendingInput.consumed = false;
}
}
else
{
// No more input tokens
//System.out.println(" set end");
finished = true;
break;
}
}
}
else
{
// Still in our lookahead
buffer = futureInputs[curNextRead].term.Chars;
bufferLen = futureInputs[curNextRead].term.Length;
inputEndOffset = futureInputs[curNextRead].endOffset;
//System.out.println(" old token=" + new String(buffer, 0, bufferLen));
}
tokenCount++;
// Run each char in this token through the FST:
int bufUpto = 0;
while (bufUpto < bufferLen)
{
int codePoint = Character.CodePointAt(buffer, bufUpto, bufferLen);
if (fst.FindTargetArc(ignoreCase ? Character.ToLower(codePoint) : codePoint, scratchArc, scratchArc, fstReader) == null)
{
//System.out.println(" stop");
goto byTokenBreak;
}
// Accum the output
pendingOutput = fst.Outputs.Add(pendingOutput, scratchArc.Output);
//System.out.println(" char=" + buffer[bufUpto] + " output=" + pendingOutput + " arc.output=" + scratchArc.output);
bufUpto += Character.CharCount(codePoint);
}
// OK, entire token matched; now see if this is a final
// state:
if (scratchArc.IsFinal)
{
matchOutput = fst.Outputs.Add(pendingOutput, scratchArc.NextFinalOutput);
matchInputLength = tokenCount;
matchEndOffset = inputEndOffset;
//System.out.println(" found matchLength=" + matchInputLength + " output=" + matchOutput);
}
// See if the FST wants to continue matching (ie, needs to
// see the next input token):
if (fst.FindTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, scratchArc, fstReader) == null)
{
// No further rules can match here; we're done
// searching for matching rules starting at the
// current input position.
break;
}
else
{
// More matching is possible -- accum the output (if
// any) of the WORD_SEP arc:
pendingOutput = fst.Outputs.Add(pendingOutput, scratchArc.Output);
if (nextRead == nextWrite)
{
Capture();
}
}
curNextRead = RollIncr(curNextRead);
}
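// Label target for the "break byToken" in the original Java
// version; reached when the FST has no arc for the current
// input character, so no rule can extend the current match: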
byTokenBreak:
if (nextRead == nextWrite && !finished)
{
//System.out.println(" skip write slot=" + nextWrite);
nextWrite = RollIncr(nextWrite);
}
if (matchOutput != null)
{
//System.out.println(" add matchLength=" + matchInputLength + " output=" + matchOutput);
inputSkipCount = matchInputLength;
AddOutput(matchOutput, matchInputLength, matchEndOffset);
}
else if (nextRead != nextWrite)
{
// Even though we had no match here, we set it to 1
// because we need to skip the current input token before
// trying to match again:
inputSkipCount = 1;
}
else
{
Debug.Assert(finished);
}
//System.out.println(" parse done inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead + " nextWrite=" + nextWrite);
}
// Interleaves all output tokens onto the futureOutputs:
private void AddOutput(BytesRef bytes, int matchInputLength, int matchEndOffset)
{
bytesReader.Reset(bytes.Bytes, bytes.Offset, bytes.Length);
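// The first VInt packs keepOrig into the low bit (0 means keep
// the original tokens) and the number of output words into the
// remaining bits: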
int code = bytesReader.ReadVInt32();
bool keepOrig = (code & 0x1) == 0;
int count = (int)((uint)code >> 1);
//System.out.println(" addOutput count=" + count + " keepOrig=" + keepOrig);
for (int outputIDX = 0; outputIDX < count; outputIDX++)
{
synonyms.Words.Get(bytesReader.ReadVInt32(), scratchBytes);
//System.out.println(" outIDX=" + outputIDX + " bytes=" + scratchBytes.length);
UnicodeUtil.UTF8toUTF16(scratchBytes, scratchChars);
int lastStart = scratchChars.Offset;
int chEnd = lastStart + scratchChars.Length;
int outputUpto = nextRead;
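// Walk the output's chars, splitting on WORD_SEPARATOR; each
// piece becomes one stacked output token at successive positions: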
for (int chIDX = lastStart; chIDX <= chEnd; chIDX++)
{
if (chIDX == chEnd || scratchChars.Chars[chIDX] == SynonymMap.WORD_SEPARATOR)
{
int outputLen = chIDX - lastStart;
// Caller is not allowed to have empty string in
// the output:
Debug.Assert(outputLen > 0, "output contains empty string: " + scratchChars);
int endOffset;
int posLen;
if (chIDX == chEnd && lastStart == scratchChars.Offset)
{
// This rule had a single output token, so, we set
// this output's endOffset to the current
// endOffset (ie, endOffset of the last input
// token it matched):
endOffset = matchEndOffset;
posLen = keepOrig ? matchInputLength : 1;
}
else
{
// This rule has more than one output token; we
// can't pick any particular endOffset for this
// case, so, we inherit the endOffset for the
// input token which this output overlaps:
endOffset = -1;
posLen = 1;
}
futureOutputs[outputUpto].Add(scratchChars.Chars, lastStart, outputLen, endOffset, posLen);
//System.out.println(" " + new String(scratchChars.chars, lastStart, outputLen) + " outputUpto=" + outputUpto);
lastStart = 1 + chIDX;
//System.out.println(" slot=" + outputUpto + " keepOrig=" + keepOrig);
outputUpto = RollIncr(outputUpto);
Debug.Assert(futureOutputs[outputUpto].posIncr == 1, "outputUpto=" + outputUpto + " vs nextWrite=" + nextWrite);
}
}
}
int upto = nextRead;
for (int idx = 0; idx < matchInputLength; idx++)
{
futureInputs[upto].keepOrig |= keepOrig;
futureInputs[upto].matched = true;
upto = RollIncr(upto);
}
}
// ++ mod rollBufferSize
private int RollIncr(int count)
{
count++;
if (count == rollBufferSize)
{
return 0;
}
else
{
return count;
}
}
// for testing
internal int CaptureCount
{
get
{
return captureCount;
}
}
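// First replays any buffered original tokens and stacked synonym
// outputs; once the buffers are exhausted, parses ahead in the
// input for new synonym matches: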
public override bool IncrementToken()
{
//System.out.println("\nS: incrToken inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead + " nextWrite=" + nextWrite);
while (true)
{
// First play back any buffered future inputs/outputs
// w/o running parsing again:
while (inputSkipCount != 0)
{
// At each position, we first output the original
// token
// TODO: maybe just a PendingState class, holding
// both input & outputs?
PendingInput input = futureInputs[nextRead];
PendingOutputs outputs = futureOutputs[nextRead];
//System.out.println(" cycle nextRead=" + nextRead + " nextWrite=" + nextWrite + " inputSkipCount="+ inputSkipCount + " input.keepOrig=" + input.keepOrig + " input.consumed=" + input.consumed + " input.state=" + input.state);
if (!input.consumed && (input.keepOrig || !input.matched))
{
if (input.state != null)
{
// Return a previously saved token (because we
// had to lookahead):
RestoreState(input.state);
}
else
{
// Pass-through case: return token we just pulled
// but didn't capture:
Debug.Assert(inputSkipCount == 1, "inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead);
}
input.Reset();
if (outputs.count > 0)
{
outputs.posIncr = 0;
}
else
{
nextRead = RollIncr(nextRead);
inputSkipCount--;
}
//System.out.println(" return token=" + termAtt.toString());
return true;
}
else if (outputs.upto < outputs.count)
{
// Still have pending outputs to replay at this
// position
input.Reset();
int posIncr = outputs.posIncr;
CharsRef output = outputs.PullNext();
ClearAttributes();
termAtt.CopyBuffer(output.Chars, output.Offset, output.Length);
typeAtt.Type = TYPE_SYNONYM;
int endOffset = outputs.LastEndOffset;
if (endOffset == -1)
{
endOffset = input.endOffset;
}
offsetAtt.SetOffset(input.startOffset, endOffset);
posIncrAtt.PositionIncrement = posIncr;
posLenAtt.PositionLength = outputs.LastPosLength;
if (outputs.count == 0)
{
// Done with the buffered input and all outputs at
// this position
nextRead = RollIncr(nextRead);
inputSkipCount--;
}
//System.out.println(" return token=" + termAtt.toString());
return true;
}
else
{
// Done with the buffered input and all outputs at
// this position
input.Reset();
nextRead = RollIncr(nextRead);
inputSkipCount--;
}
}
if (finished && nextRead == nextWrite)
{
// End case: if any output syns went beyond end of
// input stream, enumerate them now:
PendingOutputs outputs = futureOutputs[nextRead];
if (outputs.upto < outputs.count)
{
int posIncr = outputs.posIncr;
CharsRef output = outputs.PullNext();
futureInputs[nextRead].Reset();
if (outputs.count == 0)
{
nextWrite = nextRead = RollIncr(nextRead);
}
ClearAttributes();
// Keep offset from last input token:
offsetAtt.SetOffset(lastStartOffset, lastEndOffset);
termAtt.CopyBuffer(output.Chars, output.Offset, output.Length);
typeAtt.Type = TYPE_SYNONYM;
//System.out.println(" set posIncr=" + outputs.posIncr + " outputs=" + outputs);
posIncrAtt.PositionIncrement = posIncr;
//System.out.println(" return token=" + termAtt.toString());
return true;
}
else
{
return false;
}
}
// Find new synonym matches:
Parse();
}
}
public override void Reset()
{
base.Reset();
captureCount = 0;
finished = false;
inputSkipCount = 0;
nextRead = nextWrite = 0;
// In normal usage these resets would not be needed,
// since they reset-as-they-are-consumed, but the app
// may not consume all input tokens (or we might hit an
// exception), in which case we have leftover state
// here:
foreach (PendingInput input in futureInputs)
{
input.Reset();
}
foreach (PendingOutputs output in futureOutputs)
{
output.Reset();
}
}
}
}