src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKBigramFilter.cs - lucenenet - Git at Google

 using J2N;
 using Lucene.Net.Analysis.Standard;
 using Lucene.Net.Analysis.TokenAttributes;
 using Lucene.Net.Util;

 namespace Lucene.Net.Analysis.Cjk
 {
     /*
      * Licensed to the Apache Software Foundation (ASF) under one or more
      * contributor license agreements.  See the NOTICE file distributed with
      * this work for additional information regarding copyright ownership.
      * The ASF licenses this file to You under the Apache License, Version 2.0
      * (the "License"); you may not use this file except in compliance with
      * the License.  You may obtain a copy of the License at
      *
      *     http://www.apache.org/licenses/LICENSE-2.0
      *
      * Unless required by applicable law or agreed to in writing, software
      * distributed under the License is distributed on an "AS IS" BASIS,
      * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
      * See the License for the specific language governing permissions and
      * limitations under the License.
      */

     // LUCENENET specific - converted constants from CJKBigramFilter
     // into a flags enum.
     [System.Flags]
     public enum CJKScript
     {
         /// <summary>
         /// bigram flag for Han Ideographs </summary>
         HAN = 1,
         /// <summary>
         /// bigram flag for Hiragana </summary>
         HIRAGANA = 2,
         /// <summary>
         /// bigram flag for Katakana </summary>
         KATAKANA = 4,
         /// <summary>
         /// bigram flag for Hangul </summary>
         HANGUL = 8,
         /// <summary>
         /// bigram flag for all scripts </summary>
         ALL = 0xff
     }

     /// <summary>
     /// Forms bigrams of CJK terms that are generated from <see cref="StandardTokenizer"/>
     /// or ICUTokenizer.
     /// <para>
     /// CJK types are set by these tokenizers, but you can also use
     /// <see cref="CJKBigramFilter(TokenStream, CJKScript)"/> to explicitly control which
     /// of the CJK scripts are turned into bigrams.
     /// </para>
     /// <para>
     /// By default, when a CJK character has no adjacent characters to form
     /// a bigram, it is output in unigram form. If you want to always output
     /// both unigrams and bigrams, set the <code>outputUnigrams</code>
     /// flag in <see cref="CJKBigramFilter.CJKBigramFilter(TokenStream, CJKScript, bool)"/>.
     /// This can be used for a combined unigram+bigram approach.
     /// </para>
     /// <para>
     /// In all cases, all non-CJK input is passed thru unmodified.
     /// </para>
     /// </summary>
     public sealed class CJKBigramFilter : TokenFilter
     {
         // configuration

         // LUCENENET specific - made flags into their own [Flags] enum named CJKScript and de-nested from this type

         /// <summary>
         /// when we emit a bigram, its then marked as this type </summary>
         public const string DOUBLE_TYPE = "<DOUBLE>";
         /// <summary>
         /// when we emit a unigram, its then marked as this type </summary>
         public const string SINGLE_TYPE = "<SINGLE>";

         // the types from standardtokenizer
         private static readonly string HAN_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.IDEOGRAPHIC];
         private static readonly string HIRAGANA_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HIRAGANA];
         private static readonly string KATAKANA_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.KATAKANA];
         private static readonly string HANGUL_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HANGUL];

         // sentinel value for ignoring a script
         private static readonly string NO = "<NO>";

         // these are set to either their type or NO if we want to pass them thru
         private readonly string doHan;
         private readonly string doHiragana;
         private readonly string doKatakana;
         private readonly string doHangul;

         // true if we should output unigram tokens always
         private readonly bool outputUnigrams;
         private bool ngramState; // false = output unigram, true = output bigram

         private readonly ICharTermAttribute termAtt;
         private readonly ITypeAttribute typeAtt;
         private readonly IOffsetAttribute offsetAtt;
         private readonly IPositionIncrementAttribute posIncAtt;
         private readonly IPositionLengthAttribute posLengthAtt;

         // buffers containing codepoint and offsets in parallel
         private int[] buffer = new int[8];
         private int[] startOffset = new int[8];
         private int[] endOffset = new int[8];
         // length of valid buffer
         private int bufferLen;
         // current buffer index
         private int index;

         // the last end offset, to determine if we should bigram across tokens
         private int lastEndOffset;

         private bool exhausted;

         /// <summary>
         /// Calls <see cref="CJKBigramFilter.CJKBigramFilter(TokenStream, CJKScript)">
         ///       CJKBigramFilter(@in, CJKScript.HAN | CJKScript.HIRAGANA | CJKScript.KATAKANA | CJKScript.HANGUL)</see>
         /// </summary>
         /// <param name="in">
         ///          Input <see cref="TokenStream"/> </param>
         public CJKBigramFilter(TokenStream @in)
               : this(@in, CJKScript.HAN | CJKScript.HIRAGANA | CJKScript.KATAKANA | CJKScript.HANGUL)
         {
         }

         /// <summary>
         /// Calls <see cref="CJKBigramFilter.CJKBigramFilter(TokenStream, CJKScript, bool)">
         ///       CJKBigramFilter(in, flags, false)</see>
         /// </summary>
         /// <param name="in">
         ///          Input <see cref="TokenStream"/> </param>
         /// <param name="flags"> OR'ed set from <see cref="CJKScript.HAN"/>, <see cref="CJKScript.HIRAGANA"/>,
         ///        <see cref="CJKScript.KATAKANA"/>, <see cref="CJKScript.HANGUL"/> </param>
         public CJKBigramFilter(TokenStream @in, CJKScript flags)
               : this(@in, flags, false)
         {
         }

         /// <summary>
         /// Create a new <see cref="CJKBigramFilter"/>, specifying which writing systems should be bigrammed,
         /// and whether or not unigrams should also be output. </summary>
         /// <param name="in">
         ///          Input <see cref="TokenStream"/> </param>
         /// <param name="flags"> OR'ed set from <see cref="CJKScript.HAN"/>, <see cref="CJKScript.HIRAGANA"/>,
         ///        <see cref="CJKScript.KATAKANA"/>, <see cref="CJKScript.HANGUL"/> </param>
         /// <param name="outputUnigrams"> true if unigrams for the selected writing systems should also be output.
         ///        when this is false, this is only done when there are no adjacent characters to form
         ///        a bigram. </param>
         public CJKBigramFilter(TokenStream @in, CJKScript flags, bool outputUnigrams)
               : base(@in)
         {
             doHan = (flags & CJKScript.HAN) == 0 ? NO : HAN_TYPE;
             doHiragana = (flags & CJKScript.HIRAGANA) == 0 ? NO : HIRAGANA_TYPE;
             doKatakana = (flags & CJKScript.KATAKANA) == 0 ? NO : KATAKANA_TYPE;
             doHangul = (flags & CJKScript.HANGUL) == 0 ? NO : HANGUL_TYPE;
             this.outputUnigrams = outputUnigrams;
             this.termAtt = AddAttribute<ICharTermAttribute>();
             this.typeAtt = AddAttribute<ITypeAttribute>();
             this.offsetAtt = AddAttribute<IOffsetAttribute>();
             this.posIncAtt = AddAttribute<IPositionIncrementAttribute>();
             this.posLengthAtt = AddAttribute<IPositionLengthAttribute>();
         }

         /*
          * much of this complexity revolves around handling the special case of a
          * "lone cjk character" where cjktokenizer would output a unigram. this
          * is also the only time we ever have to captureState.
          */
         public override bool IncrementToken()
         {
             while (true)
             {
                 if (HasBufferedBigram)
                 {

                     // case 1: we have multiple remaining codepoints buffered,
                     // so we can emit a bigram here.

                     if (outputUnigrams)
                     {

                         // when also outputting unigrams, we output the unigram first,
                         // then rewind back to revisit the bigram.
                         // so an input of ABC is A + (rewind)AB + B + (rewind)BC + C
                         // the logic in hasBufferedUnigram ensures we output the C,
                         // even though it did actually have adjacent CJK characters.

                         if (ngramState)
                         {
                             FlushBigram();
                         }
                         else
                         {
                             FlushUnigram();
                             index--;
                         }
                         ngramState = !ngramState;
                     }
                     else
                     {
                         FlushBigram();
                     }
                     return true;
                 }
                 else if (DoNext())
                 {

                     // case 2: look at the token type. should we form any n-grams?

                     string type = typeAtt.Type;
                     if (type == doHan || type == doHiragana || type == doKatakana || type == doHangul)
                     {

                         // acceptable CJK type: we form n-grams from these.
                         // as long as the offsets are aligned, we just add these to our current buffer.
                         // otherwise, we clear the buffer and start over.

                         if (offsetAtt.StartOffset != lastEndOffset) // unaligned, clear queue
                         {
                             if (HasBufferedUnigram)
                             {

                                 // we have a buffered unigram, and we peeked ahead to see if we could form
                                 // a bigram, but we can't, because the offsets are unaligned. capture the state
                                 // of this peeked data to be revisited next time thru the loop, and dump our unigram.

                                 loneState = CaptureState();
                                 FlushUnigram();
                                 return true;
                             }
                             index = 0;
                             bufferLen = 0;
                         }
                         Refill();
                     }
                     else
                     {

                         // not a CJK type: we just return these as-is.

                         if (HasBufferedUnigram)
                         {

                             // we have a buffered unigram, and we peeked ahead to see if we could form
                             // a bigram, but we can't, because its not a CJK type. capture the state
                             // of this peeked data to be revisited next time thru the loop, and dump our unigram.

                             loneState = CaptureState();
                             FlushUnigram();
                             return true;
                         }
                         return true;
                     }
                 }
                 else
                 {

                     // case 3: we have only zero or 1 codepoints buffered,
                     // so not enough to form a bigram. But, we also have no
                     // more input. So if we have a buffered codepoint, emit
                     // a unigram, otherwise, its end of stream.

                     if (HasBufferedUnigram)
                     {
                         FlushUnigram(); // flush our remaining unigram
                         return true;
                     }
                     return false;
                 }
             }
         }

         private State loneState; // rarely used: only for "lone cjk characters", where we emit unigrams

         /// <summary>
         /// looks at next input token, returning false is none is available
         /// </summary>
         private bool DoNext()
         {
             if (loneState != null)
             {
                 RestoreState(loneState);
                 loneState = null;
                 return true;
             }
             else
             {
                 if (exhausted)
                 {
                     return false;
                 }
                 else if (m_input.IncrementToken())
                 {
                     return true;
                 }
                 else
                 {
                     exhausted = true;
                     return false;
                 }
             }
         }

         /// <summary>
         /// refills buffers with new data from the current token.
         /// </summary>
         private void Refill()
         {
             // compact buffers to keep them smallish if they become large
             // just a safety check, but technically we only need the last codepoint
             if (bufferLen > 64)
             {
                 int last = bufferLen - 1;
                 buffer[0] = buffer[last];
                 startOffset[0] = startOffset[last];
                 endOffset[0] = endOffset[last];
                 bufferLen = 1;
                 index -= last;
             }

             char[] termBuffer = termAtt.Buffer;
             int len = termAtt.Length;
             int start = offsetAtt.StartOffset;
             int end = offsetAtt.EndOffset;

             int newSize = bufferLen + len;
             buffer = ArrayUtil.Grow(buffer, newSize);
             startOffset = ArrayUtil.Grow(startOffset, newSize);
             endOffset = ArrayUtil.Grow(endOffset, newSize);
             lastEndOffset = end;

             if (end - start != len)
             {
                 // crazy offsets (modified by synonym or charfilter): just preserve
                 for (int i = 0, cp = 0; i < len; i += Character.CharCount(cp))
                 {
                     cp = buffer[bufferLen] = Character.CodePointAt(termBuffer, i, len);
                     startOffset[bufferLen] = start;
                     endOffset[bufferLen] = end;
                     bufferLen++;
                 }
             }
             else
             {
                 // normal offsets
                 for (int i = 0, cp = 0, cpLen = 0; i < len; i += cpLen)
                 {
                     cp = buffer[bufferLen] = Character.CodePointAt(termBuffer, i, len);
                     cpLen = Character.CharCount(cp);
                     startOffset[bufferLen] = start;
                     start = endOffset[bufferLen] = start + cpLen;
                     bufferLen++;
                 }
             }
         }

         /// <summary>
         /// Flushes a bigram token to output from our buffer
         /// This is the normal case, e.g. ABC -> AB BC
         /// </summary>
         private void FlushBigram()
         {
             ClearAttributes();
             char[] termBuffer = termAtt.ResizeBuffer(4); // maximum bigram length in code units (2 supplementaries)
             int len1 = Character.ToChars(buffer[index], termBuffer, 0);
             int len2 = len1 + Character.ToChars(buffer[index + 1], termBuffer, len1);
             termAtt.Length = len2;
             offsetAtt.SetOffset(startOffset[index], endOffset[index + 1]);
             typeAtt.Type = DOUBLE_TYPE;
             // when outputting unigrams, all bigrams are synonyms that span two unigrams
             if (outputUnigrams)
             {
                 posIncAtt.PositionIncrement = 0;
                 posLengthAtt.PositionLength = 2;
             }
             index++;
         }

         /// <summary>
         /// Flushes a unigram token to output from our buffer.
         /// This happens when we encounter isolated CJK characters, either the whole
         /// CJK string is a single character, or we encounter a CJK character surrounded
         /// by space, punctuation, english, etc, but not beside any other CJK.
         /// </summary>
         private void FlushUnigram()
         {
             ClearAttributes();
             char[] termBuffer = termAtt.ResizeBuffer(2); // maximum unigram length (2 surrogates)
             int len = Character.ToChars(buffer[index], termBuffer, 0);
             termAtt.Length = len;
             offsetAtt.SetOffset(startOffset[index], endOffset[index]);
             typeAtt.Type = SINGLE_TYPE;
             index++;
         }

         /// <summary>
         /// True if we have multiple codepoints sitting in our buffer
         /// </summary>
         private bool HasBufferedBigram => bufferLen - index > 1;

         /// <summary>
         /// True if we have a single codepoint sitting in our buffer, where its future
         /// (whether it is emitted as unigram or forms a bigram) depends upon not-yet-seen
         /// inputs.
         /// </summary>
         private bool HasBufferedUnigram
         {
             get
             {
                 if (outputUnigrams)
                 {
                     // when outputting unigrams always
                     return bufferLen - index == 1;
                 }
                 else
                 {
                     // otherwise its only when we have a lone CJK character
                     return bufferLen == 1 && index == 0;
                 }
             }
         }

         public override void Reset()
         {
             base.Reset();
             bufferLen = 0;
             index = 0;
             lastEndOffset = 0;
             loneState = null;
             exhausted = false;
             ngramState = false;
         }
     }
 }
	using J2N;
	using Lucene.Net.Analysis.Standard;
	using Lucene.Net.Analysis.TokenAttributes;
	using Lucene.Net.Util;

	namespace Lucene.Net.Analysis.Cjk
	{
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	// LUCENENET specific - converted constants from CJKBigramFilter
	// into a flags enum.
	[System.Flags]
	public enum CJKScript
	{
	/// <summary>
	/// bigram flag for Han Ideographs </summary>
	HAN = 1,
	/// <summary>
	/// bigram flag for Hiragana </summary>
	HIRAGANA = 2,
	/// <summary>
	/// bigram flag for Katakana </summary>
	KATAKANA = 4,
	/// <summary>
	/// bigram flag for Hangul </summary>
	HANGUL = 8,
	/// <summary>
	/// bigram flag for all scripts </summary>
	ALL = 0xff
	}

	/// <summary>
	/// Forms bigrams of CJK terms that are generated from <see cref="StandardTokenizer"/>
	/// or ICUTokenizer.
	/// <para>
	/// CJK types are set by these tokenizers, but you can also use
	/// <see cref="CJKBigramFilter(TokenStream, CJKScript)"/> to explicitly control which
	/// of the CJK scripts are turned into bigrams.
	/// </para>
	/// <para>
	/// By default, when a CJK character has no adjacent characters to form
	/// a bigram, it is output in unigram form. If you want to always output
	/// both unigrams and bigrams, set the <code>outputUnigrams</code>
	/// flag in <see cref="CJKBigramFilter.CJKBigramFilter(TokenStream, CJKScript, bool)"/>.
	/// This can be used for a combined unigram+bigram approach.
	/// </para>
	/// <para>
	/// In all cases, all non-CJK input is passed thru unmodified.
	/// </para>
	/// </summary>
	public sealed class CJKBigramFilter : TokenFilter
	{
	// configuration

	// LUCENENET specific - made flags into their own [Flags] enum named CJKScript and de-nested from this type

	/// <summary>
	/// when we emit a bigram, its then marked as this type </summary>
	public const string DOUBLE_TYPE = "<DOUBLE>";
	/// <summary>
	/// when we emit a unigram, its then marked as this type </summary>
	public const string SINGLE_TYPE = "<SINGLE>";

	// the types from standardtokenizer
	private static readonly string HAN_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.IDEOGRAPHIC];
	private static readonly string HIRAGANA_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HIRAGANA];
	private static readonly string KATAKANA_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.KATAKANA];
	private static readonly string HANGUL_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HANGUL];

	// sentinel value for ignoring a script
	private static readonly string NO = "<NO>";

	// these are set to either their type or NO if we want to pass them thru
	private readonly string doHan;
	private readonly string doHiragana;
	private readonly string doKatakana;
	private readonly string doHangul;

	// true if we should output unigram tokens always
	private readonly bool outputUnigrams;
	private bool ngramState; // false = output unigram, true = output bigram

	private readonly ICharTermAttribute termAtt;
	private readonly ITypeAttribute typeAtt;
	private readonly IOffsetAttribute offsetAtt;
	private readonly IPositionIncrementAttribute posIncAtt;
	private readonly IPositionLengthAttribute posLengthAtt;

	// buffers containing codepoint and offsets in parallel
	private int[] buffer = new int[8];
	private int[] startOffset = new int[8];
	private int[] endOffset = new int[8];
	// length of valid buffer
	private int bufferLen;
	// current buffer index
	private int index;

	// the last end offset, to determine if we should bigram across tokens
	private int lastEndOffset;

	private bool exhausted;

	/// <summary>
	/// Calls <see cref="CJKBigramFilter.CJKBigramFilter(TokenStream, CJKScript)">
	/// CJKBigramFilter(@in, CJKScript.HAN \| CJKScript.HIRAGANA \| CJKScript.KATAKANA \| CJKScript.HANGUL)</see>
	/// </summary>
	/// <param name="in">
	/// Input <see cref="TokenStream"/> </param>
	public CJKBigramFilter(TokenStream @in)
	: this(@in, CJKScript.HAN \| CJKScript.HIRAGANA \| CJKScript.KATAKANA \| CJKScript.HANGUL)
	{
	}

	/// <summary>
	/// Calls <see cref="CJKBigramFilter.CJKBigramFilter(TokenStream, CJKScript, bool)">
	/// CJKBigramFilter(in, flags, false)</see>
	/// </summary>
	/// <param name="in">
	/// Input <see cref="TokenStream"/> </param>
	/// <param name="flags"> OR'ed set from <see cref="CJKScript.HAN"/>, <see cref="CJKScript.HIRAGANA"/>,
	/// <see cref="CJKScript.KATAKANA"/>, <see cref="CJKScript.HANGUL"/> </param>
	public CJKBigramFilter(TokenStream @in, CJKScript flags)
	: this(@in, flags, false)
	{
	}

	/// <summary>
	/// Create a new <see cref="CJKBigramFilter"/>, specifying which writing systems should be bigrammed,
	/// and whether or not unigrams should also be output. </summary>
	/// <param name="in">
	/// Input <see cref="TokenStream"/> </param>
	/// <param name="flags"> OR'ed set from <see cref="CJKScript.HAN"/>, <see cref="CJKScript.HIRAGANA"/>,
	/// <see cref="CJKScript.KATAKANA"/>, <see cref="CJKScript.HANGUL"/> </param>
	/// <param name="outputUnigrams"> true if unigrams for the selected writing systems should also be output.
	/// when this is false, this is only done when there are no adjacent characters to form
	/// a bigram. </param>
	public CJKBigramFilter(TokenStream @in, CJKScript flags, bool outputUnigrams)
	: base(@in)
	{
	doHan = (flags & CJKScript.HAN) == 0 ? NO : HAN_TYPE;
	doHiragana = (flags & CJKScript.HIRAGANA) == 0 ? NO : HIRAGANA_TYPE;
	doKatakana = (flags & CJKScript.KATAKANA) == 0 ? NO : KATAKANA_TYPE;
	doHangul = (flags & CJKScript.HANGUL) == 0 ? NO : HANGUL_TYPE;
	this.outputUnigrams = outputUnigrams;
	this.termAtt = AddAttribute<ICharTermAttribute>();
	this.typeAtt = AddAttribute<ITypeAttribute>();
	this.offsetAtt = AddAttribute<IOffsetAttribute>();
	this.posIncAtt = AddAttribute<IPositionIncrementAttribute>();
	this.posLengthAtt = AddAttribute<IPositionLengthAttribute>();
	}

	/*
	* much of this complexity revolves around handling the special case of a
	* "lone cjk character" where cjktokenizer would output a unigram. this
	* is also the only time we ever have to captureState.
	*/
	public override bool IncrementToken()
	{
	while (true)
	{
	if (HasBufferedBigram)
	{

	// case 1: we have multiple remaining codepoints buffered,
	// so we can emit a bigram here.

	if (outputUnigrams)
	{

	// when also outputting unigrams, we output the unigram first,
	// then rewind back to revisit the bigram.
	// so an input of ABC is A + (rewind)AB + B + (rewind)BC + C
	// the logic in hasBufferedUnigram ensures we output the C,
	// even though it did actually have adjacent CJK characters.

	if (ngramState)
	{
	FlushBigram();
	}
	else
	{
	FlushUnigram();
	index--;
	}
	ngramState = !ngramState;
	}
	else
	{
	FlushBigram();
	}
	return true;
	}
	else if (DoNext())
	{

	// case 2: look at the token type. should we form any n-grams?

	string type = typeAtt.Type;
	if (type == doHan \|\| type == doHiragana \|\| type == doKatakana \|\| type == doHangul)
	{

	// acceptable CJK type: we form n-grams from these.
	// as long as the offsets are aligned, we just add these to our current buffer.
	// otherwise, we clear the buffer and start over.

	if (offsetAtt.StartOffset != lastEndOffset) // unaligned, clear queue
	{
	if (HasBufferedUnigram)
	{

	// we have a buffered unigram, and we peeked ahead to see if we could form
	// a bigram, but we can't, because the offsets are unaligned. capture the state
	// of this peeked data to be revisited next time thru the loop, and dump our unigram.

	loneState = CaptureState();
	FlushUnigram();
	return true;
	}
	index = 0;
	bufferLen = 0;
	}
	Refill();
	}
	else
	{

	// not a CJK type: we just return these as-is.

	if (HasBufferedUnigram)
	{

	// we have a buffered unigram, and we peeked ahead to see if we could form
	// a bigram, but we can't, because its not a CJK type. capture the state
	// of this peeked data to be revisited next time thru the loop, and dump our unigram.

	loneState = CaptureState();
	FlushUnigram();
	return true;
	}
	return true;
	}
	}
	else
	{

	// case 3: we have only zero or 1 codepoints buffered,
	// so not enough to form a bigram. But, we also have no
	// more input. So if we have a buffered codepoint, emit
	// a unigram, otherwise, its end of stream.

	if (HasBufferedUnigram)
	{
	FlushUnigram(); // flush our remaining unigram
	return true;
	}
	return false;
	}
	}
	}

	private State loneState; // rarely used: only for "lone cjk characters", where we emit unigrams

	/// <summary>
	/// looks at next input token, returning false is none is available
	/// </summary>
	private bool DoNext()
	{
	if (loneState != null)
	{
	RestoreState(loneState);
	loneState = null;
	return true;
	}
	else
	{
	if (exhausted)
	{
	return false;
	}
	else if (m_input.IncrementToken())
	{
	return true;
	}
	else
	{
	exhausted = true;
	return false;
	}
	}
	}

	/// <summary>
	/// refills buffers with new data from the current token.
	/// </summary>
	private void Refill()
	{
	// compact buffers to keep them smallish if they become large
	// just a safety check, but technically we only need the last codepoint
	if (bufferLen > 64)
	{
	int last = bufferLen - 1;
	buffer[0] = buffer[last];
	startOffset[0] = startOffset[last];
	endOffset[0] = endOffset[last];
	bufferLen = 1;
	index -= last;
	}

	char[] termBuffer = termAtt.Buffer;
	int len = termAtt.Length;
	int start = offsetAtt.StartOffset;
	int end = offsetAtt.EndOffset;

	int newSize = bufferLen + len;
	buffer = ArrayUtil.Grow(buffer, newSize);
	startOffset = ArrayUtil.Grow(startOffset, newSize);
	endOffset = ArrayUtil.Grow(endOffset, newSize);
	lastEndOffset = end;

	if (end - start != len)
	{
	// crazy offsets (modified by synonym or charfilter): just preserve
	for (int i = 0, cp = 0; i < len; i += Character.CharCount(cp))
	{
	cp = buffer[bufferLen] = Character.CodePointAt(termBuffer, i, len);
	startOffset[bufferLen] = start;
	endOffset[bufferLen] = end;
	bufferLen++;
	}
	}
	else
	{
	// normal offsets
	for (int i = 0, cp = 0, cpLen = 0; i < len; i += cpLen)
	{
	cp = buffer[bufferLen] = Character.CodePointAt(termBuffer, i, len);
	cpLen = Character.CharCount(cp);
	startOffset[bufferLen] = start;
	start = endOffset[bufferLen] = start + cpLen;
	bufferLen++;
	}
	}
	}

	/// <summary>
	/// Flushes a bigram token to output from our buffer
	/// This is the normal case, e.g. ABC -> AB BC
	/// </summary>
	private void FlushBigram()
	{
	ClearAttributes();
	char[] termBuffer = termAtt.ResizeBuffer(4); // maximum bigram length in code units (2 supplementaries)
	int len1 = Character.ToChars(buffer[index], termBuffer, 0);
	int len2 = len1 + Character.ToChars(buffer[index + 1], termBuffer, len1);
	termAtt.Length = len2;
	offsetAtt.SetOffset(startOffset[index], endOffset[index + 1]);
	typeAtt.Type = DOUBLE_TYPE;
	// when outputting unigrams, all bigrams are synonyms that span two unigrams
	if (outputUnigrams)
	{
	posIncAtt.PositionIncrement = 0;
	posLengthAtt.PositionLength = 2;
	}
	index++;
	}

	/// <summary>
	/// Flushes a unigram token to output from our buffer.
	/// This happens when we encounter isolated CJK characters, either the whole
	/// CJK string is a single character, or we encounter a CJK character surrounded
	/// by space, punctuation, english, etc, but not beside any other CJK.
	/// </summary>
	private void FlushUnigram()
	{
	ClearAttributes();
	char[] termBuffer = termAtt.ResizeBuffer(2); // maximum unigram length (2 surrogates)
	int len = Character.ToChars(buffer[index], termBuffer, 0);
	termAtt.Length = len;
	offsetAtt.SetOffset(startOffset[index], endOffset[index]);
	typeAtt.Type = SINGLE_TYPE;
	index++;
	}

	/// <summary>
	/// True if we have multiple codepoints sitting in our buffer
	/// </summary>
	private bool HasBufferedBigram => bufferLen - index > 1;

	/// <summary>
	/// True if we have a single codepoint sitting in our buffer, where its future
	/// (whether it is emitted as unigram or forms a bigram) depends upon not-yet-seen
	/// inputs.
	/// </summary>
	private bool HasBufferedUnigram
	{
	get
	{
	if (outputUnigrams)
	{
	// when outputting unigrams always
	return bufferLen - index == 1;
	}
	else
	{
	// otherwise its only when we have a lone CJK character
	return bufferLen == 1 && index == 0;
	}
	}
	}

	public override void Reset()
	{
	base.Reset();
	bufferLen = 0;
	index = 0;
	lastEndOffset = 0;
	loneState = null;
	exhausted = false;
	ngramState = false;
	}
	}
	}