blob: 7d79b8427cc5212300496bb799a036124f58248d [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.cjk;
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.ArrayUtil;
/**
* Forms bigrams of CJK terms that are generated from StandardTokenizer
* or ICUTokenizer.
* <p>
* CJK types are set by these tokenizers, but you can also use
* {@link #CJKBigramFilter(TokenStream, int)} to explicitly control which
* of the CJK scripts are turned into bigrams.
* <p>
* By default, when a CJK character has no adjacent characters to form
* a bigram, it is output in unigram form. If you want to always output
* both unigrams and bigrams, set the <code>outputUnigrams</code>
* flag in {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int, boolean)}.
* This can be used for a combined unigram+bigram approach.
* <p>
* Unlike ICUTokenizer, StandardTokenizer does not split at script boundaries.
* Korean Hangul characters are treated the same as many other scripts'
* letters, and as a result, StandardTokenizer can produce tokens that mix
* Hangul and non-Hangul characters, e.g. "한국abc". Such mixed-script tokens
* are typed as <code>&lt;ALPHANUM&gt;</code> rather than
* <code>&lt;HANGUL&gt;</code>, and as a result, will not be converted to
* bigrams by CJKBigramFilter.
*
* In all cases, all non-CJK input is passed through unmodified.
*/
public final class CJKBigramFilter extends TokenFilter {
  // configuration
  /** bigram flag for Han Ideographs */
  public static final int HAN = 1;
  /** bigram flag for Hiragana */
  public static final int HIRAGANA = 2;
  /** bigram flag for Katakana */
  public static final int KATAKANA = 4;
  /** bigram flag for Hangul */
  public static final int HANGUL = 8;

  /** when we emit a bigram, it's then marked as this type */
  public static final String DOUBLE_TYPE = "<DOUBLE>";
  /** when we emit a unigram, it's then marked as this type */
  public static final String SINGLE_TYPE = "<SINGLE>";

  // the types from standardtokenizer
  private static final String HAN_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.IDEOGRAPHIC];
  private static final String HIRAGANA_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HIRAGANA];
  private static final String KATAKANA_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.KATAKANA];
  private static final String HANGUL_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HANGUL];

  // sentinel value for ignoring a script
  private static final Object NO = new Object();

  // these are set to either their type or NO if we want to pass them thru
  // (compared by reference identity against typeAtt.type() in incrementToken;
  // this works because the TOKEN_TYPES entries are the interned String instances
  // the tokenizers themselves set)
  private final Object doHan;
  private final Object doHiragana;
  private final Object doKatakana;
  private final Object doHangul;

  // true if we should output unigram tokens always
  private final boolean outputUnigrams;
  private boolean ngramState; // false = output unigram, true = output bigram

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
  private final PositionLengthAttribute posLengthAtt = addAttribute(PositionLengthAttribute.class);

  // buffers containing codepoint and offsets in parallel
  int buffer[] = new int[8];
  int startOffset[] = new int[8];
  int endOffset[] = new int[8];
  // length of valid buffer
  int bufferLen;
  // current buffer index
  int index;

  // the last end offset, to determine if we should bigram across tokens
  int lastEndOffset;

  // true once the wrapped input has returned false; prevents calling
  // input.incrementToken() again after end-of-stream
  private boolean exhausted;

  /**
   * Calls {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int)
   * CJKBigramFilter(in, HAN | HIRAGANA | KATAKANA | HANGUL)}
   */
  public CJKBigramFilter(TokenStream in) {
    this(in, HAN | HIRAGANA | KATAKANA | HANGUL);
  }

  /**
   * Calls {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int, boolean)
   * CJKBigramFilter(in, flags, false)}
   */
  public CJKBigramFilter(TokenStream in, int flags) {
    this(in, flags, false);
  }

  /**
   * Create a new CJKBigramFilter, specifying which writing systems should be bigrammed,
   * and whether or not unigrams should also be output.
   * @param flags OR'ed set from {@link CJKBigramFilter#HAN}, {@link CJKBigramFilter#HIRAGANA},
   * {@link CJKBigramFilter#KATAKANA}, {@link CJKBigramFilter#HANGUL}
   * @param outputUnigrams true if unigrams for the selected writing systems should also be output.
   * when this is false, this is only done when there are no adjacent characters to form
   * a bigram.
   */
  public CJKBigramFilter(TokenStream in, int flags, boolean outputUnigrams) {
    super(in);
    doHan = (flags & HAN) == 0 ? NO : HAN_TYPE;
    doHiragana = (flags & HIRAGANA) == 0 ? NO : HIRAGANA_TYPE;
    doKatakana = (flags & KATAKANA) == 0 ? NO : KATAKANA_TYPE;
    doHangul = (flags & HANGUL) == 0 ? NO : HANGUL_TYPE;
    this.outputUnigrams = outputUnigrams;
  }

  /*
   * much of this complexity revolves around handling the special case of a
   * "lone cjk character" where cjktokenizer would output a unigram. this
   * is also the only time we ever have to captureState.
   */
  @Override
  public boolean incrementToken() throws IOException {
    while (true) {
      if (hasBufferedBigram()) {

        // case 1: we have multiple remaining codepoints buffered,
        // so we can emit a bigram here.

        if (outputUnigrams) {

          // when also outputting unigrams, we output the unigram first,
          // then rewind back to revisit the bigram.
          // so an input of ABC is A + (rewind)AB + B + (rewind)BC + C
          // the logic in hasBufferedUnigram ensures we output the C,
          // even though it did actually have adjacent CJK characters.

          if (ngramState) {
            flushBigram();
          } else {
            flushUnigram();
            // flushUnigram() advanced index; step back so the bigram
            // starting at this same codepoint is emitted on the next call
            index--;
          }
          // alternate between unigram and bigram output
          ngramState = !ngramState;
        } else {
          flushBigram();
        }
        return true;
      } else if (doNext()) {

        // case 2: look at the token type. should we form any n-grams?

        String type = typeAtt.type();
        if (type == doHan || type == doHiragana || type == doKatakana || type == doHangul) {

          // acceptable CJK type: we form n-grams from these.
          // as long as the offsets are aligned, we just add these to our current buffer.
          // otherwise, we clear the buffer and start over.

          if (offsetAtt.startOffset() != lastEndOffset) { // unaligned, clear queue
            if (hasBufferedUnigram()) {

              // we have a buffered unigram, and we peeked ahead to see if we could form
              // a bigram, but we can't, because the offsets are unaligned. capture the state
              // of this peeked data to be revisited next time thru the loop, and dump our unigram.

              loneState = captureState();
              flushUnigram();
              return true;
            }
            index = 0;
            bufferLen = 0;
          }
          refill();
        } else {

          // not a CJK type: we just return these as-is.

          if (hasBufferedUnigram()) {

            // we have a buffered unigram, and we peeked ahead to see if we could form
            // a bigram, but we can't, because it's not a CJK type. capture the state
            // of this peeked data to be revisited next time thru the loop, and dump our unigram.

            loneState = captureState();
            flushUnigram();
            return true;
          }
          return true;
        }
      } else {

        // case 3: we have only zero or 1 codepoints buffered,
        // so not enough to form a bigram. But, we also have no
        // more input. So if we have a buffered codepoint, emit
        // a unigram, otherwise, it's end of stream.

        if (hasBufferedUnigram()) {
          flushUnigram(); // flush our remaining unigram
          return true;
        }
        return false;
      }
    }
  }

  private State loneState; // rarely used: only for "lone cjk characters", where we emit unigrams

  /**
   * looks at next input token, returning false if none is available
   */
  private boolean doNext() throws IOException {
    if (loneState != null) {
      // replay the token we peeked at (and captured) on a previous call
      restoreState(loneState);
      loneState = null;
      return true;
    } else {
      if (exhausted) {
        return false;
      } else if (input.incrementToken()) {
        return true;
      } else {
        exhausted = true;
        return false;
      }
    }
  }

  /**
   * refills buffers with new data from the current token.
   */
  private void refill() {

    // compact buffers to keep them smallish if they become large
    // just a safety check, but technically we only need the last codepoint

    if (bufferLen > 64) {
      int last = bufferLen - 1;
      buffer[0] = buffer[last];
      startOffset[0] = startOffset[last];
      endOffset[0] = endOffset[last];
      bufferLen = 1;
      // keep index pointing at the same logical codepoint after compaction
      index -= last;
    }

    char termBuffer[] = termAtt.buffer();
    int len = termAtt.length();
    int start = offsetAtt.startOffset();
    int end = offsetAtt.endOffset();

    int newSize = bufferLen + len;
    buffer = ArrayUtil.grow(buffer, newSize);
    startOffset = ArrayUtil.grow(startOffset, newSize);
    endOffset = ArrayUtil.grow(endOffset, newSize);
    lastEndOffset = end;

    if (end - start != len) {

      // crazy offsets (modified by synonym or charfilter): just preserve
      // the token's original start/end for every codepoint, since per-codepoint
      // offsets can't be reconstructed

      for (int i = 0, cp = 0; i < len; i += Character.charCount(cp)) {
        cp = buffer[bufferLen] = Character.codePointAt(termBuffer, i, len);
        startOffset[bufferLen] = start;
        endOffset[bufferLen] = end;
        bufferLen++;
      }
    } else {

      // normal offsets: each codepoint's offsets advance by its char count
      // (so supplementary characters span two code units)

      for (int i = 0, cp = 0, cpLen = 0; i < len; i += cpLen) {
        cp = buffer[bufferLen] = Character.codePointAt(termBuffer, i, len);
        cpLen = Character.charCount(cp);
        startOffset[bufferLen] = start;
        start = endOffset[bufferLen] = start + cpLen;
        bufferLen++;
      }
    }
  }

  /**
   * Flushes a bigram token to output from our buffer
   * This is the normal case, e.g. ABC -&gt; AB BC
   */
  private void flushBigram() {
    clearAttributes();
    char termBuffer[] = termAtt.resizeBuffer(4); // maximum bigram length in code units (2 supplementaries)
    int len1 = Character.toChars(buffer[index], termBuffer, 0);
    int len2 = len1 + Character.toChars(buffer[index+1], termBuffer, len1);
    termAtt.setLength(len2);
    offsetAtt.setOffset(startOffset[index], endOffset[index+1]);
    typeAtt.setType(DOUBLE_TYPE);
    // when outputting unigrams, all bigrams are synonyms that span two unigrams
    if (outputUnigrams) {
      posIncAtt.setPositionIncrement(0);
      posLengthAtt.setPositionLength(2);
    }
    // advance by one codepoint only, so consecutive bigrams overlap (AB, BC, ...)
    index++;
  }

  /**
   * Flushes a unigram token to output from our buffer.
   * This happens when we encounter isolated CJK characters, either the whole
   * CJK string is a single character, or we encounter a CJK character surrounded
   * by space, punctuation, english, etc, but not beside any other CJK.
   */
  private void flushUnigram() {
    clearAttributes();
    char termBuffer[] = termAtt.resizeBuffer(2); // maximum unigram length (2 surrogates)
    int len = Character.toChars(buffer[index], termBuffer, 0);
    termAtt.setLength(len);
    offsetAtt.setOffset(startOffset[index], endOffset[index]);
    typeAtt.setType(SINGLE_TYPE);
    index++;
  }

  /**
   * True if we have multiple codepoints sitting in our buffer
   */
  private boolean hasBufferedBigram() {
    return bufferLen - index > 1;
  }

  /**
   * True if we have a single codepoint sitting in our buffer, where its future
   * (whether it is emitted as unigram or forms a bigram) depends upon not-yet-seen
   * inputs.
   */
  private boolean hasBufferedUnigram() {
    if (outputUnigrams) {
      // when outputting unigrams always
      return bufferLen - index == 1;
    } else {
      // otherwise it's only when we have a lone CJK character
      return bufferLen == 1 && index == 0;
    }
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    // reset all per-stream state so the filter can be reused on a new input
    bufferLen = 0;
    index = 0;
    lastEndOffset = 0;
    loneState = null;
    exhausted = false;
    ngramState = false;
  }
}