blob: bfdfb4316bfb91f2b1c1ab7fe3a27334594b3a94 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.miscellaneous;
import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenStreamToAutomaton;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeReflector;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.LimitedFiniteStringsIterator;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.TooComplexToDeterminizeException;
import org.apache.lucene.util.automaton.Transition;
import org.apache.lucene.util.fst.Util;
/**
* Concatenates/Joins every incoming token with a separator into one output token for every path
* through the token stream (which is a graph). In simple cases this yields one token, but in the
* presence of any tokens with a zero positionIncrement (e.g. synonyms) it will be more. This filter
* uses the token bytes, position increment, and position length of the incoming stream. Other
* attributes are not used or manipulated.
*
* @lucene.experimental
*/
public final class ConcatenateGraphFilter extends TokenStream {
  /*
   * This filter consumes the provided token stream entirely (lazily, on the first call to
   * incrementToken()), converts it to an Automaton, and then emits one output token per
   * accepted string of that automaton through the
   * {@link org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute} attribute.
   * NOTE(review): an earlier version of this comment referenced a PayloadAttribute and a
   * setPayload(org.apache.lucene.util.BytesRef) method that do not exist in this class --
   * presumably leftover from a related completion token stream; verify against history.
   */

  /** Represents the default separator between tokens. */
  public static final int SEP_LABEL = TokenStreamToAutomaton.POS_SEP;

  /** Default cap on the number of graph paths expanded; also bounds determinization effort. */
  public static final int DEFAULT_MAX_GRAPH_EXPANSIONS = Operations.DEFAULT_MAX_DETERMINIZED_STATES;

  /** Default separator character placed between concatenated tokens ({@link #SEP_LABEL}). */
  public static final Character DEFAULT_TOKEN_SEPARATOR = SEP_LABEL;

  /** By default, input tokens are separated by {@link #SEP_LABEL} in the output. */
  public static final boolean DEFAULT_PRESERVE_SEP = true;

  /** By default, holes (missing positions) remain visible as consecutive separators. */
  public static final boolean DEFAULT_PRESERVE_POSITION_INCREMENTS = true;

  // Output term bytes (UTF-8) for the current concatenated path.
  private final BytesRefBuilderTermAttribute bytesAtt =
      addAttribute(BytesRefBuilderTermAttribute.class);
  private final PositionIncrementAttribute posIncrAtt =
      addAttribute(PositionIncrementAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

  private final TokenStream inputTokenStream;
  // null means tokens are concatenated with no separator at all
  private final Character tokenSeparator;
  private final boolean preservePositionIncrements;
  private final int maxGraphExpansions;

  // Lazily created in incrementToken(); iterates accepted strings of the automaton.
  private LimitedFiniteStringsIterator finiteStrings;
  // Only non-null if something else added a CharTermAttribute to this stream (see reset()).
  private CharTermAttribute charTermAttribute;
  private boolean wasReset = false;
  // End offset reported by the input stream once consumed; close() resets it to -1, and
  // end() skips the offset update in that case.
  private int endOffset;

  /**
   * Creates a token stream to convert <code>input</code> to a token stream of accepted strings by
   * its token stream graph.
   *
   * <p>This constructor uses the default settings of the constants in this class.
   */
  public ConcatenateGraphFilter(TokenStream inputTokenStream) {
    this(
        inputTokenStream,
        DEFAULT_TOKEN_SEPARATOR,
        DEFAULT_PRESERVE_POSITION_INCREMENTS,
        DEFAULT_MAX_GRAPH_EXPANSIONS);
  }

  /**
   * Creates a token stream to convert <code>input</code> to a token stream of accepted strings by
   * its token stream graph.
   *
   * @param inputTokenStream The input/incoming TokenStream
   * @param tokenSeparator Separator to use for concatenation. Can be null, in this case tokens will
   *     be concatenated without any separators.
   * @param preservePositionIncrements Whether to add an empty token for missing positions. The
   *     effect is a consecutive {@link #SEP_LABEL}. When false, it's as if there were no missing
   *     positions (we pretend the surrounding tokens were adjacent).
   * @param maxGraphExpansions If the tokenStream graph has more than this many possible paths
   *     through, then we'll throw {@link TooComplexToDeterminizeException} to preserve the
   *     stability and memory of the machine.
   * @throws TooComplexToDeterminizeException if the tokenStream graph has more than {@code
   *     maxGraphExpansions} expansions
   */
  public ConcatenateGraphFilter(
      TokenStream inputTokenStream,
      Character tokenSeparator,
      boolean preservePositionIncrements,
      int maxGraphExpansions) {
    // Don't call the super(input) ctor - this is a true delegate and has a new attribute source
    // since we consume
    // the input stream entirely in the first call to incrementToken
    this.inputTokenStream = inputTokenStream;
    this.tokenSeparator = tokenSeparator;
    this.preservePositionIncrements = preservePositionIncrements;
    this.maxGraphExpansions = maxGraphExpansions;
  }

  /**
   * Calls {@link #ConcatenateGraphFilter(org.apache.lucene.analysis.TokenStream,
   * java.lang.Character, boolean, int)}
   *
   * @param preserveSep Whether {@link #SEP_LABEL} should separate the input tokens in the
   *     concatenated token
   */
  public ConcatenateGraphFilter(
      TokenStream inputTokenStream,
      boolean preserveSep,
      boolean preservePositionIncrements,
      int maxGraphExpansions) {
    this(
        inputTokenStream,
        (preserveSep) ? DEFAULT_TOKEN_SEPARATOR : null,
        preservePositionIncrements,
        maxGraphExpansions);
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    // we only capture this if we really need it to save the UTF-8 to UTF-16 conversion
    charTermAttribute = getAttribute(CharTermAttribute.class); // may return null
    wasReset = true;
  }

  /**
   * On the first call, consumes the entire input stream and builds the automaton; each call then
   * emits the next accepted string (one concatenated token per graph path) until exhausted.
   */
  @Override
  public boolean incrementToken() throws IOException {
    if (finiteStrings == null) {
      if (wasReset == false) {
        throw new IllegalStateException("reset() missing before incrementToken");
      }
      // lazy init/consume
      Automaton automaton =
          toAutomaton(); // calls reset(), incrementToken() repeatedly, and end() on
      // inputTokenStream
      finiteStrings = new LimitedFiniteStringsIterator(automaton, maxGraphExpansions);
      // note: would be nice to know the startOffset but toAutomaton doesn't capture it. We'll
      // assume 0
      endOffset = inputTokenStream.getAttribute(OffsetAttribute.class).endOffset();
    }

    IntsRef string = finiteStrings.next();
    if (string == null) {
      return false;
    }

    clearAttributes();

    if (finiteStrings.size() > 1) { // if number of iterated strings so far is more than one...
      posIncrAtt.setPositionIncrement(0); // stacked
    }

    offsetAtt.setOffset(0, endOffset);

    Util.toBytesRef(string, bytesAtt.builder()); // now we have UTF-8
    if (charTermAttribute != null) {
      // mirror the bytes into the CharTermAttribute so downstream consumers see the term text
      charTermAttribute.setLength(0);
      charTermAttribute.append(bytesAtt.toUTF16());
    }

    return true;
  }

  @Override
  public void end() throws IOException {
    super.end();
    if (finiteStrings == null) { // thus inputTokenStream hasn't yet received end()
      // the input TS may really want to see "end()" called even if incrementToken hasn't.
      inputTokenStream.end();
    } // else we already eagerly consumed inputTokenStream including end()
    if (endOffset != -1) {
      offsetAtt.setOffset(0, endOffset);
    }
  }

  @Override
  public void close() throws IOException {
    super.close();
    // delegate lifecycle. Note toAutomaton does not close the stream
    inputTokenStream.close();
    finiteStrings = null;
    wasReset = false; // reset
    endOffset = -1; // reset
  }

  /**
   * Converts the tokenStream to an automaton, treating the transition labels as utf-8. Does *not*
   * close it.
   */
  public Automaton toAutomaton() throws IOException {
    return toAutomaton(false);
  }

  /** Converts the tokenStream to an automaton. Does *not* close it. */
  public Automaton toAutomaton(boolean unicodeAware) throws IOException {
    // TODO refactor this
    // maybe we could hook up a modified automaton from TermAutomatonQuery here?

    // Create corresponding automaton: labels are bytes
    // from each analyzed token, with byte 0 used as
    // separator between tokens:
    final TokenStreamToAutomaton tsta;
    if (tokenSeparator != null) {
      tsta = new EscapingTokenStreamToAutomaton(tokenSeparator);
    } else {
      // When we're not preserving sep, we don't steal 0xff
      // byte, so we don't need to do any escaping:
      tsta = new TokenStreamToAutomaton();
    }
    tsta.setPreservePositionIncrements(preservePositionIncrements);
    tsta.setUnicodeArcs(unicodeAware);

    Automaton automaton = tsta.toAutomaton(inputTokenStream);

    // TODO: we can optimize this somewhat by determinizing
    // while we convert

    // Replace POS_SEP arcs with the configured separator (or epsilon when none):
    automaton = replaceSep(automaton, tokenSeparator);
    // This automaton should not blow up during determinize:
    return Operations.determinize(automaton, maxGraphExpansions);
  }

  /** Just escapes the {@link #SEP_LABEL} byte with an extra. */
  private static final class EscapingTokenStreamToAutomaton extends TokenStreamToAutomaton {

    // scratch buffer reused across changeToken() calls
    final BytesRefBuilder spare = new BytesRefBuilder();
    final byte sepLabel;

    public EscapingTokenStreamToAutomaton(int sepLabel) {
      assert sepLabel <= Byte.MAX_VALUE;
      this.sepLabel = (byte) sepLabel;
    }

    @Override
    protected BytesRef changeToken(BytesRef in) {
      int upto = 0;
      for (int i = 0; i < in.length; i++) {
        byte b = in.bytes[in.offset + i];
        if (b == sepLabel) {
          // escape by doubling the separator byte
          spare.grow(upto + 2);
          spare.setByteAt(upto++, sepLabel);
          spare.setByteAt(upto++, b);
        } else {
          spare.grow(upto + 1);
          spare.setByteAt(upto++, b);
        }
      }
      spare.setLength(upto);
      return spare.get();
    }
  }

  // Replaces SEP with epsilon or remaps them if
  // we were asked to preserve them:
  private static Automaton replaceSep(Automaton a, Character tokenSeparator) {
    Automaton result = new Automaton();

    // Copy all states over
    int numStates = a.getNumStates();
    for (int s = 0; s < numStates; s++) {
      result.createState();
      result.setAccept(s, a.isAccept(s));
    }

    // Go in reverse topo sort so we know we only have to
    // make one pass:
    Transition t = new Transition();
    int[] topoSortStates = Operations.topoSortStates(a);
    for (int i = 0; i < topoSortStates.length; i++) {
      int state = topoSortStates[topoSortStates.length - 1 - i];
      int count = a.initTransition(state, t);
      for (int j = 0; j < count; j++) {
        a.getNextTransition(t);
        if (t.min == TokenStreamToAutomaton.POS_SEP) {
          assert t.max == TokenStreamToAutomaton.POS_SEP;
          if (tokenSeparator != null) {
            // Remap to tokenSeparator:
            result.addTransition(state, t.dest, tokenSeparator);
          } else {
            result.addEpsilon(state, t.dest);
          }
        } else if (t.min == TokenStreamToAutomaton.HOLE) {
          assert t.max == TokenStreamToAutomaton.HOLE;

          // Just remove the hole: there will then be two
          // SEP tokens next to each other, which will only
          // match another hole at search time. Note that
          // it will also match an empty-string token ... if
          // that's somehow a problem we can always map HOLE
          // to a dedicated byte (and escape it in the
          // input).
          result.addEpsilon(state, t.dest);
        } else {
          result.addTransition(state, t.dest, t.min, t.max);
        }
      }
    }

    result.finishState();

    return result;
  }

  /**
   * Attribute providing access to the term builder and UTF-16 conversion
   *
   * @lucene.internal
   */
  public interface BytesRefBuilderTermAttribute extends TermToBytesRefAttribute {
    /** Returns the builder from which the term is derived. */
    BytesRefBuilder builder();

    /** Returns the term represented as UTF-16 */
    CharSequence toUTF16();
  }

  /**
   * Implementation of {@link BytesRefBuilderTermAttribute}
   *
   * @lucene.internal
   */
  public static final class BytesRefBuilderTermAttributeImpl extends AttributeImpl
      implements BytesRefBuilderTermAttribute, TermToBytesRefAttribute {
    private final BytesRefBuilder bytes = new BytesRefBuilder();
    // lazily allocated; only needed when a UTF-16 view of the term is requested
    private transient CharsRefBuilder charsRef;

    /** Sole constructor no-op */
    public BytesRefBuilderTermAttributeImpl() {}

    @Override
    public BytesRefBuilder builder() {
      return bytes;
    }

    @Override
    public BytesRef getBytesRef() {
      return bytes.get();
    }

    @Override
    public void clear() {
      bytes.clear();
    }

    @Override
    public void copyTo(AttributeImpl target) {
      BytesRefBuilderTermAttributeImpl other = (BytesRefBuilderTermAttributeImpl) target;
      other.bytes.copyBytes(bytes);
    }

    @Override
    public AttributeImpl clone() {
      BytesRefBuilderTermAttributeImpl other = new BytesRefBuilderTermAttributeImpl();
      copyTo(other);
      return other;
    }

    @Override
    public void reflectWith(AttributeReflector reflector) {
      reflector.reflect(TermToBytesRefAttribute.class, "bytes", getBytesRef());
    }

    @Override
    public CharSequence toUTF16() {
      if (charsRef == null) {
        charsRef = new CharsRefBuilder();
      }
      charsRef.copyUTF8Bytes(getBytesRef());
      return charsRef.get();
    }
  }
}