lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.analysis;


 import java.io.IOException;

 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.RollingBuffer;
 import org.apache.lucene.util.automaton.Automaton;

 // TODO: maybe also toFST?  then we can translate atts into FST outputs/weights

 /** Consumes a TokenStream and creates an {@link Automaton}
  *  where the transition labels are UTF8 bytes (or Unicode
  *  code points if unicodeArcs is true) from the {@link
  *  TermToBytesRefAttribute}.  Between tokens we insert
  *  POS_SEP and for holes we insert HOLE.
  *
  * @lucene.experimental */
 public class TokenStreamToAutomaton {

   private boolean preservePositionIncrements;
   private boolean finalOffsetGapAsHole;
   private boolean unicodeArcs;

   /** Sole constructor. */
   public TokenStreamToAutomaton() {
     this.preservePositionIncrements = true;
   }

   /** Whether to generate holes in the automaton for missing positions, <code>true</code> by default. */
   public void setPreservePositionIncrements(boolean enablePositionIncrements) {
     this.preservePositionIncrements = enablePositionIncrements;
   }

   /** If true, any final offset gaps will result in adding a position hole. */
   public void setFinalOffsetGapAsHole(boolean finalOffsetGapAsHole) {
     this.finalOffsetGapAsHole = finalOffsetGapAsHole;
   }

   /** Whether to make transition labels Unicode code points instead of UTF8 bytes,
    *  <code>false</code> by default */
   public void setUnicodeArcs(boolean unicodeArcs) {
     this.unicodeArcs = unicodeArcs;
   }

   private static class Position implements RollingBuffer.Resettable {
     // Any tokens that ended at our position arrive to this state:
     int arriving = -1;

     // Any tokens that start at our position leave from this state:
     int leaving = -1;

     @Override
     public void reset() {
       arriving = -1;
       leaving = -1;
     }
   }

   private static class Positions extends RollingBuffer<Position> {
     @Override
     protected Position newInstance() {
       return new Position();
     }
   }

   /** Subclass and implement this if you need to change the
    *  token (such as escaping certain bytes) before it's
    *  turned into a graph. */
   protected BytesRef changeToken(BytesRef in) {
     return in;
   }

   /** We create transition between two adjacent tokens. */
   public static final int POS_SEP = 0x001f;

   /** We add this arc to represent a hole. */
   public static final int HOLE = 0x001e;

   /** Pulls the graph (including {@link
    *  PositionLengthAttribute}) from the provided {@link
    *  TokenStream}, and creates the corresponding
    *  automaton where arcs are bytes (or Unicode code points
    *  if unicodeArcs = true) from each term. */
   public Automaton toAutomaton(TokenStream in) throws IOException {
     final Automaton.Builder builder = new Automaton.Builder();
     builder.createState();

     final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class);
     final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
     final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class);
     final OffsetAttribute offsetAtt = in.addAttribute(OffsetAttribute.class);

     in.reset();

     // Only temporarily holds states ahead of our current
     // position:

     final RollingBuffer<Position> positions = new Positions();

     int pos = -1;
     int freedPos = 0;
     Position posData = null;
     int maxOffset = 0;
     while (in.incrementToken()) {
       int posInc = posIncAtt.getPositionIncrement();
       if (preservePositionIncrements == false && posInc > 1) {
         posInc = 1;
       }
       assert pos > -1 || posInc > 0;

       if (posInc > 0) {

         // New node:
         pos += posInc;

         posData = positions.get(pos);
         assert posData.leaving == -1;

         if (posData.arriving == -1) {
           // No token ever arrived to this position
           if (pos == 0) {
             // OK: this is the first token
             posData.leaving = 0;
           } else {
             // This means there's a hole (eg, StopFilter
             // does this):
             posData.leaving = builder.createState();
             addHoles(builder, positions, pos);
           }
         } else {
           posData.leaving = builder.createState();
           builder.addTransition(posData.arriving, posData.leaving, POS_SEP);
           if (posInc > 1) {
             // A token spanned over a hole; add holes
             // "under" it:
             addHoles(builder, positions, pos);
           }
         }
         while (freedPos <= pos) {
           Position freePosData = positions.get(freedPos);
           // don't free this position yet if we may still need to fill holes over it:
           if (freePosData.arriving == -1 || freePosData.leaving == -1) {
             break;
           }
           positions.freeBefore(freedPos);
           freedPos++;
         }
       }

       final int endPos = pos + posLengthAtt.getPositionLength();

       final BytesRef termUTF8 = changeToken(termBytesAtt.getBytesRef());
       int[] termUnicode = null;
       final Position endPosData = positions.get(endPos);
       if (endPosData.arriving == -1) {
         endPosData.arriving = builder.createState();
       }

       int termLen;
       if (unicodeArcs) {
         final String utf16 = termUTF8.utf8ToString();
         termUnicode = new int[utf16.codePointCount(0, utf16.length())];
         termLen = termUnicode.length;
         for (int cp, i = 0, j = 0; i < utf16.length(); i += Character.charCount(cp)) {
           termUnicode[j++] = cp = utf16.codePointAt(i);
         }
       } else {
         termLen = termUTF8.length;
       }

       int state = posData.leaving;

       for(int byteIDX=0;byteIDX<termLen;byteIDX++) {
         final int nextState = byteIDX == termLen-1 ? endPosData.arriving : builder.createState();
         int c;
         if (unicodeArcs) {
           c = termUnicode[byteIDX];
         } else {
           c = termUTF8.bytes[termUTF8.offset + byteIDX] & 0xff;
         }
         builder.addTransition(state, nextState, c);
         state = nextState;
       }

       maxOffset = Math.max(maxOffset, offsetAtt.endOffset());
     }

     in.end();

     int endPosInc = posIncAtt.getPositionIncrement();
     if (endPosInc == 0 && finalOffsetGapAsHole && offsetAtt.endOffset() > maxOffset) {
       endPosInc = 1;
     } else if (endPosInc > 0 && preservePositionIncrements==false) {
       endPosInc = 0;
     }

     int endState;
     if (endPosInc > 0) {
       // there were hole(s) after the last token
       endState = builder.createState();

       // add trailing holes now:
       int lastState = endState;
       while (true) {
         int state1 = builder.createState();
         builder.addTransition(lastState, state1, HOLE);
         endPosInc--;
         if (endPosInc == 0) {
           builder.setAccept(state1, true);
           break;
         }
         int state2 = builder.createState();
         builder.addTransition(state1, state2, POS_SEP);
         lastState = state2;
       }
     } else {
       endState = -1;
     }

     pos++;
     while (pos <= positions.getMaxPos()) {
       posData = positions.get(pos);
       if (posData.arriving != -1) {
         if (endState != -1) {
           builder.addTransition(posData.arriving, endState, POS_SEP);
         } else {
           builder.setAccept(posData.arriving, true);
         }
       }
       pos++;
     }

     return builder.finish();
   }

   // for debugging!
   /*
   private static void toDot(Automaton a) throws IOException {
     final String s = a.toDot();
     Writer w = new OutputStreamWriter(new FileOutputStream("/tmp/out.dot"));
     w.write(s);
     w.close();
     System.out.println("TEST: saved to /tmp/out.dot");
   }
   */

   private static void addHoles(Automaton.Builder builder, RollingBuffer<Position> positions, int pos) {
     Position posData = positions.get(pos);
     Position prevPosData = positions.get(pos-1);

     while(posData.arriving == -1 || prevPosData.leaving == -1) {
       if (posData.arriving == -1) {
         posData.arriving = builder.createState();
         builder.addTransition(posData.arriving, posData.leaving, POS_SEP);
       }
       if (prevPosData.leaving == -1) {
         if (pos == 1) {
           prevPosData.leaving = 0;
         } else {
           prevPosData.leaving = builder.createState();
         }
         if (prevPosData.arriving != -1) {
           builder.addTransition(prevPosData.arriving, prevPosData.leaving, POS_SEP);
         }
       }
       builder.addTransition(prevPosData.leaving, posData.arriving, HOLE);
       pos--;
       if (pos <= 0) {
         break;
       }
       posData = prevPosData;
       prevPosData = positions.get(pos-1);
     }
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.analysis;


	import java.io.IOException;

	import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
	import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
	import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
	import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
	import org.apache.lucene.util.BytesRef;
	import org.apache.lucene.util.RollingBuffer;
	import org.apache.lucene.util.automaton.Automaton;

	// TODO: maybe also toFST? then we can translate atts into FST outputs/weights

	/** Consumes a TokenStream and creates an {@link Automaton}
	* where the transition labels are UTF8 bytes (or Unicode
	* code points if unicodeArcs is true) from the {@link
	* TermToBytesRefAttribute}. Between tokens we insert
	* POS_SEP and for holes we insert HOLE.
	*
	* @lucene.experimental */
	public class TokenStreamToAutomaton {

	private boolean preservePositionIncrements;
	private boolean finalOffsetGapAsHole;
	private boolean unicodeArcs;

	/** Sole constructor. */
	public TokenStreamToAutomaton() {
	this.preservePositionIncrements = true;
	}

	/** Whether to generate holes in the automaton for missing positions, <code>true</code> by default. */
	public void setPreservePositionIncrements(boolean enablePositionIncrements) {
	this.preservePositionIncrements = enablePositionIncrements;
	}

	/** If true, any final offset gaps will result in adding a position hole. */
	public void setFinalOffsetGapAsHole(boolean finalOffsetGapAsHole) {
	this.finalOffsetGapAsHole = finalOffsetGapAsHole;
	}

	/** Whether to make transition labels Unicode code points instead of UTF8 bytes,
	* <code>false</code> by default */
	public void setUnicodeArcs(boolean unicodeArcs) {
	this.unicodeArcs = unicodeArcs;
	}

	private static class Position implements RollingBuffer.Resettable {
	// Any tokens that ended at our position arrive to this state:
	int arriving = -1;

	// Any tokens that start at our position leave from this state:
	int leaving = -1;

	@Override
	public void reset() {
	arriving = -1;
	leaving = -1;
	}
	}

	private static class Positions extends RollingBuffer<Position> {
	@Override
	protected Position newInstance() {
	return new Position();
	}
	}

	/** Subclass and implement this if you need to change the
	* token (such as escaping certain bytes) before it's
	* turned into a graph. */
	protected BytesRef changeToken(BytesRef in) {
	return in;
	}

	/** We create transition between two adjacent tokens. */
	public static final int POS_SEP = 0x001f;

	/** We add this arc to represent a hole. */
	public static final int HOLE = 0x001e;

	/** Pulls the graph (including {@link
	* PositionLengthAttribute}) from the provided {@link
	* TokenStream}, and creates the corresponding
	* automaton where arcs are bytes (or Unicode code points
	* if unicodeArcs = true) from each term. */
	public Automaton toAutomaton(TokenStream in) throws IOException {
	final Automaton.Builder builder = new Automaton.Builder();
	builder.createState();

	final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class);
	final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
	final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class);
	final OffsetAttribute offsetAtt = in.addAttribute(OffsetAttribute.class);

	in.reset();

	// Only temporarily holds states ahead of our current
	// position:

	final RollingBuffer<Position> positions = new Positions();

	int pos = -1;
	int freedPos = 0;
	Position posData = null;
	int maxOffset = 0;
	while (in.incrementToken()) {
	int posInc = posIncAtt.getPositionIncrement();
	if (preservePositionIncrements == false && posInc > 1) {
	posInc = 1;
	}
	assert pos > -1 \|\| posInc > 0;

	if (posInc > 0) {

	// New node:
	pos += posInc;

	posData = positions.get(pos);
	assert posData.leaving == -1;

	if (posData.arriving == -1) {
	// No token ever arrived to this position
	if (pos == 0) {
	// OK: this is the first token
	posData.leaving = 0;
	} else {
	// This means there's a hole (eg, StopFilter
	// does this):
	posData.leaving = builder.createState();
	addHoles(builder, positions, pos);
	}
	} else {
	posData.leaving = builder.createState();
	builder.addTransition(posData.arriving, posData.leaving, POS_SEP);
	if (posInc > 1) {
	// A token spanned over a hole; add holes
	// "under" it:
	addHoles(builder, positions, pos);
	}
	}
	while (freedPos <= pos) {
	Position freePosData = positions.get(freedPos);
	// don't free this position yet if we may still need to fill holes over it:
	if (freePosData.arriving == -1 \|\| freePosData.leaving == -1) {
	break;
	}
	positions.freeBefore(freedPos);
	freedPos++;
	}
	}

	final int endPos = pos + posLengthAtt.getPositionLength();

	final BytesRef termUTF8 = changeToken(termBytesAtt.getBytesRef());
	int[] termUnicode = null;
	final Position endPosData = positions.get(endPos);
	if (endPosData.arriving == -1) {
	endPosData.arriving = builder.createState();
	}

	int termLen;
	if (unicodeArcs) {
	final String utf16 = termUTF8.utf8ToString();
	termUnicode = new int[utf16.codePointCount(0, utf16.length())];
	termLen = termUnicode.length;
	for (int cp, i = 0, j = 0; i < utf16.length(); i += Character.charCount(cp)) {
	termUnicode[j++] = cp = utf16.codePointAt(i);
	}
	} else {
	termLen = termUTF8.length;
	}

	int state = posData.leaving;

	for(int byteIDX=0;byteIDX<termLen;byteIDX++) {
	final int nextState = byteIDX == termLen-1 ? endPosData.arriving : builder.createState();
	int c;
	if (unicodeArcs) {
	c = termUnicode[byteIDX];
	} else {
	c = termUTF8.bytes[termUTF8.offset + byteIDX] & 0xff;
	}
	builder.addTransition(state, nextState, c);
	state = nextState;
	}

	maxOffset = Math.max(maxOffset, offsetAtt.endOffset());
	}

	in.end();

	int endPosInc = posIncAtt.getPositionIncrement();
	if (endPosInc == 0 && finalOffsetGapAsHole && offsetAtt.endOffset() > maxOffset) {
	endPosInc = 1;
	} else if (endPosInc > 0 && preservePositionIncrements==false) {
	endPosInc = 0;
	}

	int endState;
	if (endPosInc > 0) {
	// there were hole(s) after the last token
	endState = builder.createState();

	// add trailing holes now:
	int lastState = endState;
	while (true) {
	int state1 = builder.createState();
	builder.addTransition(lastState, state1, HOLE);
	endPosInc--;
	if (endPosInc == 0) {
	builder.setAccept(state1, true);
	break;
	}
	int state2 = builder.createState();
	builder.addTransition(state1, state2, POS_SEP);
	lastState = state2;
	}
	} else {
	endState = -1;
	}

	pos++;
	while (pos <= positions.getMaxPos()) {
	posData = positions.get(pos);
	if (posData.arriving != -1) {
	if (endState != -1) {
	builder.addTransition(posData.arriving, endState, POS_SEP);
	} else {
	builder.setAccept(posData.arriving, true);
	}
	}
	pos++;
	}

	return builder.finish();
	}

	// for debugging!
	/*
	private static void toDot(Automaton a) throws IOException {
	final String s = a.toDot();
	Writer w = new OutputStreamWriter(new FileOutputStream("/tmp/out.dot"));
	w.write(s);
	w.close();
	System.out.println("TEST: saved to /tmp/out.dot");
	}
	*/

	private static void addHoles(Automaton.Builder builder, RollingBuffer<Position> positions, int pos) {
	Position posData = positions.get(pos);
	Position prevPosData = positions.get(pos-1);

	while(posData.arriving == -1 \|\| prevPosData.leaving == -1) {
	if (posData.arriving == -1) {
	posData.arriving = builder.createState();
	builder.addTransition(posData.arriving, posData.leaving, POS_SEP);
	}
	if (prevPosData.leaving == -1) {
	if (pos == 1) {
	prevPosData.leaving = 0;
	} else {
	prevPosData.leaving = builder.createState();
	}
	if (prevPosData.arriving != -1) {
	builder.addTransition(prevPosData.arriving, prevPosData.leaving, POS_SEP);
	}
	}
	builder.addTransition(prevPosData.leaving, posData.arriving, HOLE);
	pos--;
	if (pos <= 0) {
	break;
	}
	posData = prevPosData;
	prevPosData = positions.get(pos-1);
	}
	}
	}