lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java - lucene-solr - Git at Google

 package org.apache.lucene.analysis;

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 import java.io.IOException;

 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.RollingBuffer;
 import org.apache.lucene.util.automaton.Automaton;
 import org.apache.lucene.util.automaton.State;
 import org.apache.lucene.util.automaton.Transition;

 // TODO: maybe also toFST?  then we can translate atts into FST outputs/weights

 /** Consumes a TokenStream and creates an {@link Automaton}
  *  where the transition labels are UTF8 bytes (or Unicode
  *  code points if unicodeArcs is true) from the {@link
  *  TermToBytesRefAttribute}.  Between tokens we insert
  *  POS_SEP and for holes we insert HOLE.
  *
  * @lucene.experimental */
 public class TokenStreamToAutomaton {

   private boolean preservePositionIncrements;
   private boolean unicodeArcs;

   /** Sole constructor. */
   public TokenStreamToAutomaton() {
     this.preservePositionIncrements = true;
   }

   /** Whether to generate holes in the automaton for missing positions, <code>true</code> by default. */
   public void setPreservePositionIncrements(boolean enablePositionIncrements) {
     this.preservePositionIncrements = enablePositionIncrements;
   }

   /** Whether to make transition labels Unicode code points instead of UTF8 bytes,
    *  <code>false</code> by default */
   public void setUnicodeArcs(boolean unicodeArcs) {
     this.unicodeArcs = unicodeArcs;
   }

   private static class Position implements RollingBuffer.Resettable {
     // Any tokens that ended at our position arrive to this state:
     State arriving;

     // Any tokens that start at our position leave from this state:
     State leaving;

     @Override
     public void reset() {
       arriving = null;
       leaving = null;
     }
   }

   private static class Positions extends RollingBuffer<Position> {
     @Override
     protected Position newInstance() {
       return new Position();
     }
   }

   /** Subclass & implement this if you need to change the
    *  token (such as escaping certain bytes) before it's
    *  turned into a graph. */
   protected BytesRef changeToken(BytesRef in) {
     return in;
   }

   /** We create transition between two adjacent tokens. */
   public static final int POS_SEP = 0x001f;

   /** We add this arc to represent a hole. */
   public static final int HOLE = 0x001e;

   /** Pulls the graph (including {@link
    *  PositionLengthAttribute}) from the provided {@link
    *  TokenStream}, and creates the corresponding
    *  automaton where arcs are bytes (or Unicode code points
    *  if unicodeArcs = true) from each term. */
   public Automaton toAutomaton(TokenStream in) throws IOException {
     final Automaton a = new Automaton();
     boolean deterministic = true;

     final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class);
     final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
     final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class);
     final OffsetAttribute offsetAtt = in.addAttribute(OffsetAttribute.class);

     final BytesRef term = termBytesAtt.getBytesRef();

     in.reset();

     // Only temporarily holds states ahead of our current
     // position:

     final RollingBuffer<Position> positions = new Positions();

     int pos = -1;
     Position posData = null;
     int maxOffset = 0;
     while (in.incrementToken()) {
       int posInc = posIncAtt.getPositionIncrement();
       if (!preservePositionIncrements && posInc > 1) {
         posInc = 1;
       }
       assert pos > -1 || posInc > 0;

       if (posInc > 0) {

         // New node:
         pos += posInc;

         posData = positions.get(pos);
         assert posData.leaving == null;

         if (posData.arriving == null) {
           // No token ever arrived to this position
           if (pos == 0) {
             // OK: this is the first token
             posData.leaving = a.getInitialState();
           } else {
             // This means there's a hole (eg, StopFilter
             // does this):
             posData.leaving = new State();
             addHoles(a.getInitialState(), positions, pos);
           }
         } else {
           posData.leaving = new State();
           posData.arriving.addTransition(new Transition(POS_SEP, posData.leaving));
           if (posInc > 1) {
             // A token spanned over a hole; add holes
             // "under" it:
             addHoles(a.getInitialState(), positions, pos);
           }
         }
         positions.freeBefore(pos);
       } else {
         // note: this isn't necessarily true. its just that we aren't surely det.
         // we could optimize this further (e.g. buffer and sort synonyms at a position)
         // but thats probably overkill. this is cheap and dirty
         deterministic = false;
       }

       final int endPos = pos + posLengthAtt.getPositionLength();

       termBytesAtt.fillBytesRef();
       final BytesRef termUTF8 = changeToken(term);
       int[] termUnicode = null;
       final Position endPosData = positions.get(endPos);
       if (endPosData.arriving == null) {
         endPosData.arriving = new State();
       }

       State state = posData.leaving;
       int termLen;
       if (unicodeArcs) {
         final String utf16 = termUTF8.utf8ToString();
         termUnicode = new int[utf16.codePointCount(0, utf16.length())];
         termLen = termUnicode.length;
         for (int cp, i = 0, j = 0; i < utf16.length(); i += Character.charCount(cp))
           termUnicode[j++] = cp = utf16.codePointAt(i);
       } else {
         termLen = termUTF8.length;
       }

       for(int byteIDX=0;byteIDX<termLen;byteIDX++) {
         final State nextState = byteIDX == termLen-1 ? endPosData.arriving : new State();
         int c;
         if (unicodeArcs) {
           c = termUnicode[byteIDX];
         } else {
           c = termUTF8.bytes[termUTF8.offset + byteIDX] & 0xff;
         }
         state.addTransition(new Transition(c, nextState));
         state = nextState;
       }

       maxOffset = Math.max(maxOffset, offsetAtt.endOffset());
     }

     in.end();
     State endState = null;
     if (offsetAtt.endOffset() > maxOffset) {
       endState = new State();
       endState.setAccept(true);
     }

     pos++;
     while (pos <= positions.getMaxPos()) {
       posData = positions.get(pos);
       if (posData.arriving != null) {
         if (endState != null) {
           posData.arriving.addTransition(new Transition(POS_SEP, endState));
         } else {
           posData.arriving.setAccept(true);
         }
       }
       pos++;
     }

     //toDot(a);
     a.setDeterministic(deterministic);
     return a;
   }

   // for debugging!
   /*
   private static void toDot(Automaton a) throws IOException {
     final String s = a.toDot();
     Writer w = new OutputStreamWriter(new FileOutputStream("/tmp/out.dot"));
     w.write(s);
     w.close();
     System.out.println("TEST: saved to /tmp/out.dot");
   }
   */

   private static void addHoles(State startState, RollingBuffer<Position> positions, int pos) {
     Position posData = positions.get(pos);
     Position prevPosData = positions.get(pos-1);

     while(posData.arriving == null || prevPosData.leaving == null) {
       if (posData.arriving == null) {
         posData.arriving = new State();
         posData.arriving.addTransition(new Transition(POS_SEP, posData.leaving));
       }
       if (prevPosData.leaving == null) {
         if (pos == 1) {
           prevPosData.leaving = startState;
         } else {
           prevPosData.leaving = new State();
         }
         if (prevPosData.arriving != null) {
           prevPosData.arriving.addTransition(new Transition(POS_SEP, prevPosData.leaving));
         }
       }
       prevPosData.leaving.addTransition(new Transition(HOLE, posData.arriving));
       pos--;
       if (pos <= 0) {
         break;
       }
       posData = prevPosData;
       prevPosData = positions.get(pos-1);
     }
   }
 }
	package org.apache.lucene.analysis;

	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	import java.io.IOException;

	import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
	import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
	import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
	import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
	import org.apache.lucene.util.BytesRef;
	import org.apache.lucene.util.RollingBuffer;
	import org.apache.lucene.util.automaton.Automaton;
	import org.apache.lucene.util.automaton.State;
	import org.apache.lucene.util.automaton.Transition;

	// TODO: maybe also toFST? then we can translate atts into FST outputs/weights

	/** Consumes a TokenStream and creates an {@link Automaton}
	* where the transition labels are UTF8 bytes (or Unicode
	* code points if unicodeArcs is true) from the {@link
	* TermToBytesRefAttribute}. Between tokens we insert
	* POS_SEP and for holes we insert HOLE.
	*
	* @lucene.experimental */
	public class TokenStreamToAutomaton {

	private boolean preservePositionIncrements;
	private boolean unicodeArcs;

	/** Sole constructor. */
	public TokenStreamToAutomaton() {
	this.preservePositionIncrements = true;
	}

	/** Whether to generate holes in the automaton for missing positions, <code>true</code> by default. */
	public void setPreservePositionIncrements(boolean enablePositionIncrements) {
	this.preservePositionIncrements = enablePositionIncrements;
	}

	/** Whether to make transition labels Unicode code points instead of UTF8 bytes,
	* <code>false</code> by default */
	public void setUnicodeArcs(boolean unicodeArcs) {
	this.unicodeArcs = unicodeArcs;
	}

	private static class Position implements RollingBuffer.Resettable {
	// Any tokens that ended at our position arrive to this state:
	State arriving;

	// Any tokens that start at our position leave from this state:
	State leaving;

	@Override
	public void reset() {
	arriving = null;
	leaving = null;
	}
	}

	private static class Positions extends RollingBuffer<Position> {
	@Override
	protected Position newInstance() {
	return new Position();
	}
	}

	/** Subclass & implement this if you need to change the
	* token (such as escaping certain bytes) before it's
	* turned into a graph. */
	protected BytesRef changeToken(BytesRef in) {
	return in;
	}

	/** We create transition between two adjacent tokens. */
	public static final int POS_SEP = 0x001f;

	/** We add this arc to represent a hole. */
	public static final int HOLE = 0x001e;

	/** Pulls the graph (including {@link
	* PositionLengthAttribute}) from the provided {@link
	* TokenStream}, and creates the corresponding
	* automaton where arcs are bytes (or Unicode code points
	* if unicodeArcs = true) from each term. */
	public Automaton toAutomaton(TokenStream in) throws IOException {
	final Automaton a = new Automaton();
	boolean deterministic = true;

	final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class);
	final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
	final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class);
	final OffsetAttribute offsetAtt = in.addAttribute(OffsetAttribute.class);

	final BytesRef term = termBytesAtt.getBytesRef();

	in.reset();

	// Only temporarily holds states ahead of our current
	// position:

	final RollingBuffer<Position> positions = new Positions();

	int pos = -1;
	Position posData = null;
	int maxOffset = 0;
	while (in.incrementToken()) {
	int posInc = posIncAtt.getPositionIncrement();
	if (!preservePositionIncrements && posInc > 1) {
	posInc = 1;
	}
	assert pos > -1 \|\| posInc > 0;

	if (posInc > 0) {

	// New node:
	pos += posInc;

	posData = positions.get(pos);
	assert posData.leaving == null;

	if (posData.arriving == null) {
	// No token ever arrived to this position
	if (pos == 0) {
	// OK: this is the first token
	posData.leaving = a.getInitialState();
	} else {
	// This means there's a hole (eg, StopFilter
	// does this):
	posData.leaving = new State();
	addHoles(a.getInitialState(), positions, pos);
	}
	} else {
	posData.leaving = new State();
	posData.arriving.addTransition(new Transition(POS_SEP, posData.leaving));
	if (posInc > 1) {
	// A token spanned over a hole; add holes
	// "under" it:
	addHoles(a.getInitialState(), positions, pos);
	}
	}
	positions.freeBefore(pos);
	} else {
	// note: this isn't necessarily true. its just that we aren't surely det.
	// we could optimize this further (e.g. buffer and sort synonyms at a position)
	// but thats probably overkill. this is cheap and dirty
	deterministic = false;
	}

	final int endPos = pos + posLengthAtt.getPositionLength();

	termBytesAtt.fillBytesRef();
	final BytesRef termUTF8 = changeToken(term);
	int[] termUnicode = null;
	final Position endPosData = positions.get(endPos);
	if (endPosData.arriving == null) {
	endPosData.arriving = new State();
	}

	State state = posData.leaving;
	int termLen;
	if (unicodeArcs) {
	final String utf16 = termUTF8.utf8ToString();
	termUnicode = new int[utf16.codePointCount(0, utf16.length())];
	termLen = termUnicode.length;
	for (int cp, i = 0, j = 0; i < utf16.length(); i += Character.charCount(cp))
	termUnicode[j++] = cp = utf16.codePointAt(i);
	} else {
	termLen = termUTF8.length;
	}

	for(int byteIDX=0;byteIDX<termLen;byteIDX++) {
	final State nextState = byteIDX == termLen-1 ? endPosData.arriving : new State();
	int c;
	if (unicodeArcs) {
	c = termUnicode[byteIDX];
	} else {
	c = termUTF8.bytes[termUTF8.offset + byteIDX] & 0xff;
	}
	state.addTransition(new Transition(c, nextState));
	state = nextState;
	}

	maxOffset = Math.max(maxOffset, offsetAtt.endOffset());
	}

	in.end();
	State endState = null;
	if (offsetAtt.endOffset() > maxOffset) {
	endState = new State();
	endState.setAccept(true);
	}

	pos++;
	while (pos <= positions.getMaxPos()) {
	posData = positions.get(pos);
	if (posData.arriving != null) {
	if (endState != null) {
	posData.arriving.addTransition(new Transition(POS_SEP, endState));
	} else {
	posData.arriving.setAccept(true);
	}
	}
	pos++;
	}

	//toDot(a);
	a.setDeterministic(deterministic);
	return a;
	}

	// for debugging!
	/*
	private static void toDot(Automaton a) throws IOException {
	final String s = a.toDot();
	Writer w = new OutputStreamWriter(new FileOutputStream("/tmp/out.dot"));
	w.write(s);
	w.close();
	System.out.println("TEST: saved to /tmp/out.dot");
	}
	*/

	private static void addHoles(State startState, RollingBuffer<Position> positions, int pos) {
	Position posData = positions.get(pos);
	Position prevPosData = positions.get(pos-1);

	while(posData.arriving == null \|\| prevPosData.leaving == null) {
	if (posData.arriving == null) {
	posData.arriving = new State();
	posData.arriving.addTransition(new Transition(POS_SEP, posData.leaving));
	}
	if (prevPosData.leaving == null) {
	if (pos == 1) {
	prevPosData.leaving = startState;
	} else {
	prevPosData.leaving = new State();
	}
	if (prevPosData.arriving != null) {
	prevPosData.arriving.addTransition(new Transition(POS_SEP, prevPosData.leaving));
	}
	}
	prevPosData.leaving.addTransition(new Transition(HOLE, posData.arriving));
	pos--;
	if (pos <= 0) {
	break;
	}
	posData = prevPosData;
	prevPosData = positions.get(pos-1);
	}
	}
	}