blob: edb5ff2362864ad0864fc436840b61bb463792a1 [file] [log] [blame]
Index: lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java
===================================================================
--- lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java (revision 0)
+++ lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java (working copy)
@@ -0,0 +1,148 @@
+package org.apache.lucene.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
+import java.io.StringWriter;
+import java.io.Writer;
+
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.BasicAutomata;
+import org.apache.lucene.util.automaton.BasicOperations;
+
+public class TestGraphTokenizers extends LuceneTestCase {
+
+  // Shorthand: token with zero start/end offsets.
+  private static Token token(String term, int posInc, int posLength) {
+    return token(term, posInc, posLength, 0, 0);
+  }
+
+  private static Token token(String term, int posInc, int posLength, int startOffset, int endOffset) {
+    final Token t = new Token(term, startOffset, endOffset);
+    t.setPositionIncrement(posInc);
+    t.setPositionLength(posLength);
+    return t;
+  }
+
+  public void testSingleToken() throws Exception {
+
+    final TokenStream ts = new CannedTokenStream(
+      new Token[] {
+        token("abc", 1, 1),
+      });
+    final Automaton actual = TokenStreamToAutomaton.toAutomaton(ts);
+    final Automaton expected = BasicAutomata.makeString("abc");
+    assertTrue(BasicOperations.sameLanguage(expected, actual));
+  }
+
+  // for debugging!
+  private static void toDot(Automaton a) throws IOException {
+    final String s = a.toDot();
+    final Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp3/out.dot"), "UTF-8");
+    try {
+      w.write(s);
+    } finally {
+      // Always release the file handle, even if the write fails.
+      w.close();
+    }
+  }
+
+  private static final Automaton POS_SEP = BasicAutomata.makeCharRange(TokenStreamToAutomaton.POS_SEP,
+                                                                      TokenStreamToAutomaton.POS_SEP);
+  public void testTwoTokens() throws Exception {
+
+    final TokenStream ts = new CannedTokenStream(
+      new Token[] {
+        token("abc", 1, 1),
+        token("def", 1, 1),
+      });
+    final Automaton actual = TokenStreamToAutomaton.toAutomaton(ts);
+    final Automaton a1 = BasicAutomata.makeString("abc");
+    final Automaton a2 = BasicAutomata.makeString("def");
+    final Automaton expected = BasicOperations.concatenate(a1, POS_SEP, a2);
+
+    //toDot(actual);
+    assertTrue(BasicOperations.sameLanguage(expected, actual));
+  }
+
+  public void testOverlappedTokensSausage() throws Exception {
+
+    // Two tokens on top of each other (sausage):
+    final TokenStream ts = new CannedTokenStream(
+      new Token[] {
+        token("abc", 1, 1),
+        token("xyz", 0, 1)
+      });
+    final Automaton actual = TokenStreamToAutomaton.toAutomaton(ts);
+    final Automaton a1 = BasicAutomata.makeString("abc");
+    final Automaton a2 = BasicAutomata.makeString("xyz");
+    final Automaton expected = BasicOperations.union(a1, a2);
+    assertTrue(BasicOperations.sameLanguage(expected, actual));
+  }
+
+  public void testOverlappedTokensLattice() throws Exception {
+
+    final TokenStream ts = new CannedTokenStream(
+      new Token[] {
+        token("abc", 1, 1),
+        token("xyz", 0, 2),
+        token("def", 1, 1),
+      });
+    final Automaton actual = TokenStreamToAutomaton.toAutomaton(ts);
+    final Automaton a1 = BasicAutomata.makeString("xyz");
+    final Automaton a2 = BasicOperations.concatenate(BasicAutomata.makeString("abc"),
+                                                     POS_SEP,
+                                                     BasicAutomata.makeString("def"));
+    final Automaton expected = BasicOperations.union(a1, a2);
+    //toDot(actual);
+    assertTrue(BasicOperations.sameLanguage(expected, actual));
+  }
+
+  public void testOverlappedTokensLattice2() throws Exception {
+
+    final TokenStream ts = new CannedTokenStream(
+      new Token[] {
+        token("abc", 1, 1),
+        token("xyz", 0, 3),
+        token("def", 1, 1),
+        token("ghi", 1, 1),
+      });
+    final Automaton actual = TokenStreamToAutomaton.toAutomaton(ts);
+    final Automaton a1 = BasicAutomata.makeString("xyz");
+    final Automaton a2 = BasicOperations.concatenate(BasicAutomata.makeString("abc"),
+                                                     POS_SEP,
+                                                     BasicAutomata.makeString("def"),
+                                                     POS_SEP,
+                                                     BasicAutomata.makeString("ghi"));
+    final Automaton expected = BasicOperations.union(a1, a2);
+    //toDot(actual);
+    assertTrue(BasicOperations.sameLanguage(expected, actual));
+  }
+
+  public void testToDot() throws Exception {
+    final TokenStream ts = new CannedTokenStream(new Token[] {token("abc", 1, 1, 0, 4)});
+    StringWriter w = new StringWriter();
+    new TokenStreamToDot("abcd", ts, new PrintWriter(w)).toDot();
+    assertTrue(w.toString().indexOf("abc / abcd") != -1);
+  }
+}
+
Property changes on: lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
Index: lucene/core/src/java/org/apache/lucene/analysis/Token.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/analysis/Token.java (revision 1296808)
+++ lucene/core/src/java/org/apache/lucene/analysis/Token.java (working copy)
@@ -22,6 +22,7 @@
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.index.Payload;
import org.apache.lucene.index.DocsAndPositionsEnum; // for javadoc
@@ -121,13 +122,15 @@
*/
public class Token extends CharTermAttributeImpl
implements TypeAttribute, PositionIncrementAttribute,
- FlagsAttribute, OffsetAttribute, PayloadAttribute {
+ FlagsAttribute, OffsetAttribute, PayloadAttribute, PositionLengthAttribute {
private int startOffset,endOffset;
private String type = DEFAULT_TYPE;
private int flags;
private Payload payload;
private int positionIncrement = 1;
+ // nocommit should we NOT do this...?
+ private int positionLength = 1;
/** Constructs a Token will null text. */
public Token() {
@@ -270,6 +273,18 @@
return positionIncrement;
}
+ // nocommit jdocs
+ @Override
+ public void setPositionLength(int positionLength) {
+ this.positionLength = positionLength;
+ }
+
+ // nocommit jdocs
+ @Override
+ public int getPositionLength() {
+ return positionLength;
+ }
+
/** Returns this Token's starting offset, the position of the first character
corresponding to this token in the source text.
Index: lucene/core/src/java/org/apache/lucene/util/automaton/BasicOperations.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/util/automaton/BasicOperations.java (revision 1296808)
+++ lucene/core/src/java/org/apache/lucene/util/automaton/BasicOperations.java (working copy)
@@ -90,8 +90,8 @@
* <p>
* Complexity: linear in total number of states.
*/
- static public Automaton concatenate(List<Automaton> l) {
- if (l.isEmpty()) return BasicAutomata.makeEmptyString();
+ static public Automaton concatenate(Automaton... l) {
+ if (l.length == 0) return BasicAutomata.makeEmptyString();
boolean all_singleton = true;
for (Automaton a : l)
if (!a.isSingleton()) {
@@ -109,8 +109,8 @@
Set<Integer> ids = new HashSet<Integer>();
for (Automaton a : l)
ids.add(System.identityHashCode(a));
- boolean has_aliases = ids.size() != l.size();
- Automaton b = l.get(0);
+ boolean has_aliases = ids.size() != l.length;
+ Automaton b = l[0];
if (has_aliases) b = b.cloneExpanded();
else b = b.cloneExpandedIfRequired();
Set<State> ac = b.getAcceptStates();
@@ -191,7 +191,7 @@
while (min-- > 0)
as.add(a);
as.add(repeat(a));
- return concatenate(as);
+ return concatenate(as.toArray(new Automaton[as.size()]));
}
/**
@@ -213,7 +213,7 @@
List<Automaton> as = new ArrayList<Automaton>();
while (min-- > 0)
as.add(a);
- b = concatenate(as);
+ b = concatenate(as.toArray(new Automaton[as.size()]));
}
if (max > 0) {
Automaton d = a.clone();
Index: lucene/test-framework/src/java/org/apache/lucene/analysis/CannedTokenStream.java
===================================================================
--- lucene/test-framework/src/java/org/apache/lucene/analysis/CannedTokenStream.java (revision 1296808)
+++ lucene/test-framework/src/java/org/apache/lucene/analysis/CannedTokenStream.java (working copy)
@@ -22,6 +22,7 @@
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
/**
* emits a canned set of {@link Token}
@@ -31,6 +32,7 @@
private int upto = 0;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+ private final PositionLengthAttribute posLengthAtt = addAttribute(PositionLengthAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
public CannedTokenStream(Token[] tokens) {
@@ -47,6 +49,7 @@
termAtt.setEmpty();
termAtt.append(token.toString());
posIncrAtt.setPositionIncrement(token.getPositionIncrement());
+ posLengthAtt.setPositionLength(token.getPositionLength());
offsetAtt.setOffset(token.startOffset(), token.endOffset());
return true;
} else {
Index: lucene/test-framework/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java
===================================================================
--- lucene/test-framework/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java (revision 0)
+++ lucene/test-framework/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java (working copy)
@@ -0,0 +1,134 @@
+package org.apache.lucene.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.State;
+import org.apache.lucene.util.automaton.Transition;
+
+/** Consumes a TokenStream and creates an {@link Automaton}. */
+public class TokenStreamToAutomaton {
+
+  // nocommit: what bytes to steal!
+
+  // We create transition w/ this label when posInc is 1:
+  public static final int POS_SEP = 0;
+
+  // nocommit move to oal.util.automaton?
+  // nocommit: toFST?  then we can translate atts into FST weights
+
+  /** Pulls the graph (including {@link
+   *  PositionLengthAttribute}) from the provided {@link
+   *  TokenStream}, and creates the corresponding
+   *  automaton where arcs are bytes from each term. */
+  public static Automaton toAutomaton(TokenStream in) throws IOException {
+    final Automaton a = new Automaton();
+
+    final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class);
+    final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
+    final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class);
+    final BytesRef term = termBytesAtt.getBytesRef();
+
+    in.reset();
+
+    // Only temporarily holds states ahead of our current
+    // position:
+    // nocommit maybe linked list...?
+    final Map<Integer,State> posToState = new HashMap<Integer,State>();
+
+    State currentFromState = null;
+    int pos = -1;
+    while (in.incrementToken()) {
+      int posInc = posIncAtt.getPositionIncrement();
+      if (currentFromState == null && posInc == 0) {
+        // TODO: hmm are TS's still allowed to do this...?
+        posInc = 1;
+      }
+
+      if (posInc > 0) {
+        // New node:
+        pos += posInc;
+        final State nextFromState;
+        final State lastEndState = posToState.get(pos);
+        if (lastEndState == null) {
+          // nocommit invalid assert!!  if a syn matched
+          // over what is now a hole this assert falsely
+          // trips... make test!!
+          assert currentFromState == null;
+          nextFromState = a.getInitialState();
+        } else {
+          nextFromState = new State();
+          posToState.remove(pos);
+          // nocommit if posInc > 1 what to do...?  multiple SEP?
+          lastEndState.addTransition(new Transition(POS_SEP, nextFromState));
+        }
+        currentFromState = nextFromState;
+      }
+
+      // nocommit: make test for this:
+
+      // nocommit does posLengthAtt make it possible to
+      // create broken graph?  ie what if posInc skips over
+      // the node created by a previous posLengthAtt!?
+      // hrm. actually: we must handle this case!  it means
+      // eg a syn matched a stop word but then stop word was
+      // deleted...
+
+      final int endPos = pos + posLengthAtt.getPositionLength();
+
+      termBytesAtt.fillBytesRef();
+      State endState = posToState.get(endPos);
+      if (endState == null) {
+        endState = new State();
+        posToState.put(endPos, endState);
+      }
+
+      State lastState = currentFromState;
+
+      for(int byteIDX=0;byteIDX<term.length;byteIDX++) {
+        final State nextState;
+        if (byteIDX == term.length-1) {
+          nextState = endState;
+        } else {
+          nextState = new State();
+        }
+
+        lastState.addTransition(new Transition(term.bytes[term.offset + byteIDX] & 0xff, nextState));
+        lastState = nextState;
+      }
+    }
+
+    // Done consuming; notify the stream per the TokenStream consumer contract.
+    in.end();
+
+    // nocommit is this... right?
+    for(State endState : posToState.values()) {
+      endState.setAccept(true);
+    }
+
+    return a;
+  }
+}
Property changes on: lucene/test-framework/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native