| Index: lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java |
| =================================================================== |
| --- lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java (revision 0) |
| +++ lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java (working copy) |
| @@ -0,0 +1,146 @@ |
| +package org.apache.lucene.analysis; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import java.io.FileOutputStream; |
| +import java.io.IOException; |
| +import java.io.OutputStreamWriter; |
| +import java.io.PrintWriter; |
| +import java.io.StringWriter; |
| +import java.io.Writer; |
| + |
| +import org.apache.lucene.util.LuceneTestCase; |
| +import org.apache.lucene.util.automaton.Automaton; |
| +import org.apache.lucene.util.automaton.BasicAutomata; |
| +import org.apache.lucene.util.automaton.BasicOperations; |
| + |
| +public class TestGraphTokenizers extends LuceneTestCase { |
| + |
| + /** Builds a Token for {@code term} with both offsets 0 and the given position increment and length. */ |
| + private static Token token(String term, int posInc, int posLength) { |
| + return token(term, posInc, posLength, 0, 0); |
| + } |
| + |
| + /** Builds a Token for {@code term} with the given offsets, position increment and position length. */ |
| + private static Token token(String term, int posInc, int posLength, int startOffset, int endOffset) { |
| + final Token tok = new Token(term, startOffset, endOffset); |
| + // This constructor takes only the term and offsets; the position attributes are set afterwards: |
| + tok.setPositionLength(posLength); |
| + tok.setPositionIncrement(posInc); |
| + return tok; |
| + } |
| + |
| + /** A lone token must yield an automaton whose language is exactly that term. */ |
| + public void testSingleToken() throws Exception { |
| + final Token[] tokens = new Token[] {token("abc", 1, 1)}; |
| + final TokenStream stream = new CannedTokenStream(tokens); |
| + final Automaton built = TokenStreamToAutomaton.toAutomaton(stream); |
| + |
| + // The expected language is just the single string "abc": |
| + final Automaton wanted = BasicAutomata.makeString("abc"); |
| + assertTrue(BasicOperations.sameLanguage(wanted, built)); |
| + } |
| + |
| + // For debugging only: writes the automaton's dot rendering, as UTF-8, to a fixed local file. |
| + private static void toDot(Automaton a) throws IOException { |
| + final Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp3/out.dot"), "UTF-8"); |
| + try { |
| + w.write(a.toDot()); |
| + } finally { w.close(); } // close even when write() throws, so the file handle is not leaked |
| + } |
| + |
| + private static final Automaton POS_SEP = BasicAutomata.makeCharRange(TokenStreamToAutomaton.POS_SEP, |
| + TokenStreamToAutomaton.POS_SEP); // the separator label is passed as both bounds of the range |
| + public void testTwoTokens() throws Exception { |
| + // Two consecutive tokens, each advancing one position: |
| + final Token[] tokens = new Token[] { |
| + token("abc", 1, 1), |
| + token("def", 1, 1), |
| + }; |
| + final TokenStream stream = new CannedTokenStream(tokens); |
| + final Automaton built = TokenStreamToAutomaton.toAutomaton(stream); |
| + |
| + // Expected language: "abc", then the position separator, then "def". |
| + final Automaton wanted = BasicOperations.concatenate( |
| + BasicAutomata.makeString("abc"), POS_SEP, BasicAutomata.makeString("def")); |
| + //toDot(built); |
| + assertTrue(BasicOperations.sameLanguage(wanted, built)); |
| + } |
| + |
| + public void testOverlappedTokensSausage() throws Exception { |
| + // "xyz" has posInc=0, so it is stacked on the same position as "abc" (sausage): |
| + final Token[] stacked = new Token[] { |
| + token("abc", 1, 1), |
| + token("xyz", 0, 1) |
| + }; |
| + final TokenStream stream = new CannedTokenStream(stacked); |
| + final Automaton built = TokenStreamToAutomaton.toAutomaton(stream); |
| + |
| + // Expected language: either term on its own, with no separator involved. |
| + final Automaton wanted = BasicOperations.union( |
| + BasicAutomata.makeString("abc"), BasicAutomata.makeString("xyz")); |
| + assertTrue(BasicOperations.sameLanguage(wanted, built)); |
| + } |
| + |
| + public void testOverlappedTokensLattice() throws Exception { |
| + // "xyz" starts at the same position as "abc" (posInc=0) but has posLength=2: |
| + final Token[] lattice = new Token[] { |
| + token("abc", 1, 1), |
| + token("xyz", 0, 2), |
| + token("def", 1, 1), |
| + }; |
| + final TokenStream stream = new CannedTokenStream(lattice); |
| + final Automaton built = TokenStreamToAutomaton.toAutomaton(stream); |
| + |
| + // Expected paths: "xyz" alone, or "abc" + separator + "def". |
| + final Automaton spanning = BasicAutomata.makeString("xyz"); |
| + final Automaton stepwise = BasicOperations.concatenate(BasicAutomata.makeString("abc"), POS_SEP, BasicAutomata.makeString("def")); |
| + final Automaton wanted = BasicOperations.union(spanning, stepwise); |
| + //toDot(built); |
| + assertTrue(BasicOperations.sameLanguage(wanted, built)); |
| + } |
| + |
| + public void testOverlappedTokensLattice2() throws Exception { |
| + // "xyz" starts at the same position as "abc" (posInc=0) but has posLength=3: |
| + final Token[] lattice = new Token[] { |
| + token("abc", 1, 1), |
| + token("xyz", 0, 3), |
| + token("def", 1, 1), |
| + token("ghi", 1, 1), |
| + }; |
| + final TokenStream stream = new CannedTokenStream(lattice); |
| + final Automaton built = TokenStreamToAutomaton.toAutomaton(stream); |
| + // Expected paths: "xyz" alone, or "abc" + separator + "def" + separator + "ghi". |
| + final Automaton spanning = BasicAutomata.makeString("xyz"); |
| + final Automaton stepwise = BasicOperations.concatenate( |
| + BasicAutomata.makeString("abc"), POS_SEP, |
| + BasicAutomata.makeString("def"), POS_SEP, |
| + BasicAutomata.makeString("ghi")); |
| + final Automaton wanted = BasicOperations.union(spanning, stepwise); |
| + //toDot(built); |
| + assertTrue(BasicOperations.sameLanguage(wanted, built)); |
| + } |
| + |
| + public void testToDot() throws Exception { |
| + final StringWriter dot = new StringWriter(); |
| + final TokenStream stream = new CannedTokenStream(new Token[] {token("abc", 1, 1, 0, 4)}); |
| + new TokenStreamToDot("abcd", stream, new PrintWriter(dot)).toDot(); |
| + assertTrue(dot.toString().contains("abc / abcd")); // the label must appear somewhere in the dot output |
| + } |
| +} |
| + |
| |
| Property changes on: lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java |
| ___________________________________________________________________ |
| Added: svn:eol-style |
| ## -0,0 +1 ## |
| +native |
| Index: lucene/core/src/java/org/apache/lucene/analysis/Token.java |
| =================================================================== |
| --- lucene/core/src/java/org/apache/lucene/analysis/Token.java (revision 1296808) |
| +++ lucene/core/src/java/org/apache/lucene/analysis/Token.java (working copy) |
| @@ -22,6 +22,7 @@ |
| import org.apache.lucene.analysis.tokenattributes.FlagsAttribute; |
| import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; |
| import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; |
| +import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; |
| import org.apache.lucene.analysis.tokenattributes.TypeAttribute; |
| import org.apache.lucene.index.Payload; |
| import org.apache.lucene.index.DocsAndPositionsEnum; // for javadoc |
| @@ -121,13 +122,15 @@ |
| */ |
| public class Token extends CharTermAttributeImpl |
| implements TypeAttribute, PositionIncrementAttribute, |
| - FlagsAttribute, OffsetAttribute, PayloadAttribute { |
| + FlagsAttribute, OffsetAttribute, PayloadAttribute, PositionLengthAttribute { |
| |
| private int startOffset,endOffset; |
| private String type = DEFAULT_TYPE; |
| private int flags; |
| private Payload payload; |
| private int positionIncrement = 1; |
| + // nocommit should we NOT do this...? |
| + private int positionLength = 1; |
| |
| /** Constructs a Token will null text. */ |
| public Token() { |
| @@ -270,6 +273,18 @@ |
| return positionIncrement; |
| } |
| |
| + /** Sets the position length stored on this Token (the value returned by {@link #getPositionLength}). */ |
| + @Override |
| + public void setPositionLength(int positionLength) { |
| + this.positionLength = positionLength; |
| + } |
| + |
| + /** Returns this Token's position length; the backing {@code positionLength} field is initialized to 1. */ |
| + @Override |
| + public int getPositionLength() { |
| + return positionLength; |
| + } |
| + |
| /** Returns this Token's starting offset, the position of the first character |
| corresponding to this token in the source text. |
| |
| Index: lucene/core/src/java/org/apache/lucene/util/automaton/BasicOperations.java |
| =================================================================== |
| --- lucene/core/src/java/org/apache/lucene/util/automaton/BasicOperations.java (revision 1296808) |
| +++ lucene/core/src/java/org/apache/lucene/util/automaton/BasicOperations.java (working copy) |
| @@ -90,8 +90,8 @@ |
| * <p> |
| * Complexity: linear in total number of states. |
| */ |
| - static public Automaton concatenate(List<Automaton> l) { |
| - if (l.isEmpty()) return BasicAutomata.makeEmptyString(); |
| + static public Automaton concatenate(Automaton... l) { |
| + if (l.length == 0) return BasicAutomata.makeEmptyString(); |
| boolean all_singleton = true; |
| for (Automaton a : l) |
| if (!a.isSingleton()) { |
| @@ -109,8 +109,8 @@ |
| Set<Integer> ids = new HashSet<Integer>(); |
| for (Automaton a : l) |
| ids.add(System.identityHashCode(a)); |
| - boolean has_aliases = ids.size() != l.size(); |
| - Automaton b = l.get(0); |
| + boolean has_aliases = ids.size() != l.length; |
| + Automaton b = l[0]; |
| if (has_aliases) b = b.cloneExpanded(); |
| else b = b.cloneExpandedIfRequired(); |
| Set<State> ac = b.getAcceptStates(); |
| @@ -191,7 +191,7 @@ |
| while (min-- > 0) |
| as.add(a); |
| as.add(repeat(a)); |
| - return concatenate(as); |
| + return concatenate(as.toArray(new Automaton[as.size()])); |
| } |
| |
| /** |
| @@ -213,7 +213,7 @@ |
| List<Automaton> as = new ArrayList<Automaton>(); |
| while (min-- > 0) |
| as.add(a); |
| - b = concatenate(as); |
| + b = concatenate(as.toArray(new Automaton[as.size()])); |
| } |
| if (max > 0) { |
| Automaton d = a.clone(); |
| Index: lucene/test-framework/src/java/org/apache/lucene/analysis/CannedTokenStream.java |
| =================================================================== |
| --- lucene/test-framework/src/java/org/apache/lucene/analysis/CannedTokenStream.java (revision 1296808) |
| +++ lucene/test-framework/src/java/org/apache/lucene/analysis/CannedTokenStream.java (working copy) |
| @@ -22,6 +22,7 @@ |
| import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
| import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; |
| +import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; |
| |
| /** |
| * emits a canned set of {@link Token} |
| @@ -31,6 +32,7 @@ |
| private int upto = 0; |
| private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); |
| private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); |
| + private final PositionLengthAttribute posLengthAtt = addAttribute(PositionLengthAttribute.class); |
| private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); |
| |
| public CannedTokenStream(Token[] tokens) { |
| @@ -47,6 +49,7 @@ |
| termAtt.setEmpty(); |
| termAtt.append(token.toString()); |
| posIncrAtt.setPositionIncrement(token.getPositionIncrement()); |
| + posLengthAtt.setPositionLength(token.getPositionLength()); |
| offsetAtt.setOffset(token.startOffset(), token.endOffset()); |
| return true; |
| } else { |
| Index: lucene/test-framework/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java |
| =================================================================== |
| --- lucene/test-framework/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java (revision 0) |
| +++ lucene/test-framework/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java (working copy) |
| @@ -0,0 +1,134 @@ |
| +package org.apache.lucene.analysis; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import java.io.IOException; |
| +import java.util.HashMap; |
| +import java.util.Map; |
| + |
| +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; |
| +import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; |
| +import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; |
| +import org.apache.lucene.util.BytesRef; |
| +import org.apache.lucene.util.automaton.Automaton; |
| +import org.apache.lucene.util.automaton.State; |
| +import org.apache.lucene.util.automaton.Transition; |
| + |
| +/** Consumes a TokenStream and creates an {@link Automaton}. */ |
| +public class TokenStreamToAutomaton { |
| + |
| + // nocommit: what bytes to steal! |
| + |
| + // We create transition w/ this label when posInc is 1: |
| + public static final int POS_SEP = 0; |
| + |
| + // nocommit move to oal.util.automaton? |
| + // nocommit: toFST? then we can translate atts into FST weights |
| + |
| + /** Pulls the graph (including {@link PositionLengthAttribute}) from the provided |
| + * {@link TokenStream} and creates the corresponding automaton, where arcs are the |
| + * bytes of each term and a {@link #POS_SEP} arc leads from a position's end state to its start state. |
| + * End states still pending when the stream is exhausted are marked as accept states. |
| + * @param in stream to consume; it is reset here but neither ended nor closed |
| + * @return the automaton built from the stream's tokens |
| + * @throws IOException if the stream fails while being reset or advanced */ |
| + public static Automaton toAutomaton(TokenStream in) throws IOException { |
| + final Automaton a = new Automaton(); |
| + |
| + final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class); |
| + final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class); |
| + final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class); |
| + final BytesRef term = termBytesAtt.getBytesRef(); |
| + |
| + in.reset(); |
| + |
| + // End states of already-seen tokens, keyed by the position they end at; an entry |
| + // is removed once a token starting at that position picks it up: |
| + // nocommit maybe linked list...? |
| + final Map<Integer,State> posToState = new HashMap<Integer,State>(); |
| + |
| + State currentFromState = null; |
| + int pos = -1; |
| + while (in.incrementToken()) { |
| + int posInc = posIncAtt.getPositionIncrement(); |
| + if (currentFromState == null && posInc == 0) { |
| + // TODO: hmm are TS's still allowed to do this...? |
| + posInc = 1; |
| + } |
| + |
| + if (posInc > 0) { |
| + // New node: |
| + pos += posInc; |
| + final State nextFromState; |
| + // remove() hands back the end state registered for pos, or null when there is none: |
| + final State lastEndState = posToState.remove(pos); |
| + if (lastEndState == null) { |
| + // nocommit invalid assert!! if a syn matched |
| + // over what is now a hole this assert falsely |
| + // trips... make test!! |
| + assert currentFromState == null; |
| + nextFromState = a.getInitialState(); |
| + } else { |
| + nextFromState = new State(); |
| + // nocommit if posInc > 1 what to do...? multiple SEP? |
| + lastEndState.addTransition(new Transition(POS_SEP, nextFromState)); |
| + } |
| + currentFromState = nextFromState; |
| + } |
| + |
| + // nocommit: make test for this: |
| + |
| + // nocommit does posLengthAtt make it possible to |
| + // create broken graph? ie what if posInc skips over |
| + // the node created by a previous posLengthAtt!? |
| + // hrm. actually: we must handle this case! it means |
| + // eg a syn matched a stop word but then stop word was |
| + // deleted... |
| + |
| + final int endPos = pos + posLengthAtt.getPositionLength(); |
| + |
| + termBytesAtt.fillBytesRef(); |
| + State endState = posToState.get(endPos); |
| + if (endState == null) { |
| + endState = new State(); |
| + posToState.put(endPos, endState); |
| + } |
| + |
| + State lastState = currentFromState; |
| + |
| + for(int byteIDX=0;byteIDX<term.length;byteIDX++) { |
| + final State nextState; |
| + if (byteIDX == term.length-1) { |
| + nextState = endState; |
| + } else { |
| + nextState = new State(); |
| + } |
| + |
| + lastState.addTransition(new Transition(term.bytes[term.offset + byteIDX] & 0xff, nextState)); // & 0xff: label is the unsigned byte value |
| + lastState = nextState; |
| + } |
| + } |
| + |
| + // nocommit is this... right? |
| + for(State endState : posToState.values()) { |
| + endState.setAccept(true); |
| + } |
| + |
| + return a; |
| + } |
| +} |
| |
| Property changes on: lucene/test-framework/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java |
| ___________________________________________________________________ |
| Added: svn:eol-style |
| ## -0,0 +1 ## |
| +native |