blob: edb5ff2362864ad0864fc436840b61bb463792a1 [file] [log] [blame]
Index: lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java
===================================================================
--- lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java (revision 0)
+++ lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java (working copy)
@@ -0,0 +1,148 @@
+package org.apache.lucene.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
+import java.io.StringWriter;
+import java.io.Writer;
+
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.BasicAutomata;
+import org.apache.lucene.util.automaton.BasicOperations;
+
+public class TestGraphTokenizers extends LuceneTestCase {
+
+  // Shorthand: token with zero start/end offsets.
+  private static Token token(String term, int posInc, int posLength) {
+    return token(term, posInc, posLength, 0, 0);
+  }
+
+  private static Token token(String term, int posInc, int posLength, int startOffset, int endOffset) {
+    final Token t = new Token(term, startOffset, endOffset);
+    t.setPositionIncrement(posInc);
+    t.setPositionLength(posLength);
+    return t;
+  }
+
+  public void testSingleToken() throws Exception {
+
+    final TokenStream ts = new CannedTokenStream(
+      new Token[] {
+        token("abc", 1, 1),
+      });
+    final Automaton actual = TokenStreamToAutomaton.toAutomaton(ts);
+    final Automaton expected = BasicAutomata.makeString("abc");
+    assertTrue(BasicOperations.sameLanguage(expected, actual));
+  }
+
+  // for debugging!
+  private static void toDot(Automaton a) throws IOException {
+    final String s = a.toDot();
+    final Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp3/out.dot"), "UTF-8");
+    try {
+      w.write(s);
+    } finally {
+      // Always release the file handle, even if the write fails.
+      w.close();
+    }
+  }
+
+  private static final Automaton POS_SEP = BasicAutomata.makeCharRange(TokenStreamToAutomaton.POS_SEP,
+                                                                      TokenStreamToAutomaton.POS_SEP);
+  public void testTwoTokens() throws Exception {
+
+    final TokenStream ts = new CannedTokenStream(
+      new Token[] {
+        token("abc", 1, 1),
+        token("def", 1, 1),
+      });
+    final Automaton actual = TokenStreamToAutomaton.toAutomaton(ts);
+    final Automaton a1 = BasicAutomata.makeString("abc");
+    final Automaton a2 = BasicAutomata.makeString("def");
+    final Automaton expected = BasicOperations.concatenate(a1, POS_SEP, a2);
+
+    //toDot(actual);
+    assertTrue(BasicOperations.sameLanguage(expected, actual));
+  }
+
+  public void testOverlappedTokensSausage() throws Exception {
+
+    // Two tokens on top of each other (sausage):
+    final TokenStream ts = new CannedTokenStream(
+      new Token[] {
+        token("abc", 1, 1),
+        token("xyz", 0, 1)
+      });
+    final Automaton actual = TokenStreamToAutomaton.toAutomaton(ts);
+    final Automaton a1 = BasicAutomata.makeString("abc");
+    final Automaton a2 = BasicAutomata.makeString("xyz");
+    final Automaton expected = BasicOperations.union(a1, a2);
+    assertTrue(BasicOperations.sameLanguage(expected, actual));
+  }
+
+  public void testOverlappedTokensLattice() throws Exception {
+
+    final TokenStream ts = new CannedTokenStream(
+      new Token[] {
+        token("abc", 1, 1),
+        token("xyz", 0, 2),
+        token("def", 1, 1),
+      });
+    final Automaton actual = TokenStreamToAutomaton.toAutomaton(ts);
+    final Automaton a1 = BasicAutomata.makeString("xyz");
+    final Automaton a2 = BasicOperations.concatenate(BasicAutomata.makeString("abc"),
+                                                     POS_SEP,
+                                                     BasicAutomata.makeString("def"));
+    final Automaton expected = BasicOperations.union(a1, a2);
+    //toDot(actual);
+    assertTrue(BasicOperations.sameLanguage(expected, actual));
+  }
+
+  public void testOverlappedTokensLattice2() throws Exception {
+
+    final TokenStream ts = new CannedTokenStream(
+      new Token[] {
+        token("abc", 1, 1),
+        token("xyz", 0, 3),
+        token("def", 1, 1),
+        token("ghi", 1, 1),
+      });
+    final Automaton actual = TokenStreamToAutomaton.toAutomaton(ts);
+    final Automaton a1 = BasicAutomata.makeString("xyz");
+    final Automaton a2 = BasicOperations.concatenate(BasicAutomata.makeString("abc"),
+                                                     POS_SEP,
+                                                     BasicAutomata.makeString("def"),
+                                                     POS_SEP,
+                                                     BasicAutomata.makeString("ghi"));
+    final Automaton expected = BasicOperations.union(a1, a2);
+    //toDot(actual);
+    assertTrue(BasicOperations.sameLanguage(expected, actual));
+  }
+
+  public void testToDot() throws Exception {
+    final TokenStream ts = new CannedTokenStream(new Token[] {token("abc", 1, 1, 0, 4)});
+    StringWriter w = new StringWriter();
+    new TokenStreamToDot("abcd", ts, new PrintWriter(w)).toDot();
+    assertTrue(w.toString().indexOf("abc / abcd") != -1);
+  }
+}
+
Property changes on: lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
Index: lucene/core/src/java/org/apache/lucene/analysis/Token.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/analysis/Token.java (revision 1296808)
+++ lucene/core/src/java/org/apache/lucene/analysis/Token.java (working copy)
@@ -22,6 +22,7 @@
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.index.Payload;
import org.apache.lucene.index.DocsAndPositionsEnum; // for javadoc
@@ -121,13 +122,15 @@
*/
public class Token extends CharTermAttributeImpl
implements TypeAttribute, PositionIncrementAttribute,
- FlagsAttribute, OffsetAttribute, PayloadAttribute {
+ FlagsAttribute, OffsetAttribute, PayloadAttribute, PositionLengthAttribute {
private int startOffset,endOffset;
private String type = DEFAULT_TYPE;
private int flags;
private Payload payload;
private int positionIncrement = 1;
+ // nocommit should we NOT do this...?
+ private int positionLength = 1;
/** Constructs a Token will null text. */
public Token() {
@@ -270,6 +273,18 @@
return positionIncrement;
}
+ // nocommit jdocs
+ @Override
+ public void setPositionLength(int positionLength) {
+ this.positionLength = positionLength;
+ }
+
+ // nocommit jdocs
+ @Override
+ public int getPositionLength() {
+ return positionLength;
+ }
+
/** Returns this Token's starting offset, the position of the first character
corresponding to this token in the source text.
Index: lucene/core/src/java/org/apache/lucene/util/automaton/BasicOperations.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/util/automaton/BasicOperations.java (revision 1296808)
+++ lucene/core/src/java/org/apache/lucene/util/automaton/BasicOperations.java (working copy)
@@ -90,8 +90,8 @@
* <p>
* Complexity: linear in total number of states.
*/
- static public Automaton concatenate(List<Automaton> l) {
- if (l.isEmpty()) return BasicAutomata.makeEmptyString();
+ static public Automaton concatenate(Automaton... l) {
+ if (l.length == 0) return BasicAutomata.makeEmptyString();
boolean all_singleton = true;
for (Automaton a : l)
if (!a.isSingleton()) {
@@ -109,8 +109,8 @@
Set<Integer> ids = new HashSet<Integer>();
for (Automaton a : l)
ids.add(System.identityHashCode(a));
- boolean has_aliases = ids.size() != l.size();
- Automaton b = l.get(0);
+ boolean has_aliases = ids.size() != l.length;
+ Automaton b = l[0];
if (has_aliases) b = b.cloneExpanded();
else b = b.cloneExpandedIfRequired();
Set<State> ac = b.getAcceptStates();
@@ -191,7 +191,7 @@
while (min-- > 0)
as.add(a);
as.add(repeat(a));
- return concatenate(as);
+ return concatenate(as.toArray(new Automaton[as.size()]));
}
/**
@@ -213,7 +213,7 @@
List<Automaton> as = new ArrayList<Automaton>();
while (min-- > 0)
as.add(a);
- b = concatenate(as);
+ b = concatenate(as.toArray(new Automaton[as.size()]));
}
if (max > 0) {
Automaton d = a.clone();
Index: lucene/test-framework/src/java/org/apache/lucene/analysis/CannedTokenStream.java
===================================================================
--- lucene/test-framework/src/java/org/apache/lucene/analysis/CannedTokenStream.java (revision 1296808)
+++ lucene/test-framework/src/java/org/apache/lucene/analysis/CannedTokenStream.java (working copy)
@@ -22,6 +22,7 @@
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
/**
* emits a canned set of {@link Token}
@@ -31,6 +32,7 @@
private int upto = 0;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+ private final PositionLengthAttribute posLengthAtt = addAttribute(PositionLengthAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
public CannedTokenStream(Token[] tokens) {
@@ -47,6 +49,7 @@
termAtt.setEmpty();
termAtt.append(token.toString());
posIncrAtt.setPositionIncrement(token.getPositionIncrement());
+ posLengthAtt.setPositionLength(token.getPositionLength());
offsetAtt.setOffset(token.startOffset(), token.endOffset());
return true;
} else {
Index: lucene/test-framework/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java
===================================================================
--- lucene/test-framework/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java (revision 0)
+++ lucene/test-framework/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java (working copy)
@@ -0,0 +1,134 @@
+package org.apache.lucene.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.State;
+import org.apache.lucene.util.automaton.Transition;
+
+/** Consumes a TokenStream and creates an {@link Automaton}. */
+public class TokenStreamToAutomaton {
+
+  // nocommit: what bytes to steal!
+
+  // We create transition w/ this label when posInc is 1:
+  public static final int POS_SEP = 0;
+
+  // nocommit move to oal.util.automaton?
+  // nocommit: toFST?  then we can translate atts into FST weights
+
+  /** Pulls the graph (including {@link
+   *  PositionLengthAttribute}) from the provided {@link
+   *  TokenStream}, and creates the corresponding
+   *  automaton where arcs are bytes from each term. */
+  public static Automaton toAutomaton(TokenStream in) throws IOException {
+    final Automaton a = new Automaton();
+
+    final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class);
+    final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
+    final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class);
+    final BytesRef term = termBytesAtt.getBytesRef();
+
+    in.reset();
+
+    // Only temporarily holds states ahead of our current
+    // position:
+    // nocommit maybe linked list...?
+    final Map<Integer,State> posToState = new HashMap<Integer,State>();
+
+    State currentFromState = null;
+    int pos = -1;
+    while (in.incrementToken()) {
+      int posInc = posIncAtt.getPositionIncrement();
+      if (currentFromState == null && posInc == 0) {
+        // TODO: hmm are TS's still allowed to do this...?
+        posInc = 1;
+      }
+
+      if (posInc > 0) {
+        // New node:
+        pos += posInc;
+        final State nextFromState;
+        final State lastEndState = posToState.get(pos);
+        if (lastEndState == null) {
+          // nocommit invalid assert!!  if a syn matched
+          // over what is now a hole this assert falsely
+          // trips... make test!!
+          assert currentFromState == null;
+          nextFromState = a.getInitialState();
+        } else {
+          nextFromState = new State();
+          posToState.remove(pos);
+          // nocommit if posInc > 1 what to do...?  multiple SEP?
+          lastEndState.addTransition(new Transition(POS_SEP, nextFromState));
+        }
+        currentFromState = nextFromState;
+      }
+
+      // nocommit: make test for this:
+
+      // nocommit does posLengthAtt make it possible to
+      // create broken graph?  ie what if posInc skips over
+      // the node created by a previous posLengthAtt!?
+      // hrm. actually: we must handle this case!  it means
+      // eg a syn matched a stop word but then stop word was
+      // deleted...
+
+      final int endPos = pos + posLengthAtt.getPositionLength();
+
+      termBytesAtt.fillBytesRef();
+      State endState = posToState.get(endPos);
+      if (endState == null) {
+        endState = new State();
+        posToState.put(endPos, endState);
+      }
+
+      State lastState = currentFromState;
+
+      for(int byteIDX=0;byteIDX<term.length;byteIDX++) {
+        final State nextState;
+        if (byteIDX == term.length-1) {
+          nextState = endState;
+        } else {
+          nextState = new State();
+        }
+
+        lastState.addTransition(new Transition(term.bytes[term.offset + byteIDX] & 0xff, nextState));
+        lastState = nextState;
+      }
+    }
+
+    // Done consuming; notify the stream per the TokenStream consumer contract.
+    in.end();
+
+    // nocommit is this... right?
+    for(State endState : posToState.values()) {
+      endState.setAccept(true);
+    }
+
+    return a;
+  }
+}
Property changes on: lucene/test-framework/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native