blob: f8958b66d7b1eff87e1a7c9e4145058aeabf021d [file] [log] [blame]
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/SingletonTokenStream.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/SingletonTokenStream.java
new file mode 100644
index 0000000000..872c7bf151
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/SingletonTokenStream.java
@@ -0,0 +1,105 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.miscellaneous;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.util.AttributeSource;
+
+/**
+ * A TokenStream implementation that returns a single token value before
+ * becoming exhausted
+ *
+ * Users can choose whether or not to set offsets on the token stream. As
+ * an example of where not setting offsets may be useful, SingletonTokenStream
+ * could be used in conjunction with {@link ConcatenatingTokenStream} to add
+ * boundary markers to text, which should be inserted before and after the
+ * actual text tokens but should not affect their offsets
+ */
+public final class SingletonTokenStream extends TokenStream {
+
+  private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
+  private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
+  // Starts 'true' so the stream emits nothing until reset() is called, matching
+  // the standard consumption cycle (reset -> incrementToken -> end -> close).
+  private boolean used = true;
+
+  // The single token text to emit.
+  private final String value;
+  // When false, the offset attribute is left at its cleared default of (0, 0).
+  private final boolean setOffsets;
+
+  /**
+   * Create a new SingletonTokenStream with a given value
+   *
+   * @param value the value to emit
+   * @param setOffsets whether or not to set offsets on the value
+   */
+  public SingletonTokenStream(String value, boolean setOffsets) {
+    this.value = value;
+    this.setOffsets = setOffsets;
+  }
+
+  /**
+   * Create a new SingletonTokenStream with a given value and attribute source
+   *
+   * @param source the attribute source to use
+   * @param value the value to emit
+   * @param setOffsets whether or not to set offsets on the value
+   */
+  public SingletonTokenStream(AttributeSource source, String value, boolean setOffsets) {
+    super(source);
+    this.value = value;
+    this.setOffsets = setOffsets;
+  }
+
+  @Override
+  public boolean incrementToken() {
+    // Emit the value exactly once per reset() cycle.
+    if (used) {
+      return false;
+    }
+    clearAttributes();
+    termAttribute.append(value);
+    if (setOffsets) {
+      offsetAttribute.setOffset(0, value.length());
+    }
+    used = true;
+    return true;
+  }
+
+  @Override
+  public void end() throws IOException {
+    super.end();
+    // Re-apply the final offset after super.end(), so consumers reading offsets
+    // at end-of-stream see the end of the emitted value; skipped when offsets
+    // are not being tracked.
+    if (setOffsets) {
+      final int finalOffset = value.length();
+      offsetAttribute.setOffset(finalOffset, finalOffset);
+    }
+  }
+
+  @Override
+  public void reset() {
+    // Make the single token available again, allowing the stream to be reused.
+    used = false;
+  }
+}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenatingTokenStream.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenatingTokenStream.java
index 19542e408f..3d473760ae 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenatingTokenStream.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenatingTokenStream.java
@@ -20,11 +20,14 @@ package org.apache.lucene.analysis.miscellaneous;
import java.io.IOException;
import java.io.StringReader;
+import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.util.AttributeFactory;
@@ -42,21 +45,22 @@ public class TestConcatenatingTokenStream extends BaseTokenStreamTestCase {
final MockTokenizer third = new MockTokenizer(factory, MockTokenizer.WHITESPACE, false);
third.setReader(new StringReader(" third words"));
- TokenStream ts = new ConcatenatingTokenStream(first, second, new EmptyTokenStream(), third);
+ TokenStream ts = new ConcatenatingTokenStream(first, second, new EmptyTokenStream(), third,
+ new SingletonTokenStream(first.cloneAttributes(), "end", false));
assertTokenStreamContents(ts,
- new String[] { "first", "words", "second", "words", "third", "words" },
- new int[]{ 0, 6, 12, 19, 25, 31 },
- new int[]{ 5, 11, 18, 24, 30, 36 });
+ new String[] { "first", "words", "second", "words", "third", "words", "end" },
+ new int[]{ 0, 6, 12, 19, 25, 31, 36 },
+ new int[]{ 5, 11, 18, 24, 30, 36, 36 });
// test re-use
- first.setReader(new StringReader("first words "));
- second.setReader(new StringReader("second words"));
- third.setReader(new StringReader(" third words"));
+ first.setReader(new StringReader("first tokens "));
+ second.setReader(new StringReader("second tokens"));
+ third.setReader(new StringReader(" third tokens"));
assertTokenStreamContents(ts,
- new String[] { "first", "words", "second", "words", "third", "words" },
- new int[]{ 0, 6, 12, 19, 25, 31 },
- new int[]{ 5, 11, 18, 24, 30, 36 },
- new int[]{ 1, 1, 1, 1, 1, 1 });
+ new String[] { "first", "tokens", "second", "tokens", "third", "tokens", "end" },
+ new int[]{ 0, 6, 13, 20, 27, 33, 39 },
+ new int[]{ 5, 12, 19, 26, 32, 39, 39 },
+ new int[]{ 1, 1, 1, 1, 1, 1, 1 });
}
@@ -99,7 +103,7 @@ public class TestConcatenatingTokenStream extends BaseTokenStreamTestCase {
}
- public void testInconsistentAttributeFactories() throws IOException {
+ public void testInconsistentAttributeFactories() {
final MockTokenizer first = new MockTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, MockTokenizer.WHITESPACE, true);
final MockTokenizer second = new MockTokenizer(TokenStream.DEFAULT_TOKEN_ATTRIBUTE_FACTORY, MockTokenizer.WHITESPACE, true);
@@ -108,4 +112,23 @@ public class TestConcatenatingTokenStream extends BaseTokenStreamTestCase {
}
+ public void testThroughAnalyzer() throws IOException {
+
+ Analyzer a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName) {
+ Tokenizer t = new WhitespaceTokenizer();
+ SingletonTokenStream boundary1 = new SingletonTokenStream("qq", false);
+ SingletonTokenStream boundary2 = new SingletonTokenStream("qq", false);
+ return new TokenStreamComponents(t, new ConcatenatingTokenStream(boundary1, t, boundary2));
+ }
+ };
+
+ assertAnalyzesTo(a, "some words",
+ new String[]{ "qq", "some", "words", "qq" },
+ new int[]{ 0, 0, 5, 10 },
+ new int[]{ 0, 4, 10, 10 });
+
+ }
+
}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestSingletonTokenStream.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestSingletonTokenStream.java
new file mode 100644
index 0000000000..bc057b9afc
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestSingletonTokenStream.java
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.miscellaneous;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+
+/** Unit tests for {@link SingletonTokenStream}. */
+public class TestSingletonTokenStream extends BaseTokenStreamTestCase {
+
+  public void testSingletonWithOffsets() throws IOException {
+    SingletonTokenStream ts = new SingletonTokenStream("value", true);
+    assertTokenStreamContents(ts, new String[]{ "value" }, new int[]{ 0 }, new int[]{ 5 }, new int[] { 1 });
+    // reset() makes the single token available again, so the stream is reusable.
+    ts.reset();
+    assertTokenStreamContents(ts, new String[]{ "value" }, new int[]{ 0 }, new int[]{ 5 }, new int[] { 1 });
+  }
+
+  public void testSingletonWithoutOffsets() throws IOException {
+    SingletonTokenStream ts = new SingletonTokenStream("value", false);
+    // With setOffsets=false the offset attribute keeps its cleared default of (0, 0).
+    assertTokenStreamContents(ts, new String[]{ "value" }, new int[]{ 0 }, new int[]{ 0 }, new int[] { 1 });
+  }
+
+}