| diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/SingletonTokenStream.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/SingletonTokenStream.java |
| new file mode 100644 |
| index 0000000000..872c7bf151 |
| --- /dev/null |
| +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/SingletonTokenStream.java |
| @@ -0,0 +1,97 @@ |
| +/* |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +package org.apache.lucene.analysis.miscellaneous; |
| + |
| +import java.io.IOException; |
| + |
| +import org.apache.lucene.analysis.TokenStream; |
| +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
| +import org.apache.lucene.util.AttributeSource; |
| + |
| +/** |
| + * A {@link TokenStream} that emits a single token with a fixed value and is then exhausted. |
| + * <p> |
| + * Users can choose whether or not to set offsets on the token. As an example of where not |
| + * setting offsets may be useful, {@code SingletonTokenStream} could be used in conjunction with |
| + * {@link ConcatenatingTokenStream} to add boundary markers to text, which should be inserted |
| + * before and after the actual text tokens but should not affect their offsets. |
| + */ |
| +public final class SingletonTokenStream extends TokenStream { |
| + |
| +  private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class); |
| +  private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class); |
| +  private final String value; |
| +  private final boolean setOffsets; |
| +  private boolean used = true; // starts exhausted: nothing is emitted until reset() has been called |
| + |
| +  /** |
| +   * Create a new SingletonTokenStream with a given value. |
| +   * |
| +   * @param value the value to emit; must not be null |
| +   * @param setOffsets whether or not to set offsets on the value |
| +   */ |
| +  public SingletonTokenStream(String value, boolean setOffsets) { |
| +    this.value = java.util.Objects.requireNonNull(value, "value"); |
| +    this.setOffsets = setOffsets; |
| +  } |
| + |
| +  /** |
| +   * Create a new SingletonTokenStream with a given value and attribute source. |
| +   * |
| +   * @param source the attribute source to use |
| +   * @param value the value to emit; must not be null |
| +   * @param setOffsets whether or not to set offsets on the value |
| +   */ |
| +  public SingletonTokenStream(AttributeSource source, String value, boolean setOffsets) { |
| +    super(source); |
| +    this.value = java.util.Objects.requireNonNull(value, "value"); |
| +    this.setOffsets = setOffsets; |
| +  } |
| + |
| +  /** Emits the value as the only token; returns false on every further call until reset(). */ |
| +  @Override |
| +  public boolean incrementToken() { |
| +    if (used) { |
| +      return false; |
| +    } |
| +    clearAttributes(); |
| +    termAttribute.append(value); |
| +    if (setOffsets) { |
| +      offsetAttribute.setOffset(0, value.length()); |
| +    } |
| +    used = true; |
| +    return true; |
| +  } |
| + |
| +  /** Ends the stream; when offsets are enabled the final offset is the length of the value. */ |
| +  @Override |
| +  public void end() throws IOException { |
| +    super.end(); |
| +    if (setOffsets) { |
| +      offsetAttribute.setOffset(value.length(), value.length()); |
| +    } |
| +  } |
| + |
| +  /** Delegates to super.reset() and then makes the single token available again. */ |
| +  @Override |
| +  public void reset() throws IOException { |
| +    super.reset(); |
| +    used = false; |
| +  } |
| +} |
| diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenatingTokenStream.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenatingTokenStream.java |
| index 19542e408f..3d473760ae 100644 |
| --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenatingTokenStream.java |
| +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenatingTokenStream.java |
| @@ -20,11 +20,14 @@ package org.apache.lucene.analysis.miscellaneous; |
| import java.io.IOException; |
| import java.io.StringReader; |
| |
| +import org.apache.lucene.analysis.Analyzer; |
| import org.apache.lucene.analysis.BaseTokenStreamTestCase; |
| import org.apache.lucene.analysis.CannedTokenStream; |
| import org.apache.lucene.analysis.MockTokenizer; |
| import org.apache.lucene.analysis.Token; |
| import org.apache.lucene.analysis.TokenStream; |
| +import org.apache.lucene.analysis.Tokenizer; |
| +import org.apache.lucene.analysis.core.WhitespaceTokenizer; |
| import org.apache.lucene.analysis.tokenattributes.FlagsAttribute; |
| import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; |
| import org.apache.lucene.util.AttributeFactory; |
| @@ -42,21 +45,22 @@ public class TestConcatenatingTokenStream extends BaseTokenStreamTestCase { |
| final MockTokenizer third = new MockTokenizer(factory, MockTokenizer.WHITESPACE, false); |
| third.setReader(new StringReader(" third words")); |
| |
| - TokenStream ts = new ConcatenatingTokenStream(first, second, new EmptyTokenStream(), third); |
| + TokenStream ts = new ConcatenatingTokenStream(first, second, new EmptyTokenStream(), third, |
| + new SingletonTokenStream(first.cloneAttributes(), "end", false)); |
| assertTokenStreamContents(ts, |
| - new String[] { "first", "words", "second", "words", "third", "words" }, |
| - new int[]{ 0, 6, 12, 19, 25, 31 }, |
| - new int[]{ 5, 11, 18, 24, 30, 36 }); |
| + new String[] { "first", "words", "second", "words", "third", "words", "end" }, |
| + new int[]{ 0, 6, 12, 19, 25, 31, 36 }, |
| + new int[]{ 5, 11, 18, 24, 30, 36, 36 }); |
| |
| // test re-use |
| - first.setReader(new StringReader("first words ")); |
| - second.setReader(new StringReader("second words")); |
| - third.setReader(new StringReader(" third words")); |
| + first.setReader(new StringReader("first tokens ")); |
| + second.setReader(new StringReader("second tokens")); |
| + third.setReader(new StringReader(" third tokens")); |
| assertTokenStreamContents(ts, |
| - new String[] { "first", "words", "second", "words", "third", "words" }, |
| - new int[]{ 0, 6, 12, 19, 25, 31 }, |
| - new int[]{ 5, 11, 18, 24, 30, 36 }, |
| - new int[]{ 1, 1, 1, 1, 1, 1 }); |
| + new String[] { "first", "tokens", "second", "tokens", "third", "tokens", "end" }, |
| + new int[]{ 0, 6, 13, 20, 27, 33, 39 }, |
| + new int[]{ 5, 12, 19, 26, 32, 39, 39 }, |
| + new int[]{ 1, 1, 1, 1, 1, 1, 1 }); |
| |
| } |
| |
| @@ -99,7 +103,7 @@ public class TestConcatenatingTokenStream extends BaseTokenStreamTestCase { |
| |
| } |
| |
| - public void testInconsistentAttributeFactories() throws IOException { |
| + public void testInconsistentAttributeFactories() { |
| |
| final MockTokenizer first = new MockTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, MockTokenizer.WHITESPACE, true); |
| final MockTokenizer second = new MockTokenizer(TokenStream.DEFAULT_TOKEN_ATTRIBUTE_FACTORY, MockTokenizer.WHITESPACE, true); |
| @@ -108,4 +112,23 @@ public class TestConcatenatingTokenStream extends BaseTokenStreamTestCase { |
| |
| } |
| |
| + public void testThroughAnalyzer() throws IOException { |
| + |
| + Analyzer a = new Analyzer() { |
| + @Override |
| + protected TokenStreamComponents createComponents(String fieldName) { |
| + Tokenizer t = new WhitespaceTokenizer(); |
| + SingletonTokenStream boundary1 = new SingletonTokenStream("qq", false); |
| + SingletonTokenStream boundary2 = new SingletonTokenStream("qq", false); |
| + return new TokenStreamComponents(t, new ConcatenatingTokenStream(boundary1, t, boundary2)); |
| + } |
| + }; |
| + |
| + assertAnalyzesTo(a, "some words", |
| + new String[]{ "qq", "some", "words", "qq" }, |
| + new int[]{ 0, 0, 5, 10 }, |
| + new int[]{ 0, 4, 10, 10 }); |
| + |
| + } |
| + |
| } |
| diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestSingletonTokenStream.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestSingletonTokenStream.java |
| new file mode 100644 |
| index 0000000000..bc057b9afc |
| --- /dev/null |
| +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestSingletonTokenStream.java |
| @@ -0,0 +1,38 @@ |
| +/* |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +package org.apache.lucene.analysis.miscellaneous; |
| + |
| +import java.io.IOException; |
| + |
| +import org.apache.lucene.analysis.BaseTokenStreamTestCase; |
| + |
| +/** Checks the single token produced by {@link SingletonTokenStream}, with and without offsets. */ |
| +public class TestSingletonTokenStream extends BaseTokenStreamTestCase { |
| +  public void testSingletonWithOffsets() throws IOException { |
| +    final SingletonTokenStream stream = new SingletonTokenStream("value", true); |
| +    assertSingleValue(stream, 5); |
| +    stream.reset(); |
| +    assertSingleValue(stream, 5); // the same token is expected again on the second pass |
| +  } |
| +  public void testSingletonWithoutOffsets() throws IOException { |
| +    assertSingleValue(new SingletonTokenStream("value", false), 0); // expected end offset is 0 here |
| +  } |
| +  private static void assertSingleValue(SingletonTokenStream stream, int endOffset) throws IOException { |
| +    assertTokenStreamContents(stream, new String[]{ "value" }, new int[]{ 0 }, new int[]{ endOffset }, new int[]{ 1 }); |
| +  } |
| +} |