blob: f8958b66d7b1eff87e1a7c9e4145058aeabf021d [file] [log] [blame]
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/SingletonTokenStream.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/SingletonTokenStream.java
new file mode 100644
index 0000000000..872c7bf151
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/SingletonTokenStream.java
@@ -0,0 +1,105 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.miscellaneous;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.util.AttributeSource;
+
+/**
+ * A TokenStream implementation that returns a single token value before
+ * becoming exhausted
+ *
+ * Users can choose whether or not to set offsets on the token stream. As
+ * an example of where not setting offsets may be useful, SingletonTokenStream
+ * could be used in conjunction with {@link ConcatenatingTokenStream} to add
+ * boundary markers to text, which should be inserted before and after the
+ * actual text tokens but should not affect their offsets
+ */
+public final class SingletonTokenStream extends TokenStream {
+
+  private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
+  private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
+  // Starts 'true' so the stream emits nothing until reset() is called, matching
+  // the standard consumption cycle (reset -> incrementToken -> end -> close).
+  private boolean used = true;
+
+  // The single token text to emit.
+  private final String value;
+  // When false, the offset attribute is left at its cleared default of (0, 0).
+  private final boolean setOffsets;
+
+  /**
+   * Create a new SingletonTokenStream with a given value
+   *
+   * @param value the value to emit
+   * @param setOffsets whether or not to set offsets on the value
+   */
+  public SingletonTokenStream(String value, boolean setOffsets) {
+    this.value = value;
+    this.setOffsets = setOffsets;
+  }
+
+  /**
+   * Create a new SingletonTokenStream with a given value and attribute source
+   *
+   * @param source the attribute source to use
+   * @param value the value to emit
+   * @param setOffsets whether or not to set offsets on the value
+   */
+  public SingletonTokenStream(AttributeSource source, String value, boolean setOffsets) {
+    super(source);
+    this.value = value;
+    this.setOffsets = setOffsets;
+  }
+
+  @Override
+  public boolean incrementToken() {
+    // Emit the value exactly once per reset() cycle.
+    if (used) {
+      return false;
+    }
+    clearAttributes();
+    termAttribute.append(value);
+    if (setOffsets) {
+      offsetAttribute.setOffset(0, value.length());
+    }
+    used = true;
+    return true;
+  }
+
+  @Override
+  public void end() throws IOException {
+    super.end();
+    // Re-apply the final offset after super.end(), so consumers reading offsets
+    // at end-of-stream see the end of the emitted value; skipped when offsets
+    // are not being tracked.
+    if (setOffsets) {
+      final int finalOffset = value.length();
+      offsetAttribute.setOffset(finalOffset, finalOffset);
+    }
+  }
+
+  @Override
+  public void reset() {
+    // Make the single token available again, allowing the stream to be reused.
+    used = false;
+  }
+}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenatingTokenStream.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenatingTokenStream.java
index 19542e408f..3d473760ae 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenatingTokenStream.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenatingTokenStream.java
@@ -20,11 +20,14 @@ package org.apache.lucene.analysis.miscellaneous;
import java.io.IOException;
import java.io.StringReader;
+import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.util.AttributeFactory;
@@ -42,21 +45,22 @@ public class TestConcatenatingTokenStream extends BaseTokenStreamTestCase {
final MockTokenizer third = new MockTokenizer(factory, MockTokenizer.WHITESPACE, false);
third.setReader(new StringReader(" third words"));
- TokenStream ts = new ConcatenatingTokenStream(first, second, new EmptyTokenStream(), third);
+ TokenStream ts = new ConcatenatingTokenStream(first, second, new EmptyTokenStream(), third,
+ new SingletonTokenStream(first.cloneAttributes(), "end", false));
assertTokenStreamContents(ts,
- new String[] { "first", "words", "second", "words", "third", "words" },
- new int[]{ 0, 6, 12, 19, 25, 31 },
- new int[]{ 5, 11, 18, 24, 30, 36 });
+ new String[] { "first", "words", "second", "words", "third", "words", "end" },
+ new int[]{ 0, 6, 12, 19, 25, 31, 36 },
+ new int[]{ 5, 11, 18, 24, 30, 36, 36 });
// test re-use
- first.setReader(new StringReader("first words "));
- second.setReader(new StringReader("second words"));
- third.setReader(new StringReader(" third words"));
+ first.setReader(new StringReader("first tokens "));
+ second.setReader(new StringReader("second tokens"));
+ third.setReader(new StringReader(" third tokens"));
assertTokenStreamContents(ts,
- new String[] { "first", "words", "second", "words", "third", "words" },
- new int[]{ 0, 6, 12, 19, 25, 31 },
- new int[]{ 5, 11, 18, 24, 30, 36 },
- new int[]{ 1, 1, 1, 1, 1, 1 });
+ new String[] { "first", "tokens", "second", "tokens", "third", "tokens", "end" },
+ new int[]{ 0, 6, 13, 20, 27, 33, 39 },
+ new int[]{ 5, 12, 19, 26, 32, 39, 39 },
+ new int[]{ 1, 1, 1, 1, 1, 1, 1 });
}
@@ -99,7 +103,7 @@ public class TestConcatenatingTokenStream extends BaseTokenStreamTestCase {
}
- public void testInconsistentAttributeFactories() throws IOException {
+ public void testInconsistentAttributeFactories() {
final MockTokenizer first = new MockTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, MockTokenizer.WHITESPACE, true);
final MockTokenizer second = new MockTokenizer(TokenStream.DEFAULT_TOKEN_ATTRIBUTE_FACTORY, MockTokenizer.WHITESPACE, true);
@@ -108,4 +112,23 @@ public class TestConcatenatingTokenStream extends BaseTokenStreamTestCase {
}
+ public void testThroughAnalyzer() throws IOException {
+
+ Analyzer a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName) {
+ Tokenizer t = new WhitespaceTokenizer();
+ SingletonTokenStream boundary1 = new SingletonTokenStream("qq", false);
+ SingletonTokenStream boundary2 = new SingletonTokenStream("qq", false);
+ return new TokenStreamComponents(t, new ConcatenatingTokenStream(boundary1, t, boundary2));
+ }
+ };
+
+ assertAnalyzesTo(a, "some words",
+ new String[]{ "qq", "some", "words", "qq" },
+ new int[]{ 0, 0, 5, 10 },
+ new int[]{ 0, 4, 10, 10 });
+
+ }
+
}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestSingletonTokenStream.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestSingletonTokenStream.java
new file mode 100644
index 0000000000..bc057b9afc
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestSingletonTokenStream.java
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.miscellaneous;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+
+/** Unit tests for {@link SingletonTokenStream}. */
+public class TestSingletonTokenStream extends BaseTokenStreamTestCase {
+
+  public void testSingletonWithOffsets() throws IOException {
+    SingletonTokenStream ts = new SingletonTokenStream("value", true);
+    assertTokenStreamContents(ts, new String[]{ "value" }, new int[]{ 0 }, new int[]{ 5 }, new int[] { 1 });
+    // reset() makes the single token available again, so the stream is reusable.
+    ts.reset();
+    assertTokenStreamContents(ts, new String[]{ "value" }, new int[]{ 0 }, new int[]{ 5 }, new int[] { 1 });
+  }
+
+  public void testSingletonWithoutOffsets() throws IOException {
+    SingletonTokenStream ts = new SingletonTokenStream("value", false);
+    // With setOffsets=false the offset attribute keeps its cleared default of (0, 0).
+    assertTokenStreamContents(ts, new String[]{ "value" }, new int[]{ 0 }, new int[]{ 0 }, new int[] { 1 });
+  }
+
+}