blob: 147a7fb2de55250997c765c9ecf7df2b9c515c45 [file] [log] [blame]
Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestFingerprintFilter.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestFingerprintFilter.java (revision 7bb3e5c2482c7b73ed2dd26ff4be4613e7f44872)
+++ lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestFingerprintFilter.java (date 1526676568000)
@@ -69,4 +69,13 @@
}
}
+  public void testEmpty() throws Exception {
+    // An empty input must yield no tokens, with the MockTokenizer checks switched on and off.
+    for (boolean checksEnabled : new boolean[] { true, false }) {
+      MockTokenizer emptySource = whitespaceMockTokenizer("");
+      emptySource.setEnableChecks(checksEnabled);
+      assertTokenStreamContents(new FingerprintFilter(emptySource), new String[0]);
+    }
+  }
+
}
Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenateFilterFactory.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenateFilterFactory.java (date 1526675919000)
+++ lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenateFilterFactory.java (date 1526675919000)
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.miscellaneous;
+
+import java.io.Reader;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
+
+public class TestConcatenateFilterFactory extends BaseTokenStreamFactoryTestCase {
+
+  /** All whitespace-separated terms are joined with the configured separator, duplicates included. */
+  public void test() throws Exception {
+    for (boolean checksEnabled : new boolean[]{true, false}) {
+      Reader input = new StringReader("A1 B2 A1 D4 C3");
+      MockTokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+      source.setReader(input);
+      source.setEnableChecks(checksEnabled);
+      TokenStream joined =
+          tokenFilterFactory("Concatenate", ConcatenateFilterFactory.SEPARATOR_KEY, "_").create(source);
+      assertTokenStreamContents(joined, new String[]{"A1_B2_A1_D4_C3"});
+    }
+  }
+
+  /** The factory can be created without any arguments. */
+  public void testRequired() throws Exception {
+    tokenFilterFactory("Concatenate");
+  }
+
+  /** Unrecognized arguments are rejected with an "Unknown parameters" message. */
+  public void testBogusArguments() throws Exception {
+    IllegalArgumentException thrown = expectThrows(
+        IllegalArgumentException.class,
+        () -> tokenFilterFactory("Concatenate", "bogusArg", "bogusValue"));
+    String message = thrown.getMessage();
+    assertTrue(message.contains("Unknown parameters"));
+  }
+}
Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenateFilter.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenateFilter.java (date 1526678320000)
+++ lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenateFilter.java (date 1526678320000)
@@ -0,0 +1,64 @@
+/*
+ This software was produced for the U. S. Government
+ under Contract No. W15P7T-11-C-F600, and is
+ subject to the Rights in Noncommercial Computer Software
+ and Noncommercial Computer Software Documentation
+ Clause 252.227-7014 (JUN 1995)
+
+ Copyright 2013 The MITRE Corporation. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+
+package org.apache.lucene.analysis.miscellaneous;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+
+public class TestConcatenateFilter extends BaseTokenStreamTestCase {
+  /** Checks term, offsets, type and position increment of the single token joined with spaces. */
+  public void testTypical() throws IOException {
+    final String text = "new york city";
+    Tokenizer source = whitespaceMockTokenizer(text);
+    assertTokenStreamContents(new ConcatenateFilter(source, ' '), new String[]{text},
+        new int[]{0}, new int[]{text.length()}, new String[]{"shingle"}, new int[]{1}, null, text.length(), true);
+  }
+
+  /** Checks that a custom separator is placed between the joined terms. */
+  public void testCustomSeparator() throws IOException {
+    final String text = "new york city";
+    Tokenizer source = whitespaceMockTokenizer(text);
+    String expected = text.replace(' ', '_');
+    assertTokenStreamContents(new ConcatenateFilter(source, '_'), new String[]{expected});
+  }
+
+  /** Checks that a lone token comes out unchanged, with the MockTokenizer checks on and off. */
+  public void testSingleToken() throws Exception {
+    for (boolean checksEnabled : new boolean[] { true, false }) {
+      MockTokenizer source = whitespaceMockTokenizer("A1");
+      source.setEnableChecks(checksEnabled);
+      TokenStream joined = new ConcatenateFilter(source, ' ');
+      assertTokenStreamContents(joined, new String[] { "A1" });
+    }
+  }
+
+  /** Checks that empty input produces no token at all. */
+  public void testEmpty() throws IOException {
+    Tokenizer source = whitespaceMockTokenizer("");
+    assertTokenStreamContents(new ConcatenateFilter(source, ' '), new String[0]);
+  }
+}
\ No newline at end of file
Index: lucene/NOTICE.txt
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- lucene/NOTICE.txt (revision 7bb3e5c2482c7b73ed2dd26ff4be4613e7f44872)
+++ lucene/NOTICE.txt (date 1526677162000)
@@ -202,3 +202,12 @@
which can be obtained from
https://bitbucket.org/eunjeon/mecab-ko-dic/downloads/mecab-ko-dic-2.0.3-20170922.tar.gz
+
+The ConcatenateFilter came from the OpenSextant Solr Text Tagger,
+Copyright 2013 The MITRE Corporation. All Rights Reserved.
+
+ This software was produced for the U. S. Government
+ under Contract No. W15P7T-11-C-F600, and is
+ subject to the Rights in Noncommercial Computer Software
+ and Noncommercial Computer Software Documentation
+ Clause 252.227-7014 (JUN 1995)
\ No newline at end of file
Index: lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory (revision 7bb3e5c2482c7b73ed2dd26ff4be4613e7f44872)
+++ lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory (date 1526675800000)
@@ -63,6 +63,7 @@
org.apache.lucene.analysis.minhash.MinHashFilterFactory
org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilterFactory
org.apache.lucene.analysis.miscellaneous.CapitalizationFilterFactory
org.apache.lucene.analysis.miscellaneous.CodepointCountFilterFactory
+org.apache.lucene.analysis.miscellaneous.ConcatenateFilterFactory
org.apache.lucene.analysis.miscellaneous.DateRecognizerFilterFactory
org.apache.lucene.analysis.miscellaneous.DelimitedTermFrequencyTokenFilterFactory
Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateFilterFactory.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateFilterFactory.java (date 1526675396000)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateFilterFactory.java (date 1526675396000)
@@ -0,0 +1,56 @@
+/*
+ This software was produced for the U. S. Government
+ under Contract No. W15P7T-11-C-F600, and is
+ subject to the Rights in Noncommercial Computer Software
+ and Noncommercial Computer Software Documentation
+ Clause 252.227-7014 (JUN 1995)
+
+ Copyright 2013 The MITRE Corporation. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+
+package org.apache.lucene.analysis.miscellaneous;
+
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+
+/**
+ * Factory for {@link ConcatenateFilter}.
+ * <p>
+ * The optional {@code separator} argument is the single character placed between the joined
+ * terms; when absent it defaults to the space character.
+ * @see ConcatenateFilter
+ * @since 7.4.0
+ */
+public class ConcatenateFilterFactory extends TokenFilterFactory {
+  /** Name of the optional argument that supplies the separator character. */
+  public static final String SEPARATOR_KEY = "separator";
+  private final char separator;
+
+  /** Reads {@code separator} from {@code args}; any argument left over is reported as unknown. */
+  public ConcatenateFilterFactory(Map<String, String> args) {
+    super(args);
+    this.separator = getChar(args, SEPARATOR_KEY, ' ');
+    if (args.isEmpty() == false) {
+      throw new IllegalArgumentException("Unknown parameters: " + args);
+    }
+  }
+
+  @Override
+  public TokenStream create(TokenStream input) {
+    return new ConcatenateFilter(input, this.separator);
+  }
+}
Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FingerprintFilter.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FingerprintFilter.java (revision 7bb3e5c2482c7b73ed2dd26ff4be4613e7f44872)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FingerprintFilter.java (date 1526673044000)
@@ -81,8 +81,7 @@
@Override
public final boolean incrementToken() throws IOException {
- if (uniqueTerms != null) {
- // We have already built the single output token - there's no more
+ if (inputEnded) {
return false;
}
boolean result = buildSingleOutputToken();
@@ -177,6 +176,7 @@
}
});
+ //TODO let's append directly to termAttribute?
StringBuilder sb = new StringBuilder();
for (Object item : items) {
if (sb.length() >= 1) {
Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateFilter.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateFilter.java (date 1526678320000)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateFilter.java (date 1526678320000)
@@ -0,0 +1,139 @@
+/*
+ This software was produced for the U. S. Government
+ under Contract No. W15P7T-11-C-F600, and is
+ subject to the Rights in Noncommercial Computer Software
+ and Noncommercial Computer Software Documentation
+ Clause 252.227-7014 (JUN 1995)
+
+ Copyright 2013 The MITRE Corporation. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+
+package org.apache.lucene.analysis.miscellaneous;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.shingle.ShingleFilter;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.AttributeSource;
+
+/**
+ * Joins every token of the input stream into a single output token, placing a configurable
+ * separator character between consecutive terms.
+ * <p>
+ * {@link PositionIncrementAttribute} and {@link PositionLengthAttribute} of the input are currently
+ * ignored, so the behavior is undefined when they hold non-default values. A future version might
+ * emit an extra separator for position gaps, or several joined tokens for stacked positions.
+ */
+public class ConcatenateFilter extends TokenFilter {
+  // TODO use GraphTokenStreamFiniteStrings to handle arbitrary analysis
+
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+  private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
+  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
+
+  private final char separator; // placed between two consecutive input terms
+  private final StringBuilder buf = new StringBuilder(128); // scratch space for the joined term
+  private boolean inputEnded = false; // true once input.end() has been called since the last reset
+  private AttributeSource.State finalState; // captured by incrementToken(), restored by end()
+
+  /**
+   * Creates a filter that joins all tokens of {@code input}.
+   *
+   * @param input The input TokenStream
+   * @param separator the separator placed between each token
+   */
+  public ConcatenateFilter(TokenStream input, char separator) {
+    super(input);
+    this.separator = separator;
+  }
+
+  /** Resets the input and forgets the state left over from a previous pass. */
+  @Override
+  public void reset() throws IOException {
+    super.reset();
+    finalState = null;
+    inputEnded = false;
+  }
+
+  /**
+   * Emits the joined token on the first call after a reset, provided the input had at least one
+   * token; once the input has been ended, every later call reports end of stream.
+   */
+  @Override
+  public final boolean incrementToken() throws IOException {
+    //note: this code is identical to that of FingerprintFilter
+    if (inputEnded) {
+      return false; // the input was fully consumed by an earlier call
+    }
+    final boolean hasToken = buildSingleOutputToken();
+    finalState = captureState(); // kept so end() can restore these attribute values
+    return hasToken;
+  }
+
+  /** Drains and ends the input, then fills in the joined token; returns false if the input was empty. */
+  private boolean buildSingleOutputToken() throws IOException {
+    final boolean sawToken = drainInput();
+    input.end(); // called here so the end-of-stream offsets are visible below
+    inputEnded = true;
+    if (sawToken == false) {
+      return false;
+    }
+
+    termAtt.setEmpty().append(buf);
+    // Setting the other attributes ultimately won't have much effect, but let's be thorough.
+    offsetAtt.setOffset(0, offsetAtt.endOffset());
+    posIncrAtt.setPositionIncrement(1);
+    posLenAtt.setPositionLength(1); // or do we add up the positions? Probably not used anyway.
+    typeAtt.setType(ShingleFilter.DEFAULT_TOKEN_TYPE); // "shingle"
+    return true;
+  }
+
+  /** Collects all input terms in {@code buf}, separator-delimited; returns true if any token was seen. */
+  private boolean drainInput() throws IOException {
+    inputEnded = false;
+    buf.setLength(0);
+    boolean sawToken = false;
+    while (input.incrementToken()) {
+      if (sawToken) {
+        buf.append(separator);
+      }
+      //TODO consider indexing special chars when posInc > 1 (stop words). We ignore for now.
+      buf.append(termAtt);
+      sawToken = true;
+    }
+    return sawToken;
+  }
+
+  /** Ends the input if that has not happened yet, then restores the state captured earlier. */
+  @Override
+  public void end() throws IOException {
+    //note: this code is identical to that of FingerprintFilter
+    if (inputEnded == false) {
+      // Rare case: an IOException thrown while gathering tokens means input.end() was never called
+      input.end();
+      inputEnded = true;
+    }
+    if (finalState != null) {
+      restoreState(finalState);
+    }
+  }
+}