| Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestFingerprintFilter.java |
| IDEA additional info: |
| Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP |
| <+>UTF-8 |
| =================================================================== |
| --- lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestFingerprintFilter.java (revision 7bb3e5c2482c7b73ed2dd26ff4be4613e7f44872) |
| +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestFingerprintFilter.java (date 1526676568000) |
| @@ -69,4 +69,13 @@ |
| } |
| } |
| |
| +  /** Empty input must yield no fingerprint token, with and without MockTokenizer checks. */ |
| +  public void testEmpty() throws Exception { |
| +    for (final boolean enableChecks : new boolean[] {true, false}) { |
| +      final MockTokenizer source = whitespaceMockTokenizer(""); |
| +      source.setEnableChecks(enableChecks); |
| +      assertTokenStreamContents(new FingerprintFilter(source), new String[0]); |
| +    } |
| +  } |
| + |
| } |
| Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenateFilterFactory.java |
| IDEA additional info: |
| Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP |
| <+>UTF-8 |
| =================================================================== |
| --- lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenateFilterFactory.java (date 1526675919000) |
| +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenateFilterFactory.java (date 1526675919000) |
| @@ -0,0 +1,55 @@ |
| +/* |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +package org.apache.lucene.analysis.miscellaneous; |
| + |
| +import java.io.Reader; |
| +import java.io.StringReader; |
| + |
| +import org.apache.lucene.analysis.MockTokenizer; |
| +import org.apache.lucene.analysis.TokenStream; |
| +import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase; |
| + |
| +public class TestConcatenateFilterFactory extends BaseTokenStreamFactoryTestCase { |
| +  /** Tokens are joined with the configured separator, with and without MockTokenizer checks. */ |
| +  public void test() throws Exception { |
| +    final boolean[] checkModes = {true, false}; |
| +    for (final boolean enableChecks : checkModes) { |
| +      final MockTokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false); |
| +      final Reader text = new StringReader("A1 B2 A1 D4 C3"); |
| +      source.setReader(text); |
| +      source.setEnableChecks(enableChecks); |
| +      final TokenStream joined = |
| +          tokenFilterFactory("Concatenate", ConcatenateFilterFactory.SEPARATOR_KEY, "_").create(source); |
| +      assertTokenStreamContents(joined, new String[]{"A1_B2_A1_D4_C3"}); |
| +    } |
| +  } |
| + |
| +  /** The factory must be constructible without any parameters. */ |
| +  public void testRequired() throws Exception { |
| +    tokenFilterFactory("Concatenate"); |
| +  } |
| + |
| +  /** Unknown arguments must be rejected with an {@link IllegalArgumentException}. */ |
| +  public void testBogusArguments() throws Exception { |
| +    final IllegalArgumentException thrown = |
| +        expectThrows( |
| +            IllegalArgumentException.class, |
| +            () -> tokenFilterFactory("Concatenate", "bogusArg", "bogusValue")); |
| +    assertTrue(thrown.getMessage().contains("Unknown parameters")); |
| +  } |
| +} |
| Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenateFilter.java |
| IDEA additional info: |
| Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP |
| <+>UTF-8 |
| =================================================================== |
| --- lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenateFilter.java (date 1526678320000) |
| +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenateFilter.java (date 1526678320000) |
| @@ -0,0 +1,64 @@ |
| +/* |
| + This software was produced for the U. S. Government |
| + under Contract No. W15P7T-11-C-F600, and is |
| + subject to the Rights in Noncommercial Computer Software |
| + and Noncommercial Computer Software Documentation |
| + Clause 252.227-7014 (JUN 1995) |
| + |
| + Copyright 2013 The MITRE Corporation. All Rights Reserved. |
| + |
| + Licensed under the Apache License, Version 2.0 (the "License"); |
| + you may not use this file except in compliance with the License. |
| + You may obtain a copy of the License at |
| + |
| + http://www.apache.org/licenses/LICENSE-2.0 |
| + |
| + Unless required by applicable law or agreed to in writing, software |
| + distributed under the License is distributed on an "AS IS" BASIS, |
| + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + See the License for the specific language governing permissions and |
| + limitations under the License. |
| + */ |
| + |
| +package org.apache.lucene.analysis.miscellaneous; |
| + |
| +import java.io.IOException; |
| + |
| +import org.apache.lucene.analysis.BaseTokenStreamTestCase; |
| +import org.apache.lucene.analysis.MockTokenizer; |
| +import org.apache.lucene.analysis.TokenStream; |
| +import org.apache.lucene.analysis.Tokenizer; |
| + |
| +public class TestConcatenateFilter extends BaseTokenStreamTestCase { |
| + |
| +  /** Checks term text, offsets, type and position increment of the single joined token. */ |
| +  public void testTypical() throws IOException { |
| +    final String text = "new york city"; |
| +    final int end = text.length(); |
| +    final ConcatenateFilter joined = new ConcatenateFilter(whitespaceMockTokenizer(text), ' '); |
| +    assertTokenStreamContents(joined, new String[]{text}, new int[]{0}, new int[]{end}, |
| +        new String[]{"shingle"}, new int[]{1}, null, end, true); |
| +  } |
| + |
| +  /** A non-space separator takes the place of the whitespace between the input tokens. */ |
| +  public void testCustomSeparator() throws IOException { |
| +    final String text = "new york city"; |
| +    final ConcatenateFilter joined = new ConcatenateFilter(whitespaceMockTokenizer(text), '_'); |
| +    assertTokenStreamContents(joined, new String[]{text.replace(' ', '_')}); |
| +  } |
| + |
| +  /** A lone token comes out unchanged, with and without MockTokenizer checks. */ |
| +  public void testSingleToken() throws Exception { |
| +    for (final boolean enableChecks : new boolean[] {true, false}) { |
| +      final MockTokenizer source = whitespaceMockTokenizer("A1"); |
| +      source.setEnableChecks(enableChecks); |
| +      assertTokenStreamContents(new ConcatenateFilter(source, ' '), new String[] {"A1"}); |
| +    } |
| +  } |
| + |
| +  /** Empty input yields no tokens at all. */ |
| +  public void testEmpty() throws IOException { |
| +    final ConcatenateFilter joined = new ConcatenateFilter(whitespaceMockTokenizer(""), ' '); |
| +    assertTokenStreamContents(joined, new String[0]); |
| +  } |
| +} |
| \ No newline at end of file |
| Index: lucene/NOTICE.txt |
| IDEA additional info: |
| Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP |
| <+>UTF-8 |
| =================================================================== |
| --- lucene/NOTICE.txt (revision 7bb3e5c2482c7b73ed2dd26ff4be4613e7f44872) |
| +++ lucene/NOTICE.txt (date 1526677162000) |
| @@ -202,3 +202,12 @@ |
| which can be obtained from |
| |
| https://bitbucket.org/eunjeon/mecab-ko-dic/downloads/mecab-ko-dic-2.0.3-20170922.tar.gz |
| + |
| +The ConcatenateFilter came from the OpenSextant Solr Text Tagger, |
| +Copyright 2013 The MITRE Corporation. All Rights Reserved. |
| + |
| + This software was produced for the U. S. Government |
| + under Contract No. W15P7T-11-C-F600, and is |
| + subject to the Rights in Noncommercial Computer Software |
| + and Noncommercial Computer Software Documentation |
| + Clause 252.227-7014 (JUN 1995) |
| \ No newline at end of file |
| Index: lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory |
| IDEA additional info: |
| Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP |
| <+>UTF-8 |
| =================================================================== |
| --- lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory (revision 7bb3e5c2482c7b73ed2dd26ff4be4613e7f44872) |
| +++ lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory (date 1526675800000) |
| @@ -63,6 +63,7 @@ |
| org.apache.lucene.analysis.minhash.MinHashFilterFactory |
| org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilterFactory |
| org.apache.lucene.analysis.miscellaneous.CapitalizationFilterFactory |
| +org.apache.lucene.analysis.miscellaneous.ConcatenateFilterFactory |
| org.apache.lucene.analysis.miscellaneous.CodepointCountFilterFactory |
| org.apache.lucene.analysis.miscellaneous.DateRecognizerFilterFactory |
| org.apache.lucene.analysis.miscellaneous.DelimitedTermFrequencyTokenFilterFactory |
| Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateFilterFactory.java |
| IDEA additional info: |
| Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP |
| <+>UTF-8 |
| =================================================================== |
| --- lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateFilterFactory.java (date 1526675396000) |
| +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateFilterFactory.java (date 1526675396000) |
| @@ -0,0 +1,56 @@ |
| +/* |
| + This software was produced for the U. S. Government |
| + under Contract No. W15P7T-11-C-F600, and is |
| + subject to the Rights in Noncommercial Computer Software |
| + and Noncommercial Computer Software Documentation |
| + Clause 252.227-7014 (JUN 1995) |
| + |
| + Copyright 2013 The MITRE Corporation. All Rights Reserved. |
| + |
| + Licensed under the Apache License, Version 2.0 (the "License"); |
| + you may not use this file except in compliance with the License. |
| + You may obtain a copy of the License at |
| + |
| + http://www.apache.org/licenses/LICENSE-2.0 |
| + |
| + Unless required by applicable law or agreed to in writing, software |
| + distributed under the License is distributed on an "AS IS" BASIS, |
| + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + See the License for the specific language governing permissions and |
| + limitations under the License. |
| + */ |
| + |
| +package org.apache.lucene.analysis.miscellaneous; |
| + |
| +import java.util.Map; |
| + |
| +import org.apache.lucene.analysis.TokenStream; |
| +import org.apache.lucene.analysis.util.TokenFilterFactory; |
| + |
| +/** |
| + * Factory for {@link ConcatenateFilter}. |
| + * |
| + * <p>The {@code separator} argument is optional and defaults to the space character. |
| + * |
| + * @see ConcatenateFilter |
| + * @since 7.4.0 |
| + */ |
| +public class ConcatenateFilterFactory extends TokenFilterFactory { |
| +  /** Name of the optional argument that holds the separator character. */ |
| +  public static final String SEPARATOR_KEY = "separator"; |
| +  private final char separator; |
| + |
| +  /** Creates a new ConcatenateFilterFactory; arguments left unconsumed raise an IllegalArgumentException. */ |
| +  public ConcatenateFilterFactory(Map<String, String> args) { |
| +    super(args); |
| +    this.separator = getChar(args, SEPARATOR_KEY, ' '); |
| +    if (args.isEmpty() == false) { |
| +      throw new IllegalArgumentException("Unknown parameters: " + args); |
| +    } |
| +  } |
| + |
| +  @Override |
| +  public TokenStream create(TokenStream input) { |
| +    return new ConcatenateFilter(input, this.separator); |
| +  } |
| +} |
| Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FingerprintFilter.java |
| IDEA additional info: |
| Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP |
| <+>UTF-8 |
| =================================================================== |
| --- lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FingerprintFilter.java (revision 7bb3e5c2482c7b73ed2dd26ff4be4613e7f44872) |
| +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FingerprintFilter.java (date 1526673044000) |
| @@ -81,8 +81,7 @@ |
| |
| @Override |
| public final boolean incrementToken() throws IOException { |
| - if (uniqueTerms != null) { |
| - // We have already built the single output token - there's no more |
| + if (inputEnded) { |
| return false; |
| } |
| boolean result = buildSingleOutputToken(); |
| @@ -177,6 +176,7 @@ |
| } |
| }); |
| |
| + //TODO let's append directly to termAttribute? |
| StringBuilder sb = new StringBuilder(); |
| for (Object item : items) { |
| if (sb.length() >= 1) { |
| Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateFilter.java |
| IDEA additional info: |
| Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP |
| <+>UTF-8 |
| =================================================================== |
| --- lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateFilter.java (date 1526678320000) |
| +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateFilter.java (date 1526678320000) |
| @@ -0,0 +1,139 @@ |
| +/* |
| + This software was produced for the U. S. Government |
| + under Contract No. W15P7T-11-C-F600, and is |
| + subject to the Rights in Noncommercial Computer Software |
| + and Noncommercial Computer Software Documentation |
| + Clause 252.227-7014 (JUN 1995) |
| + |
| + Copyright 2013 The MITRE Corporation. All Rights Reserved. |
| + |
| + Licensed under the Apache License, Version 2.0 (the "License"); |
| + you may not use this file except in compliance with the License. |
| + You may obtain a copy of the License at |
| + |
| + http://www.apache.org/licenses/LICENSE-2.0 |
| + |
| + Unless required by applicable law or agreed to in writing, software |
| + distributed under the License is distributed on an "AS IS" BASIS, |
| + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + See the License for the specific language governing permissions and |
| + limitations under the License. |
| + */ |
| + |
| +package org.apache.lucene.analysis.miscellaneous; |
| + |
| +import java.io.IOException; |
| + |
| +import org.apache.lucene.analysis.TokenFilter; |
| +import org.apache.lucene.analysis.TokenStream; |
| +import org.apache.lucene.analysis.shingle.ShingleFilter; |
| +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
| +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; |
| +import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; |
| +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; |
| +import org.apache.lucene.util.AttributeSource; |
| + |
| +/** |
| + * Concatenates/joins every incoming token, using a configured separator, into one output token. |
| + * |
| + * <p>The behavior of this filter is undefined when {@link PositionIncrementAttribute} and |
| + * {@link PositionLengthAttribute} have non-default values. Currently these attributes are simply |
| + * ignored, but in the future this filter might insert an additional separator for posInc gaps and |
| + * may produce additional concatenated tokens if there are multiple tokens at the same position. |
| + */ |
| +public class ConcatenateFilter extends TokenFilter { |
| +  // TODO use GraphTokenStreamFiniteStrings to handle arbitrary analysis |
| + |
| +  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); |
| +  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); |
| +  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); |
| +  private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class); |
| +  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); |
| + |
| +  private final char separator; |
| +  /** Reused scratch buffer in which the joined term text is assembled. */ |
| +  private final StringBuilder buf = new StringBuilder(128); |
| +  /** True once {@code input.end()} has been called since the last {@link #reset()}. */ |
| +  private boolean inputEnded = false; |
| +  /** Attribute state captured right after the output token was built; restored in {@link #end()}. */ |
| +  private AttributeSource.State finalState; |
| + |
| +  /** |
| +   * Creates a filter that joins all tokens of {@code input} into a single token. |
| +   * |
| +   * @param input the input TokenStream |
| +   * @param separator the separator placed between each token |
| +   */ |
| +  public ConcatenateFilter(TokenStream input, char separator) { |
| +    super(input); |
| +    this.separator = separator; |
| +  } |
| + |
| +  @Override |
| +  public void reset() throws IOException { |
| +    super.reset(); |
| +    finalState = null; |
| +    inputEnded = false; |
| +  } |
| + |
| +  @Override |
| +  public final boolean incrementToken() throws IOException { |
| +    // note: this code is identical to that of FingerprintFilter |
| +    if (inputEnded) { |
| +      return false; // the input was already fully consumed by an earlier call |
| +    } |
| +    final boolean hasToken = buildSingleOutputToken(); |
| +    // captured whether or not a token was produced; end() restores this state |
| +    finalState = captureState(); |
| +    return hasToken; |
| +  } |
| + |
| +  /** |
| +   * Gathers all tokens from input then concatenates them into the term attribute. |
| +   * |
| +   * @return false if the input produced no tokens at all (end of stream); true otherwise |
| +   */ |
| +  private boolean buildSingleOutputToken() throws IOException { |
| +    buf.setLength(0); |
| +    boolean sawToken = false; |
| +    while (input.incrementToken()) { |
| +      if (sawToken) { |
| +        buf.append(separator); |
| +      } |
| +      // TODO consider indexing special chars when posInc > 1 (stop words). We ignore for now. |
| +      buf.append(termAtt); |
| +      sawToken = true; |
| +    } |
| +    input.end(); // call here so we can see end of stream offsets |
| +    inputEnded = true; |
| + |
| +    if (!sawToken) { |
| +      return false; // no input tokens, not even 1 |
| +    } |
| + |
| +    termAtt.setEmpty().append(buf); |
| +    // Setting the other attributes ultimately won't have much effect but let's be thorough |
| +    offsetAtt.setOffset(0, offsetAtt.endOffset()); |
| +    posIncrAtt.setPositionIncrement(1); |
| +    posLenAtt.setPositionLength(1); // or do we add up the positions? Probably not used anyway. |
| +    typeAtt.setType(ShingleFilter.DEFAULT_TOKEN_TYPE); // "shingle" |
| + |
| +    return true; |
| +  } |
| + |
| +  @Override |
| +  public void end() throws IOException { |
| +    // note: this code is identical to that of FingerprintFilter |
| +    if (!inputEnded) { |
| +      // Rare case - if an IOException occurs while performing buildSingleOutputToken |
| +      // we may not have called input.end() already |
| +      input.end(); |
| +      inputEnded = true; |
| +    } |
| + |
| +    if (finalState != null) { |
| +      restoreState(finalState); |
| +    } |
| +  } |
| +} |