docs/attachments/LUCENE-8323/LUCENE-8323.patch - lucene-jira-archive - Git at Google

 Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestFingerprintFilter.java
 IDEA additional info:
 Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
 <+>UTF-8
 ===================================================================
 --- lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestFingerprintFilter.java	(revision 7bb3e5c2482c7b73ed2dd26ff4be4613e7f44872)
 +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestFingerprintFilter.java	(date 1526676568000)
 @@ -69,4 +69,13 @@
      }
    }

 +  public void testEmpty() throws Exception {
 +    for (final boolean consumeAll : new boolean[] { true, false }) {
 +      MockTokenizer tokenizer = whitespaceMockTokenizer("");
 +      tokenizer.setEnableChecks(consumeAll);
 +      TokenStream stream = new FingerprintFilter(tokenizer);
 +      assertTokenStreamContents(stream, new String[0]);
 +    }
 +  }
 +
  }
 Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenateFilterFactory.java
 IDEA additional info:
 Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
 <+>UTF-8
 ===================================================================
 --- lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenateFilterFactory.java	(date 1526675919000)
 +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenateFilterFactory.java	(date 1526675919000)
 @@ -0,0 +1,55 @@
 +/*
 + * Licensed to the Apache Software Foundation (ASF) under one or more
 + * contributor license agreements.  See the NOTICE file distributed with
 + * this work for additional information regarding copyright ownership.
 + * The ASF licenses this file to You under the Apache License, Version 2.0
 + * (the "License"); you may not use this file except in compliance with
 + * the License.  You may obtain a copy of the License at
 + *
 + *     http://www.apache.org/licenses/LICENSE-2.0
 + *
 + * Unless required by applicable law or agreed to in writing, software
 + * distributed under the License is distributed on an "AS IS" BASIS,
 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 + * See the License for the specific language governing permissions and
 + * limitations under the License.
 + */
 +
 +package org.apache.lucene.analysis.miscellaneous;
 +
 +import java.io.Reader;
 +import java.io.StringReader;
 +
 +import org.apache.lucene.analysis.MockTokenizer;
 +import org.apache.lucene.analysis.TokenStream;
 +import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
 +
 +public class TestConcatenateFilterFactory extends BaseTokenStreamFactoryTestCase {
 +  public void test() throws Exception {
 +    for (final boolean consumeAll : new boolean[]{true, false}) {
 +      Reader reader = new StringReader("A1 B2 A1 D4 C3");
 +      MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
 +      tokenizer.setReader(reader);
 +      tokenizer.setEnableChecks(consumeAll);
 +      TokenStream stream = tokenizer;
 +      stream = tokenFilterFactory("Concatenate",
 +          ConcatenateFilterFactory.SEPARATOR_KEY, "_"
 +      ).create(stream);
 +      assertTokenStreamContents(stream, new String[]{"A1_B2_A1_D4_C3"});
 +    }
 +  }
 +
 +  public void testRequired() throws Exception {
 +    // no params are required
 +    tokenFilterFactory("Concatenate");
 +  }
 +
 +  /**
 +   * Test that bogus arguments result in exception
 +   */
 +  public void testBogusArguments() throws Exception {
 +    IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () ->
 +        tokenFilterFactory("Concatenate", "bogusArg", "bogusValue"));
 +    assertTrue(expected.getMessage().contains("Unknown parameters"));
 +  }
 +}
 Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenateFilter.java
 IDEA additional info:
 Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
 <+>UTF-8
 ===================================================================
 --- lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenateFilter.java	(date 1526678320000)
 +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenateFilter.java	(date 1526678320000)
 @@ -0,0 +1,64 @@
 +/*
 + This software was produced for the U. S. Government
 + under Contract No. W15P7T-11-C-F600, and is
 + subject to the Rights in Noncommercial Computer Software
 + and Noncommercial Computer Software Documentation
 + Clause 252.227-7014 (JUN 1995)
 +
 + Copyright 2013 The MITRE Corporation. All Rights Reserved.
 +
 + Licensed under the Apache License, Version 2.0 (the "License");
 + you may not use this file except in compliance with the License.
 + You may obtain a copy of the License at
 +
 +     http://www.apache.org/licenses/LICENSE-2.0
 +
 + Unless required by applicable law or agreed to in writing, software
 + distributed under the License is distributed on an "AS IS" BASIS,
 + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 + See the License for the specific language governing permissions and
 + limitations under the License.
 + */
 +
 +package org.apache.lucene.analysis.miscellaneous;
 +
 +import java.io.IOException;
 +
 +import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 +import org.apache.lucene.analysis.MockTokenizer;
 +import org.apache.lucene.analysis.TokenStream;
 +import org.apache.lucene.analysis.Tokenizer;
 +
 +public class TestConcatenateFilter extends BaseTokenStreamTestCase {
 +
 +  public void testTypical() throws IOException {
 +    String NYC = "new york city";
 +    Tokenizer tokenizer = whitespaceMockTokenizer(NYC);
 +    ConcatenateFilter filter = new ConcatenateFilter(tokenizer, ' ');
 +    assertTokenStreamContents(filter, new String[]{NYC},
 +        new int[]{0}, new int[]{NYC.length()}, new String[]{"shingle"},
 +        new int[]{1}, null, NYC.length(), true);
 +  }
 +
 +  public void testCustomSeparator() throws IOException {
 +    String NYC = "new york city";
 +    Tokenizer tokenizer = whitespaceMockTokenizer(NYC);
 +    ConcatenateFilter filter = new ConcatenateFilter(tokenizer, '_');
 +    assertTokenStreamContents(filter, new String[]{NYC.replace(' ', '_')});
 +  }
 +
 +  public void testSingleToken() throws Exception {
 +    for (final boolean consumeAll : new boolean[] { true, false }) {
 +      MockTokenizer tokenizer = whitespaceMockTokenizer("A1");
 +      tokenizer.setEnableChecks(consumeAll);
 +      TokenStream stream = new ConcatenateFilter(tokenizer, ' ');
 +      assertTokenStreamContents(stream, new String[] { "A1" });
 +    }
 +  }
 +
 +  public void testEmpty() throws IOException {
 +    Tokenizer tokenizer = whitespaceMockTokenizer("");
 +    ConcatenateFilter filter = new ConcatenateFilter(tokenizer, ' ');
 +    assertTokenStreamContents(filter, new String[0]);
 +  }
 +}
 \ No newline at end of file
 Index: lucene/NOTICE.txt
 IDEA additional info:
 Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
 <+>UTF-8
 ===================================================================
 --- lucene/NOTICE.txt	(revision 7bb3e5c2482c7b73ed2dd26ff4be4613e7f44872)
 +++ lucene/NOTICE.txt	(date 1526677162000)
 @@ -202,3 +202,12 @@
  which can be obtained from

    https://bitbucket.org/eunjeon/mecab-ko-dic/downloads/mecab-ko-dic-2.0.3-20170922.tar.gz
 +
 +The ConcatenateFilter came from the OpenSextant Solr Text Tagger,
 +Copyright 2013 The MITRE Corporation. All Rights Reserved.
 +
 +  This software was produced for the U. S. Government
 +  under Contract No. W15P7T-11-C-F600, and is
 +  subject to the Rights in Noncommercial Computer Software
 +  and Noncommercial Computer Software Documentation
 +  Clause 252.227-7014 (JUN 1995)
 \ No newline at end of file
 Index: lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
 IDEA additional info:
 Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
 <+>UTF-8
 ===================================================================
 --- lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory	(revision 7bb3e5c2482c7b73ed2dd26ff4be4613e7f44872)
 +++ lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory	(date 1526675800000)
 @@ -63,6 +63,7 @@
  org.apache.lucene.analysis.minhash.MinHashFilterFactory
  org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilterFactory
  org.apache.lucene.analysis.miscellaneous.CapitalizationFilterFactory
 +org.apache.lucene.analysis.miscellaneous.ConcatenateFilterFactory
  org.apache.lucene.analysis.miscellaneous.CodepointCountFilterFactory
  org.apache.lucene.analysis.miscellaneous.DateRecognizerFilterFactory
  org.apache.lucene.analysis.miscellaneous.DelimitedTermFrequencyTokenFilterFactory
 Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateFilterFactory.java
 IDEA additional info:
 Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
 <+>UTF-8
 ===================================================================
 --- lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateFilterFactory.java	(date 1526675396000)
 +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateFilterFactory.java	(date 1526675396000)
 @@ -0,0 +1,56 @@
 +/*
 + This software was produced for the U. S. Government
 + under Contract No. W15P7T-11-C-F600, and is
 + subject to the Rights in Noncommercial Computer Software
 + and Noncommercial Computer Software Documentation
 + Clause 252.227-7014 (JUN 1995)
 +
 + Copyright 2013 The MITRE Corporation. All Rights Reserved.
 +
 + Licensed under the Apache License, Version 2.0 (the "License");
 + you may not use this file except in compliance with the License.
 + You may obtain a copy of the License at
 +
 +     http://www.apache.org/licenses/LICENSE-2.0
 +
 + Unless required by applicable law or agreed to in writing, software
 + distributed under the License is distributed on an "AS IS" BASIS,
 + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 + See the License for the specific language governing permissions and
 + limitations under the License.
 + */
 +
 +package org.apache.lucene.analysis.miscellaneous;
 +
 +import java.util.Map;
 +
 +import org.apache.lucene.analysis.TokenStream;
 +import org.apache.lucene.analysis.util.TokenFilterFactory;
 +
 +/**
 + * Factory for {@link ConcatenateFilter}.
 + *
 + * <p>
 + * The {@code separator} property is optional and defaults to the space character.
 + * </p>
 + * @see ConcatenateFilter
 + * @since 7.4.0
 + */
 +public class ConcatenateFilterFactory extends TokenFilterFactory {
 +
 +  public static final String SEPARATOR_KEY = "separator";
 +  private final char separator;
 +
 +  public ConcatenateFilterFactory(Map<String, String> args) {
 +    super(args);
 +    separator = getChar(args, SEPARATOR_KEY, ' ');
 +    if (!args.isEmpty()) {
 +      throw new IllegalArgumentException("Unknown parameters: " + args);
 +    }
 +  }
 +
 +  @Override
 +  public TokenStream create(TokenStream input) {
 +    return new ConcatenateFilter(input, separator);
 +  }
 +}
 Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FingerprintFilter.java
 IDEA additional info:
 Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
 <+>UTF-8
 ===================================================================
 --- lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FingerprintFilter.java	(revision 7bb3e5c2482c7b73ed2dd26ff4be4613e7f44872)
 +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FingerprintFilter.java	(date 1526673044000)
 @@ -81,8 +81,7 @@

    @Override
    public final boolean incrementToken() throws IOException {
 -    if (uniqueTerms != null) {
 -      // We have already built the single output token - there's no more
 +    if (inputEnded) {
        return false;
      }
      boolean result = buildSingleOutputToken();
 @@ -177,6 +176,7 @@
        }
      });

 +    //TODO let's append directly to termAttribute?
      StringBuilder sb = new StringBuilder();
      for (Object item : items) {
        if (sb.length() >= 1) {
 Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateFilter.java
 IDEA additional info:
 Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
 <+>UTF-8
 ===================================================================
 --- lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateFilter.java	(date 1526678320000)
 +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateFilter.java	(date 1526678320000)
 @@ -0,0 +1,139 @@
 +/*
 + This software was produced for the U. S. Government
 + under Contract No. W15P7T-11-C-F600, and is
 + subject to the Rights in Noncommercial Computer Software
 + and Noncommercial Computer Software Documentation
 + Clause 252.227-7014 (JUN 1995)
 +
 + Copyright 2013 The MITRE Corporation. All Rights Reserved.
 +
 + Licensed under the Apache License, Version 2.0 (the "License");
 + you may not use this file except in compliance with the License.
 + You may obtain a copy of the License at
 +
 +     http://www.apache.org/licenses/LICENSE-2.0
 +
 + Unless required by applicable law or agreed to in writing, software
 + distributed under the License is distributed on an "AS IS" BASIS,
 + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 + See the License for the specific language governing permissions and
 + limitations under the License.
 + */
 +
 +package org.apache.lucene.analysis.miscellaneous;
 +
 +import java.io.IOException;
 +
 +import org.apache.lucene.analysis.TokenFilter;
 +import org.apache.lucene.analysis.TokenStream;
 +import org.apache.lucene.analysis.shingle.ShingleFilter;
 +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 +import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
 +import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 +import org.apache.lucene.util.AttributeSource;
 +
 +/**
 + * Concatenates/Joins every incoming token with a configured separator into one output token.
 + *
 + * <p>The behavior of this filter is undefined when {@link PositionIncrementAttribute} or {@link PositionLengthAttribute}
 + * has a non-default value.  Currently these attributes are simply ignored, but in the future this filter might
 + * insert an additional separator for posInc gaps, and it may produce additional concatenated/joined tokens if there are
 + * multiple tokens at the same position.
 + */
 +public class ConcatenateFilter extends TokenFilter {
 +  /*
 +  TODO use GraphTokenStreamFiniteStrings to handle arbitrary analysis
 +   */
 +
 +  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
 +  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
 +  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
 +  private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
 +  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
 +
 +  private AttributeSource.State finalState;
 +
 +  private final char separator;
 +  private boolean inputEnded = false;
 +  private StringBuilder buf = new StringBuilder(128);
 +
 +  /**
 +   * @param input The input TokenStream
 +   * @param separator the separator placed between each token
 +   */
 +  public ConcatenateFilter(TokenStream input, char separator) {
 +    super(input);
 +    this.separator = separator;
 +  }
 +
 +  @Override
 +  public void reset() throws IOException {
 +    super.reset();
 +    inputEnded = false;
 +    finalState = null;
 +  }
 +
 +  @Override
 +  public final boolean incrementToken() throws IOException {
 +    //note: this code is identical to that of FingerprintFilter
 +    if (inputEnded) {
 +      return false;
 +    }
 +    boolean result = buildSingleOutputToken();
 +    finalState = captureState();
 +    return result;
 +  }
 +
 +  /**
 +   * Gathers all tokens from input then concatenates.
 +   *
 +   * @return false for end of stream; true otherwise
 +   */
 +  private final boolean buildSingleOutputToken() throws IOException {
 +    inputEnded = false;
 +
 +    buf.setLength(0);
 +    boolean firstTerm = true;
 +    while (input.incrementToken()) {
 +      if (!firstTerm) {
 +        buf.append(separator);
 +      }
 +      //TODO consider indexing special chars when posInc > 1 (stop words). We ignore for now.
 +      buf.append(termAtt);
 +      firstTerm = false;
 +    }
 +    input.end();//call here so we can see end of stream offsets
 +    inputEnded = true;
 +
 +    //no input tokens, not even 1
 +    if (firstTerm) {
 +      return false;
 +    }
 +
 +    termAtt.setEmpty().append(buf);
 +    //Setting the other attributes ultimately won't have much effect, but let's be thorough
 +    offsetAtt.setOffset(0, offsetAtt.endOffset());
 +    posIncrAtt.setPositionIncrement(1);
 +    posLenAtt.setPositionLength(1);//or do we add up the positions?  Probably not used any way.
 +    typeAtt.setType(ShingleFilter.DEFAULT_TOKEN_TYPE);//"shingle"
 +
 +    return true;
 +  }
 +
 +  @Override
 +  public void end() throws IOException {
 +    //note: this code is identical to that of FingerprintFilter
 +    if (!inputEnded) {
 +      // Rare case - If an IOException occurs while performing buildSingleOutputToken
 +      // we may not have called input.end() already
 +      input.end();
 +      inputEnded = true;
 +    }
 +
 +    if (finalState != null) {
 +      restoreState(finalState);
 +    }
 +  }
 +}
	Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestFingerprintFilter.java
	IDEA additional info:
	Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
	<+>UTF-8
	===================================================================
	--- lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestFingerprintFilter.java (revision 7bb3e5c2482c7b73ed2dd26ff4be4613e7f44872)
	+++ lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestFingerprintFilter.java (date 1526676568000)
	@@ -69,4 +69,13 @@
	}
	}

	+ public void testEmpty() throws Exception {
	+ for (final boolean consumeAll : new boolean[] { true, false }) {
	+ MockTokenizer tokenizer = whitespaceMockTokenizer("");
	+ tokenizer.setEnableChecks(consumeAll);
	+ TokenStream stream = new FingerprintFilter(tokenizer);
	+ assertTokenStreamContents(stream, new String[0]);
	+ }
	+ }
	+
	}
	Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenateFilterFactory.java
	IDEA additional info:
	Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
	<+>UTF-8
	===================================================================
	--- lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenateFilterFactory.java (date 1526675919000)
	+++ lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenateFilterFactory.java (date 1526675919000)
	@@ -0,0 +1,55 @@
	+/*
	+ * Licensed to the Apache Software Foundation (ASF) under one or more
	+ * contributor license agreements. See the NOTICE file distributed with
	+ * this work for additional information regarding copyright ownership.
	+ * The ASF licenses this file to You under the Apache License, Version 2.0
	+ * (the "License"); you may not use this file except in compliance with
	+ * the License. You may obtain a copy of the License at
	+ *
	+ * http://www.apache.org/licenses/LICENSE-2.0
	+ *
	+ * Unless required by applicable law or agreed to in writing, software
	+ * distributed under the License is distributed on an "AS IS" BASIS,
	+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	+ * See the License for the specific language governing permissions and
	+ * limitations under the License.
	+ */
	+
	+package org.apache.lucene.analysis.miscellaneous;
	+
	+import java.io.Reader;
	+import java.io.StringReader;
	+
	+import org.apache.lucene.analysis.MockTokenizer;
	+import org.apache.lucene.analysis.TokenStream;
	+import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
	+
	+public class TestConcatenateFilterFactory extends BaseTokenStreamFactoryTestCase {
	+ public void test() throws Exception {
	+ for (final boolean consumeAll : new boolean[]{true, false}) {
	+ Reader reader = new StringReader("A1 B2 A1 D4 C3");
	+ MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
	+ tokenizer.setReader(reader);
	+ tokenizer.setEnableChecks(consumeAll);
	+ TokenStream stream = tokenizer;
	+ stream = tokenFilterFactory("Concatenate",
	+ ConcatenateFilterFactory.SEPARATOR_KEY, "_"
	+ ).create(stream);
	+ assertTokenStreamContents(stream, new String[]{"A1_B2_A1_D4_C3"});
	+ }
	+ }
	+
	+ public void testRequired() throws Exception {
	+ // no params are required
	+ tokenFilterFactory("Concatenate");
	+ }
	+
	+ /**
	+ * Test that bogus arguments result in exception
	+ */
	+ public void testBogusArguments() throws Exception {
	+ IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () ->
	+ tokenFilterFactory("Concatenate", "bogusArg", "bogusValue"));
	+ assertTrue(expected.getMessage().contains("Unknown parameters"));
	+ }
	+}
	Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenateFilter.java
	IDEA additional info:
	Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
	<+>UTF-8
	===================================================================
	--- lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenateFilter.java (date 1526678320000)
	+++ lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenateFilter.java (date 1526678320000)
	@@ -0,0 +1,64 @@
	+/*
	+ This software was produced for the U. S. Government
	+ under Contract No. W15P7T-11-C-F600, and is
	+ subject to the Rights in Noncommercial Computer Software
	+ and Noncommercial Computer Software Documentation
	+ Clause 252.227-7014 (JUN 1995)
	+
	+ Copyright 2013 The MITRE Corporation. All Rights Reserved.
	+
	+ Licensed under the Apache License, Version 2.0 (the "License");
	+ you may not use this file except in compliance with the License.
	+ You may obtain a copy of the License at
	+
	+ http://www.apache.org/licenses/LICENSE-2.0
	+
	+ Unless required by applicable law or agreed to in writing, software
	+ distributed under the License is distributed on an "AS IS" BASIS,
	+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	+ See the License for the specific language governing permissions and
	+ limitations under the License.
	+ */
	+
	+package org.apache.lucene.analysis.miscellaneous;
	+
	+import java.io.IOException;
	+
	+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
	+import org.apache.lucene.analysis.MockTokenizer;
	+import org.apache.lucene.analysis.TokenStream;
	+import org.apache.lucene.analysis.Tokenizer;
	+
	+public class TestConcatenateFilter extends BaseTokenStreamTestCase {
	+
	+ public void testTypical() throws IOException {
	+ String NYC = "new york city";
	+ Tokenizer tokenizer = whitespaceMockTokenizer(NYC);
	+ ConcatenateFilter filter = new ConcatenateFilter(tokenizer, ' ');
	+ assertTokenStreamContents(filter, new String[]{NYC},
	+ new int[]{0}, new int[]{NYC.length()}, new String[]{"shingle"},
	+ new int[]{1}, null, NYC.length(), true);
	+ }
	+
	+ public void testCustomSeparator() throws IOException {
	+ String NYC = "new york city";
	+ Tokenizer tokenizer = whitespaceMockTokenizer(NYC);
	+ ConcatenateFilter filter = new ConcatenateFilter(tokenizer, '_');
	+ assertTokenStreamContents(filter, new String[]{NYC.replace(' ', '_')});
	+ }
	+
	+ public void testSingleToken() throws Exception {
	+ for (final boolean consumeAll : new boolean[] { true, false }) {
	+ MockTokenizer tokenizer = whitespaceMockTokenizer("A1");
	+ tokenizer.setEnableChecks(consumeAll);
	+ TokenStream stream = new ConcatenateFilter(tokenizer, ' ');
	+ assertTokenStreamContents(stream, new String[] { "A1" });
	+ }
	+ }
	+
	+ public void testEmpty() throws IOException {
	+ Tokenizer tokenizer = whitespaceMockTokenizer("");
	+ ConcatenateFilter filter = new ConcatenateFilter(tokenizer, ' ');
	+ assertTokenStreamContents(filter, new String[0]);
	+ }
	+}
	\ No newline at end of file
	Index: lucene/NOTICE.txt
	IDEA additional info:
	Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
	<+>UTF-8
	===================================================================
	--- lucene/NOTICE.txt (revision 7bb3e5c2482c7b73ed2dd26ff4be4613e7f44872)
	+++ lucene/NOTICE.txt (date 1526677162000)
	@@ -202,3 +202,12 @@
	which can be obtained from

	https://bitbucket.org/eunjeon/mecab-ko-dic/downloads/mecab-ko-dic-2.0.3-20170922.tar.gz
	+
	+The ConcatenateFilter came from the OpenSextant Solr Text Tagger,
	+Copyright 2013 The MITRE Corporation. All Rights Reserved.
	+
	+ This software was produced for the U. S. Government
	+ under Contract No. W15P7T-11-C-F600, and is
	+ subject to the Rights in Noncommercial Computer Software
	+ and Noncommercial Computer Software Documentation
	+ Clause 252.227-7014 (JUN 1995)
	\ No newline at end of file
	Index: lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
	IDEA additional info:
	Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
	<+>UTF-8
	===================================================================
	--- lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory (revision 7bb3e5c2482c7b73ed2dd26ff4be4613e7f44872)
	+++ lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory (date 1526675800000)
	@@ -63,6 +63,7 @@
	org.apache.lucene.analysis.minhash.MinHashFilterFactory
	org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilterFactory
	org.apache.lucene.analysis.miscellaneous.CapitalizationFilterFactory
	+org.apache.lucene.analysis.miscellaneous.ConcatenateFilterFactory
	org.apache.lucene.analysis.miscellaneous.CodepointCountFilterFactory
	org.apache.lucene.analysis.miscellaneous.DateRecognizerFilterFactory
	org.apache.lucene.analysis.miscellaneous.DelimitedTermFrequencyTokenFilterFactory
	Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateFilterFactory.java
	IDEA additional info:
	Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
	<+>UTF-8
	===================================================================
	--- lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateFilterFactory.java (date 1526675396000)
	+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateFilterFactory.java (date 1526675396000)
	@@ -0,0 +1,56 @@
	+/*
	+ This software was produced for the U. S. Government
	+ under Contract No. W15P7T-11-C-F600, and is
	+ subject to the Rights in Noncommercial Computer Software
	+ and Noncommercial Computer Software Documentation
	+ Clause 252.227-7014 (JUN 1995)
	+
	+ Copyright 2013 The MITRE Corporation. All Rights Reserved.
	+
	+ Licensed under the Apache License, Version 2.0 (the "License");
	+ you may not use this file except in compliance with the License.
	+ You may obtain a copy of the License at
	+
	+ http://www.apache.org/licenses/LICENSE-2.0
	+
	+ Unless required by applicable law or agreed to in writing, software
	+ distributed under the License is distributed on an "AS IS" BASIS,
	+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	+ See the License for the specific language governing permissions and
	+ limitations under the License.
	+ */
	+
	+package org.apache.lucene.analysis.miscellaneous;
	+
	+import java.util.Map;
	+
	+import org.apache.lucene.analysis.TokenStream;
	+import org.apache.lucene.analysis.util.TokenFilterFactory;
	+
	+/**
	+ * Factory for {@link ConcatenateFilter}.
	+ *
	+ * <p>
	+ * The {@code separator} property is optional and defaults to the space character.
	+ * </p>
	+ * @see ConcatenateFilter
	+ * @since 7.4.0
	+ */
	+public class ConcatenateFilterFactory extends TokenFilterFactory {
	+
	+ public static final String SEPARATOR_KEY = "separator";
	+ private final char separator;
	+
	+ public ConcatenateFilterFactory(Map<String, String> args) {
	+ super(args);
	+ separator = getChar(args, SEPARATOR_KEY, ' ');
	+ if (!args.isEmpty()) {
	+ throw new IllegalArgumentException("Unknown parameters: " + args);
	+ }
	+ }
	+
	+ @Override
	+ public TokenStream create(TokenStream input) {
	+ return new ConcatenateFilter(input, separator);
	+ }
	+}
	Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FingerprintFilter.java
	IDEA additional info:
	Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
	<+>UTF-8
	===================================================================
	--- lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FingerprintFilter.java (revision 7bb3e5c2482c7b73ed2dd26ff4be4613e7f44872)
	+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FingerprintFilter.java (date 1526673044000)
	@@ -81,8 +81,7 @@

	@Override
	public final boolean incrementToken() throws IOException {
	- if (uniqueTerms != null) {
	- // We have already built the single output token - there's no more
	+ if (inputEnded) {
	return false;
	}
	boolean result = buildSingleOutputToken();
	@@ -177,6 +176,7 @@
	}
	});

	+ //TODO let's append directly to termAttribute?
	StringBuilder sb = new StringBuilder();
	for (Object item : items) {
	if (sb.length() >= 1) {
	Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateFilter.java
	IDEA additional info:
	Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
	<+>UTF-8
	===================================================================
	--- lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateFilter.java (date 1526678320000)
	+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenateFilter.java (date 1526678320000)
	@@ -0,0 +1,139 @@
	+/*
	+ This software was produced for the U. S. Government
	+ under Contract No. W15P7T-11-C-F600, and is
	+ subject to the Rights in Noncommercial Computer Software
	+ and Noncommercial Computer Software Documentation
	+ Clause 252.227-7014 (JUN 1995)
	+
	+ Copyright 2013 The MITRE Corporation. All Rights Reserved.
	+
	+ Licensed under the Apache License, Version 2.0 (the "License");
	+ you may not use this file except in compliance with the License.
	+ You may obtain a copy of the License at
	+
	+ http://www.apache.org/licenses/LICENSE-2.0
	+
	+ Unless required by applicable law or agreed to in writing, software
	+ distributed under the License is distributed on an "AS IS" BASIS,
	+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	+ See the License for the specific language governing permissions and
	+ limitations under the License.
	+ */
	+
	+package org.apache.lucene.analysis.miscellaneous;
	+
	+import java.io.IOException;
	+
	+import org.apache.lucene.analysis.TokenFilter;
	+import org.apache.lucene.analysis.TokenStream;
	+import org.apache.lucene.analysis.shingle.ShingleFilter;
	+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
	+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
	+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
	+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
	+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
	+import org.apache.lucene.util.AttributeSource;
	+
	+/**
	+ * Concatenates/Joins every incoming token with a configured separator into one output token.
	+ *
	+ * <p>The behavior of this filter is undefined when {@link PositionIncrementAttribute} or {@link PositionLengthAttribute}
	+ * has a non-default value. Currently these attributes are simply ignored, but in the future this filter might
	+ * insert an additional separator for posInc gaps, and it may produce additional concatenated/joined tokens if there are
	+ * multiple tokens at the same position.
	+ */
	+public class ConcatenateFilter extends TokenFilter {
	+ /*
	+ TODO use GraphTokenStreamFiniteStrings to handle arbitrary analysis
	+ */
	+
	+ private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
	+ private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
	+ private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
	+ private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
	+ private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
	+
	+ private AttributeSource.State finalState;
	+
	+ private final char separator;
	+ private boolean inputEnded = false;
	+ private StringBuilder buf = new StringBuilder(128);
	+
	+ /**
	+ * @param input The input TokenStream
	+ * @param separator the separator placed between each token
	+ */
	+ public ConcatenateFilter(TokenStream input, char separator) {
	+ super(input);
	+ this.separator = separator;
	+ }
	+
	+ @Override
	+ public void reset() throws IOException {
	+ super.reset();
	+ inputEnded = false;
	+ finalState = null;
	+ }
	+
	+ @Override
	+ public final boolean incrementToken() throws IOException {
	+ //note: this code is identical to that of FingerprintFilter
	+ if (inputEnded) {
	+ return false;
	+ }
	+ boolean result = buildSingleOutputToken();
	+ finalState = captureState();
	+ return result;
	+ }
	+
	+ /**
	+ * Gathers all tokens from input then concatenates.
	+ *
	+ * @return false for end of stream; true otherwise
	+ */
	+ private final boolean buildSingleOutputToken() throws IOException {
	+ inputEnded = false;
	+
	+ buf.setLength(0);
	+ boolean firstTerm = true;
	+ while (input.incrementToken()) {
	+ if (!firstTerm) {
	+ buf.append(separator);
	+ }
	+ //TODO consider indexing special chars when posInc > 1 (stop words). We ignore for now.
	+ buf.append(termAtt);
	+ firstTerm = false;
	+ }
	+ input.end();//call here so we can see end of stream offsets
	+ inputEnded = true;
	+
	+ //no input tokens, not even 1
	+ if (firstTerm) {
	+ return false;
	+ }
	+
	+ termAtt.setEmpty().append(buf);
	+ //Setting the other attributes ultimately won't have much effect, but let's be thorough
	+ offsetAtt.setOffset(0, offsetAtt.endOffset());
	+ posIncrAtt.setPositionIncrement(1);
	+ posLenAtt.setPositionLength(1);//or do we add up the positions? Probably not used any way.
	+ typeAtt.setType(ShingleFilter.DEFAULT_TOKEN_TYPE);//"shingle"
	+
	+ return true;
	+ }
	+
	+ @Override
	+ public void end() throws IOException {
	+ //note: this code is identical to that of FingerprintFilter
	+ if (!inputEnded) {
	+ // Rare case - If an IOException occurs while performing buildSingleOutputToken
	+ // we may not have called input.end() already
	+ input.end();
	+ inputEnded = true;
	+ }
	+
	+ if (finalState != null) {
	+ restoreState(finalState);
	+ }
	+ }
	+}