blob: 61219352d953a87e4a14fa1c59ee9e9b44afb834 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.miscellaneous;
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.CharsRefBuilder;
import org.junit.Test;
/**
 * Tests for {@link ConcatenateGraphFilter}, which concatenates all tokens of the
 * input token graph into one (or, for graphs with side paths such as synonyms,
 * several) output token(s), joined by a separator label.
 */
public class TestConcatenateGraphFilter extends BaseTokenStreamTestCase {

  // ConcatenateGraphFilter.SEP_LABEL is an int code point; narrow it to a char once
  // so the expected-output builders below can append it directly.
  private static final char SEP_LABEL = (char) ConcatenateGraphFilter.SEP_LABEL;

  /** A single input token passes through unchanged. */
  @Test
  public void testBasic() throws Exception {
    Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, true);
    String input = "mykeyword";
    tokenStream.setReader(new StringReader(input));
    ConcatenateGraphFilter stream = new ConcatenateGraphFilter(tokenStream);
    assertTokenStreamContents(stream, new String[] {input}, null, null, new int[] { 1 });
  }

  /** With preserveSep=false, tokens are concatenated with no separator between them. */
  @Test
  public void testWithNoPreserveSep() throws Exception {
    Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, true);
    String input = "mykeyword another keyword";
    tokenStream.setReader(new StringReader(input));
    ConcatenateGraphFilter stream = new ConcatenateGraphFilter(tokenStream, null, false, 100);
    assertTokenStreamContents(stream, new String[] {"mykeywordanotherkeyword"}, null, null, new int[] { 1 });
  }

  /** By default, multiple tokens are joined by SEP_LABEL into one output token. */
  @Test
  public void testWithMultipleTokens() throws Exception {
    Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, true);
    String input = "mykeyword another keyword";
    tokenStream.setReader(new StringReader(input));
    ConcatenateGraphFilter stream = new ConcatenateGraphFilter(tokenStream);
    CharsRefBuilder builder = new CharsRefBuilder();
    builder.append("mykeyword");
    builder.append(SEP_LABEL);
    builder.append("another");
    builder.append(SEP_LABEL);
    builder.append("keyword");
    assertTokenStreamContents(stream, new String[]{builder.toCharsRef().toString()}, null, null, new int[]{1});
  }

  /** A single-token synonym expands the graph into two full-length outputs. */
  @Test
  public void testWithSynonym() throws Exception {
    SynonymMap.Builder builder = new SynonymMap.Builder(true);
    builder.add(new CharsRef("mykeyword"), new CharsRef("mysynonym"), true);
    Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
    tokenizer.setReader(new StringReader("mykeyword"));
    SynonymFilter filter = new SynonymFilter(tokenizer, builder.build(), true);
    ConcatenateGraphFilter stream = new ConcatenateGraphFilter(filter);
    // Second output has posInc 0: both paths start at the same graph position.
    assertTokenStreamContents(stream, new String[] {"mykeyword", "mysynonym"}, null, null, new int[] { 1, 0 });
  }

  /** A synonym in a multi-token input yields one concatenated output per graph path. */
  @Test
  public void testWithSynonyms() throws Exception {
    SynonymMap.Builder builder = new SynonymMap.Builder(true);
    builder.add(new CharsRef("mykeyword"), new CharsRef("mysynonym"), true);
    Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, true);
    String input = "mykeyword another keyword";
    tokenStream.setReader(new StringReader(input));
    SynonymFilter filter = new SynonymFilter(tokenStream, builder.build(), true);
    ConcatenateGraphFilter stream = new ConcatenateGraphFilter(filter, SEP_LABEL, false, 100);
    String[] expectedOutputs = new String[2];
    CharsRefBuilder expectedOutput = new CharsRefBuilder();
    expectedOutput.append("mykeyword");
    expectedOutput.append(SEP_LABEL);
    expectedOutput.append("another");
    expectedOutput.append(SEP_LABEL);
    expectedOutput.append("keyword");
    expectedOutputs[0] = expectedOutput.toCharsRef().toString();
    expectedOutput.clear();
    expectedOutput.append("mysynonym");
    expectedOutput.append(SEP_LABEL);
    expectedOutput.append("another");
    expectedOutput.append(SEP_LABEL);
    expectedOutput.append("keyword");
    expectedOutputs[1] = expectedOutput.toCharsRef().toString();
    assertTokenStreamContents(stream, expectedOutputs, null, null, new int[]{1, 0});
  }

  /**
   * Removed stopwords leave position-increment "holes"; with preservePositionIncrements=true
   * each hole is rendered as an extra separator.
   */
  @Test
  public void testWithStopword() throws Exception {
    for (boolean preservePosInc : new boolean[]{true, false}) {
      Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, true);
      String input = "a mykeyword a keyword"; //LUCENE-8344 add "a"
      tokenStream.setReader(new StringReader(input));
      TokenFilter tokenFilter = new StopFilter(tokenStream, StopFilter.makeStopSet("a"));
      ConcatenateGraphFilter concatStream = new ConcatenateGraphFilter(tokenFilter, SEP_LABEL, preservePosInc, 10);
      CharsRefBuilder builder = new CharsRefBuilder();
      if (preservePosInc) {
        builder.append(SEP_LABEL);
      }
      builder.append("mykeyword");
      builder.append(SEP_LABEL);
      if (preservePosInc) {
        builder.append(SEP_LABEL);
      }
      builder.append("keyword");
//      if (preservePosInc) { LUCENE-8344 uncomment
//        builder.append(SEP_LABEL);
//      }
      assertTokenStreamContents(concatStream, new String[]{builder.toCharsRef().toString()});
    }
  }

  /**
   * 8 input tokens each with a synonym produce 2^8 = 256 expanded paths; verifies every
   * path is emitted with a non-empty term, and that the default expansion limit allows it.
   */
  @Test
  public void testValidNumberOfExpansions() throws IOException {
    SynonymMap.Builder builder = new SynonymMap.Builder(true);
    for (int i = 0; i < 256; i++) {
      builder.add(new CharsRef("" + (i+1)), new CharsRef("" + (1000 + (i+1))), true);
    }
    StringBuilder valueBuilder = new StringBuilder();
    for (int i = 0 ; i < 8 ; i++) {
      valueBuilder.append(i+1);
      valueBuilder.append(" ");
    }
    MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
    tokenizer.setReader(new StringReader(valueBuilder.toString()));
    SynonymFilter filter = new SynonymFilter(tokenizer, builder.build(), true);
    int count;
    try (ConcatenateGraphFilter stream = new ConcatenateGraphFilter(filter)) {
      stream.reset();
      ConcatenateGraphFilter.BytesRefBuilderTermAttribute attr = stream.addAttribute(ConcatenateGraphFilter.BytesRefBuilderTermAttribute.class);
      count = 0;
      while (stream.incrementToken()) {
        count++;
        assertNotNull(attr.getBytesRef());
        assertTrue(attr.getBytesRef().length > 0);
      }
    }
    // JUnit contract is assertEquals(expected, actual); expected value goes first.
    assertEquals(256, count);
  }

  /** Empty input produces no output tokens at all. */
  @Test
  public void testEmpty() throws IOException {
    Tokenizer tokenizer = whitespaceMockTokenizer("");
    ConcatenateGraphFilter filter = new ConcatenateGraphFilter(tokenizer);
    assertTokenStreamContents(filter, new String[0]);
  }

  /** A custom separator char replaces the default SEP_LABEL in the output. */
  @Test
  public void testSeparator() throws IOException {
    Tokenizer tokenStream = new MockTokenizer(MockTokenizer.SIMPLE, true);
    String input = "...mykeyword.another.keyword.";
    tokenStream.setReader(new StringReader(input));
    ConcatenateGraphFilter stream = new ConcatenateGraphFilter(tokenStream, ' ', false, 100); //not \u001F
    assertTokenStreamContents(stream, new String[] {"mykeyword another keyword"}, null, null, new int[] { 1 });
  }

  /** With preservePositionIncrements=false, stopword holes collapse to a single separator. */
  @Test
  public void testSeparatorWithStopWords() throws IOException {
    Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
    String input = "A B C D E F J H";
    tokenStream.setReader(new StringReader(input));
    TokenStream tokenFilter = new StopFilter(tokenStream, StopFilter.makeStopSet("A", "D", "E", "J"));
    ConcatenateGraphFilter stream = new ConcatenateGraphFilter(tokenFilter, '-', false, 100);
    assertTokenStreamContents(stream, new String[] {"B-C-F-H"}, null, null, new int[] { 1 });
  }

  /** With preservePositionIncrements=true, each stopword hole adds an extra separator. */
  @Test
  public void testSeparatorWithStopWordsAndPreservePositionIncrements() throws IOException {
    Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
    String input = "A B C D E F J H";
    tokenStream.setReader(new StringReader(input));
    TokenStream tokenFilter = new StopFilter(tokenStream, StopFilter.makeStopSet("A", "D", "E", "J"));
    ConcatenateGraphFilter stream = new ConcatenateGraphFilter(tokenFilter, '-', true, 100);
    assertTokenStreamContents(stream, new String[] {"-B-C---F--H"}, null, null, new int[] { 1 });
  }

  /**
   * Multi-word synonyms via SynonymGraphFilter: the separator joins tokens across the
   * whole path, and a multi-token synonym keeps its internal spaces in the output.
   */
  @Test
  public void testSeparatorWithSynonyms() throws IOException {
    SynonymMap.Builder builder = new SynonymMap.Builder(true);
    builder.add(new CharsRef("mykeyword"), new CharsRef("mysynonym"), true);
    builder.add(new CharsRef("mykeyword"), new CharsRef("three words synonym"), true);
    Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
    String input = " mykeyword another keyword   ";
    tokenizer.setReader(new StringReader(input));
    SynonymGraphFilter filter = new SynonymGraphFilter(tokenizer, builder.build(), true);
    ConcatenateGraphFilter stream = new ConcatenateGraphFilter(filter, '-', false, 100);
    assertTokenStreamContents(stream, new String[] {
        "mykeyword-another-keyword",
        "mysynonym-another-keyword",
        "three words synonym-another-keyword"
    }, null, null, new int[] { 1, 0 ,0});
  }
}