/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.synonym;
import java.io.IOException;
import java.io.StringReader;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockGraphTokenFilter;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.FlattenGraphFilter;
import org.apache.lucene.analysis.tokenattributes.*;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.AutomatonTestUtil;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.TooComplexToDeterminizeException;
import org.apache.lucene.util.automaton.Transition;
import org.apache.lucene.util.fst.Util;
public class TestSynonymGraphFilter extends BaseTokenStreamTestCase {
/** Set as a side effect by {@link #getAnalyzer} and {@link #getFlattenAnalyzer}. */
private SynonymGraphFilter synFilter;
private FlattenGraphFilter flattenFilter;
public void testBasicKeepOrigOneOutput() throws Exception {
SynonymMap.Builder b = new SynonymMap.Builder();
add(b, "a b", "x", true);
Analyzer a = getAnalyzer(b, true);
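// assertAnalyzesTo column order used throughout this class: expected terms, start
// offsets, end offsets, token types, position increments, position lengths. Here
// the synonym "x" (posLength=2) stacks over "a b": "a" follows with posInc=0,
// i.e. at the same position as "x".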
assertAnalyzesTo(a,
"c a b",
new String[] {"c", "x", "a", "b"},
new int[] { 0, 2, 2, 4},
new int[] { 1, 5, 3, 5},
new String[] {"word", "SYNONYM", "word", "word"},
new int[] { 1, 1, 0, 1},
new int[] { 1, 2, 1, 1});
a.close();
}
public void testMixedKeepOrig() throws Exception {
SynonymMap.Builder b = new SynonymMap.Builder();
add(b, "a b", "x", true);
add(b, "e f", "y", false);
Analyzer a = getAnalyzer(b, true);
assertAnalyzesTo(a,
"c a b c e f g",
new String[] {"c", "x", "a", "b", "c", "y", "g"},
new int[] { 0, 2, 2, 4, 6, 8, 12},
new int[] { 1, 5, 3, 5, 7, 11, 13},
new String[] {"word", "SYNONYM", "word", "word", "word", "SYNONYM", "word"},
new int[] { 1, 1, 0, 1, 1, 1, 1},
new int[] { 1, 2, 1, 1, 1, 1, 1});
a.close();
}
public void testNoParseAfterBuffer() throws Exception {
SynonymMap.Builder b = new SynonymMap.Builder();
add(b, "b a", "x", true);
Analyzer a = getAnalyzer(b, true);
assertAnalyzesTo(a,
"b b b",
new String[] {"b", "b", "b"},
new int[] { 0, 2, 4},
new int[] { 1, 3, 5},
new String[] {"word", "word", "word"},
new int[] { 1, 1, 1},
new int[] { 1, 1, 1});
a.close();
}
public void testOneInputMultipleOutputKeepOrig() throws Exception {
SynonymMap.Builder b = new SynonymMap.Builder();
add(b, "a b", "x", true);
add(b, "a b", "y", true);
Analyzer a = getAnalyzer(b, true);
assertAnalyzesTo(a,
"c a b c",
new String[] {"c", "x", "y", "a", "b", "c"},
new int[] { 0, 2, 2, 2, 4, 6},
new int[] { 1, 5, 5, 3, 5, 7},
new String[] {"word", "SYNONYM", "SYNONYM", "word", "word", "word"},
new int[] { 1, 1, 0, 0, 1, 1},
new int[] { 1, 2, 2, 1, 1, 1});
a.close();
}
/**
* Verify token type and positionLength after analysis.
*/
public void testPositionLengthAndTypeSimple() throws Exception {
String testFile =
"spider man, spiderman";
Analyzer analyzer = solrSynsToAnalyzer(testFile);
assertAnalyzesToPositions(analyzer, "spider man",
new String[]{"spiderman", "spider", "man"},
new String[]{"SYNONYM", "word", "word"},
new int[]{1, 0, 1},
new int[]{2, 1, 1});
analyzer.close();
}
/**
* parse a syn file with some escaped syntax chars
*/
public void testEscapedStuff() throws Exception {
String testFile =
"a\\=>a => b\\=>b\n" +
"a\\,a => b\\,b";
Analyzer analyzer = solrSynsToAnalyzer(testFile);
assertAnalyzesTo(analyzer, "ball",
new String[]{"ball"},
new int[]{1});
assertAnalyzesTo(analyzer, "a=>a",
new String[]{"b=>b"},
new int[]{1});
assertAnalyzesTo(analyzer, "a,a",
new String[]{"b,b"},
new int[]{1});
analyzer.close();
}
/**
* parse a syn file with bad syntax
*/
public void testInvalidAnalyzesToNothingOutput() throws Exception {
String testFile = "a => 1";
Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, false);
SolrSynonymParser parser = new SolrSynonymParser(true, true, analyzer);
expectThrows(ParseException.class, () -> parser.parse(new StringReader(testFile)));
analyzer.close();
}
/**
* parse a syn file with bad syntax
*/
public void testInvalidDoubleMap() throws Exception {
String testFile = "a => b => c";
Analyzer analyzer = new MockAnalyzer(random());
SolrSynonymParser parser = new SolrSynonymParser(true, true, analyzer);
expectThrows(ParseException.class, () -> parser.parse(new StringReader(testFile)));
analyzer.close();
}
/**
* Tests some simple examples from the solr wiki
*/
public void testSimple() throws Exception {
String testFile =
"i-pod, ipod, ipoooood\n" +
"foo => foo bar\n" +
"foo => baz\n" +
"this test, that testing";
Analyzer analyzer = solrSynsToAnalyzer(testFile);
assertAnalyzesTo(analyzer, "ball",
new String[]{"ball"},
new int[]{1});
assertAnalyzesTo(analyzer, "i-pod",
new String[]{"ipod", "ipoooood", "i-pod"},
new int[]{1, 0, 0});
assertAnalyzesTo(analyzer, "foo",
new String[]{"foo", "baz", "bar"},
new int[]{1, 0, 1});
assertAnalyzesTo(analyzer, "this test",
new String[]{"that", "this", "testing", "test"},
new int[]{1, 0, 1, 0});
analyzer.close();
}
public void testBufferLength() throws Exception {
String testFile =
"c => 8 2 5 6 7\n" +
"f c e d f, 1\n" +
"c g a f d, 6 5 5\n" +
"e c => 4\n" +
"g => 5\n" +
"a g b f e => 5 0 7 7\n" +
"b => 1";
Analyzer analyzer = solrSynsToAnalyzer(testFile);
String doc = "b c g a f b d";
String[] expected = new String[]{"1", "8", "2", "5", "6", "7", "5", "a", "f", "1", "d"};
assertAnalyzesTo(analyzer, doc, expected);
analyzer.close();
}
private Analyzer solrSynsToAnalyzer(String syns) throws IOException, ParseException {
Analyzer analyzer = new MockAnalyzer(random());
SolrSynonymParser parser = new SolrSynonymParser(true, true, analyzer);
parser.parse(new StringReader(syns));
analyzer.close();
return getFlattenAnalyzer(parser, true);
}
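// Solr synonym syntax, as parsed by solrSynsToAnalyzer above: "a, b" declares an
// equivalence class (with expand=true, every entry expands to all terms in the
// class), while "a => b" is a one-way rule that replaces the left side; "," and
// "=>" can be backslash-escaped (see testEscapedStuff).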
public void testMoreThanOneLookAhead() throws Exception {
SynonymMap.Builder b = new SynonymMap.Builder();
add(b, "a b c d", "x", true);
Analyzer a = getAnalyzer(b, true);
assertAnalyzesTo(a,
"a b c e",
new String[] {"a", "b", "c", "e"},
new int[] { 0, 2, 4, 6},
new int[] { 1, 3, 5, 7},
new String[] {"word", "word", "word", "word"},
new int[] { 1, 1, 1, 1},
new int[] { 1, 1, 1, 1});
a.close();
}
public void testLookaheadAfterParse() throws Exception {
SynonymMap.Builder b = new SynonymMap.Builder();
add(b, "b b", "x", true);
add(b, "b", "y", true);
Analyzer a = getAnalyzer(b, true);
assertAnalyzesTo(a, "b a b b",
new String[] {"y", "b", "a", "x", "b", "b"},
new int[] {0, 0, 2, 4, 4, 6},
new int[] {1, 1, 3, 7, 5, 7},
null,
new int[] {1, 0, 1, 1, 0, 1},
new int[] {1, 1, 1, 2, 1, 1},
true);
a.close();
}
public void testLookaheadSecondParse() throws Exception {
SynonymMap.Builder b = new SynonymMap.Builder();
add(b, "b b b", "x", true);
add(b, "b", "y", true);
Analyzer a = getAnalyzer(b, true);
assertAnalyzesTo(a, "b b",
new String[] {"y", "b", "y", "b"},
new int[] { 0, 0, 2, 2},
new int[] { 1, 1, 3, 3},
null,
new int[] { 1, 0, 1, 0},
new int[] { 1, 1, 1, 1},
true);
a.close();
}
public void testOneInputMultipleOutputNoKeepOrig() throws Exception {
SynonymMap.Builder b = new SynonymMap.Builder();
add(b, "a b", "x", false);
add(b, "a b", "y", false);
Analyzer a = getAnalyzer(b, true);
assertAnalyzesTo(a,
"c a b c",
new String[] {"c", "x", "y", "c"},
new int[] { 0, 2, 2, 6},
new int[] { 1, 5, 5, 7},
new String[] {"word", "SYNONYM", "SYNONYM", "word"},
new int[] { 1, 1, 0, 1},
new int[] { 1, 1, 1, 1});
a.close();
}
public void testOneInputMultipleOutputMixedKeepOrig() throws Exception {
SynonymMap.Builder b = new SynonymMap.Builder();
add(b, "a b", "x", true);
add(b, "a b", "y", false);
Analyzer a = getAnalyzer(b, true);
assertAnalyzesTo(a,
"c a b c",
new String[] {"c", "x", "y", "a", "b", "c"},
new int[] { 0, 2, 2, 2, 4, 6},
new int[] { 1, 5, 5, 3, 5, 7},
new String[] {"word", "SYNONYM", "SYNONYM", "word", "word", "word"},
new int[] { 1, 1, 0, 0, 1, 1},
new int[] { 1, 2, 2, 1, 1, 1});
a.close();
}
public void testSynAtEnd() throws Exception {
SynonymMap.Builder b = new SynonymMap.Builder();
add(b, "a b", "x", true);
Analyzer a = getAnalyzer(b, true);
assertAnalyzesTo(a,
"c d e a b",
new String[] {"c", "d", "e", "x", "a", "b"},
new int[] { 0, 2, 4, 6, 6, 8},
new int[] { 1, 3, 5, 9, 7, 9},
new String[] {"word", "word", "word", "SYNONYM", "word", "word"},
new int[] { 1, 1, 1, 1, 0, 1},
new int[] { 1, 1, 1, 2, 1, 1});
a.close();
}
public void testTwoSynsInARow() throws Exception {
SynonymMap.Builder b = new SynonymMap.Builder();
add(b, "a", "x", false);
Analyzer a = getAnalyzer(b, true);
assertAnalyzesTo(a,
"c a a b",
new String[] {"c", "x", "x", "b"},
new int[] { 0, 2, 4, 6},
new int[] { 1, 3, 5, 7},
new String[] {"word", "SYNONYM", "SYNONYM", "word"},
new int[] { 1, 1, 1, 1},
new int[] { 1, 1, 1, 1});
a.close();
}
public void testBasicKeepOrigTwoOutputs() throws Exception {
SynonymMap.Builder b = new SynonymMap.Builder();
add(b, "a b", "x y", true);
add(b, "a b", "m n o", true);
Analyzer a = getAnalyzer(b, true);
assertAnalyzesTo(a,
"c a b d",
new String[] {"c", "x", "m", "a", "y", "n", "o", "b", "d"},
new int[] { 0, 2, 2, 2, 2, 2, 2, 4, 6},
new int[] { 1, 5, 5, 3, 5, 5, 5, 5, 7},
new String[] {"word", "SYNONYM", "SYNONYM", "word", "SYNONYM", "SYNONYM", "SYNONYM", "word", "word"},
new int[] { 1, 1, 0, 0, 1, 1, 1, 1, 1},
new int[] { 1, 1, 2, 4, 4, 1, 2, 1, 1});
a.close();
}
public void testNoCaptureIfNoMatch() throws Exception {
SynonymMap.Builder b = new SynonymMap.Builder();
add(b, "a b", "x y", true);
Analyzer a = getAnalyzer(b, true);
assertAnalyzesTo(a,
"c d d",
new String[] {"c", "d", "d"},
new int[] { 0, 2, 4},
new int[] { 1, 3, 5},
new String[] {"word", "word", "word"},
new int[] { 1, 1, 1},
new int[] { 1, 1, 1});
assertEquals(0, synFilter.getCaptureCount());
a.close();
}
public void testBasicNotKeepOrigOneOutput() throws Exception {
SynonymMap.Builder b = new SynonymMap.Builder();
add(b, "a b", "x", false);
Analyzer a = getAnalyzer(b, true);
assertAnalyzesTo(a,
"c a b",
new String[] {"c", "x"},
new int[] {0, 2},
new int[] {1, 5},
new String[] {"word", "SYNONYM"},
new int[] {1, 1},
new int[] {1, 1});
a.close();
}
public void testBasicNoKeepOrigTwoOutputs() throws Exception {
SynonymMap.Builder b = new SynonymMap.Builder();
add(b, "a b", "x y", false);
add(b, "a b", "m n o", false);
Analyzer a = getAnalyzer(b, true);
assertAnalyzesTo(a,
"c a b d",
new String[] {"c", "x", "m", "y", "n", "o", "d"},
new int[] { 0, 2, 2, 2, 2, 2, 6},
new int[] { 1, 5, 5, 5, 5, 5, 7},
new String[] {"word", "SYNONYM", "SYNONYM", "SYNONYM", "SYNONYM", "SYNONYM", "word"},
new int[] { 1, 1, 0, 1, 1, 1, 1},
new int[] { 1, 1, 2, 3, 1, 1, 1});
a.close();
}
public void testIgnoreCase() throws Exception {
SynonymMap.Builder b = new SynonymMap.Builder();
add(b, "a b", "x y", false);
add(b, "a b", "m n o", false);
Analyzer a = getAnalyzer(b, true);
assertAnalyzesTo(a,
"c A B D",
new String[] {"c", "x", "m", "y", "n", "o", "D"},
new int[] { 0, 2, 2, 2, 2, 2, 6},
new int[] { 1, 5, 5, 5, 5, 5, 7},
new String[] {"word", "SYNONYM", "SYNONYM", "SYNONYM", "SYNONYM", "SYNONYM", "word"},
new int[] { 1, 1, 0, 1, 1, 1, 1},
new int[] { 1, 1, 2, 3, 1, 1, 1});
a.close();
}
public void testDoNotIgnoreCase() throws Exception {
SynonymMap.Builder b = new SynonymMap.Builder();
add(b, "a b", "x y", false);
add(b, "a b", "m n o", false);
Analyzer a = getAnalyzer(b, false);
assertAnalyzesTo(a,
"c A B D",
new String[] {"c", "A", "B", "D"},
new int[] { 0, 2, 4, 6},
new int[] { 1, 3, 5, 7},
new String[] {"word", "word", "word", "word"},
new int[] { 1, 1, 1, 1},
new int[] { 1, 1, 1, 1});
a.close();
}
public void testBufferedFinish1() throws Exception {
SynonymMap.Builder b = new SynonymMap.Builder();
add(b, "a b c", "m n o", false);
Analyzer a = getAnalyzer(b, true);
assertAnalyzesTo(a,
"c a b",
new String[] {"c", "a", "b"},
new int[] { 0, 2, 4},
new int[] { 1, 3, 5},
new String[] {"word", "word", "word"},
new int[] { 1, 1, 1},
new int[] { 1, 1, 1});
a.close();
}
public void testBufferedFinish2() throws Exception {
SynonymMap.Builder b = new SynonymMap.Builder();
add(b, "a b", "m n o", false);
add(b, "d e", "m n o", false);
Analyzer a = getAnalyzer(b, true);
assertAnalyzesTo(a,
"c a d",
new String[] {"c", "a", "d"},
new int[] { 0, 2, 4},
new int[] { 1, 3, 5},
new String[] {"word", "word", "word"},
new int[] { 1, 1, 1},
new int[] { 1, 1, 1});
a.close();
}
public void testCanReuse() throws Exception {
SynonymMap.Builder b = new SynonymMap.Builder();
add(b, "a b", "x", true);
Analyzer a = getAnalyzer(b, true);
for(int i=0;i<10;i++) {
assertAnalyzesTo(a,
"c a b",
new String[] {"c", "x", "a", "b"},
new int[] { 0, 2, 2, 4},
new int[] { 1, 5, 3, 5},
new String[] {"word", "SYNONYM", "word", "word"},
new int[] { 1, 1, 0, 1},
new int[] { 1, 2, 1, 1});
}
a.close();
}
/** Multiple input tokens map to a single output token */
public void testManyToOne() throws Exception {
SynonymMap.Builder b = new SynonymMap.Builder();
add(b, "a b c", "z", true);
Analyzer a = getAnalyzer(b, true);
assertAnalyzesTo(a,
"a b c d",
new String[] {"z", "a", "b", "c", "d"},
new int[] { 0, 0, 2, 4, 6},
new int[] { 5, 1, 3, 5, 7},
new String[] {"SYNONYM", "word", "word", "word", "word"},
new int[] { 1, 0, 1, 1, 1},
new int[] { 3, 1, 1, 1, 1});
a.close();
}
public void testBufferAfterMatch() throws Exception {
SynonymMap.Builder b = new SynonymMap.Builder();
add(b, "a b c d", "x", true);
add(b, "a b", "y", false);
// The 'c' token has to be buffered because SynGraphFilter
// needs to know whether a b c d -> x matches:
Analyzer a = getAnalyzer(b, true);
assertAnalyzesTo(a,
"f a b c e",
new String[] {"f", "y", "c", "e"},
new int[] { 0, 2, 6, 8},
new int[] { 1, 5, 7, 9},
new String[] {"word", "SYNONYM", "word", "word"},
new int[] { 1, 1, 1, 1},
new int[] { 1, 1, 1, 1});
a.close();
}
public void testZeroSyns() throws Exception {
Tokenizer tokenizer = new MockTokenizer();
tokenizer.setReader(new StringReader("aa bb"));
IllegalArgumentException ex = expectThrows(IllegalArgumentException.class, () ->
new SynonymGraphFilter(tokenizer, new SynonymMap.Builder(true).build(), true));
assertEquals("fst must be non-null", ex.getMessage());
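// A SynonymMap built from an empty Builder has a null internal FST, which the
// SynonymGraphFilter constructor rejects up front rather than failing later.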
}
public void testOutputHangsOffEnd() throws Exception {
SynonymMap.Builder b = new SynonymMap.Builder(true);
final boolean keepOrig = false;
// b hangs off the end (no input token under it):
add(b, "a", "a b", keepOrig);
Analyzer a = getFlattenAnalyzer(b, true);
assertAnalyzesTo(a, "a",
new String[] {"a", "b"},
new int[] { 0, 0},
new int[] { 1, 1},
null,
new int[] { 1, 1},
new int[] { 1, 1},
true);
a.close();
}
public void testDedup() throws Exception {
SynonymMap.Builder b = new SynonymMap.Builder(true);
final boolean keepOrig = false;
add(b, "a b", "ab", keepOrig);
add(b, "a b", "ab", keepOrig);
add(b, "a b", "ab", keepOrig);
Analyzer a = getFlattenAnalyzer(b, true);
assertAnalyzesTo(a, "a b",
new String[]{"ab"},
new int[]{1});
a.close();
}
public void testNoDedup() throws Exception {
// dedup is false:
SynonymMap.Builder b = new SynonymMap.Builder(false);
final boolean keepOrig = false;
add(b, "a b", "ab", keepOrig);
add(b, "a b", "ab", keepOrig);
add(b, "a b", "ab", keepOrig);
Analyzer a = getFlattenAnalyzer(b, true);
assertAnalyzesTo(a, "a b",
new String[]{"ab", "ab", "ab"},
new int[]{1, 0, 0});
a.close();
}
public void testMatching() throws Exception {
SynonymMap.Builder b = new SynonymMap.Builder(true);
final boolean keepOrig = false;
add(b, "a b", "ab", keepOrig);
add(b, "a c", "ac", keepOrig);
add(b, "a", "aa", keepOrig);
add(b, "b", "bb", keepOrig);
add(b, "z x c v", "zxcv", keepOrig);
add(b, "x c", "xc", keepOrig);
Analyzer a = getFlattenAnalyzer(b, true);
checkOneTerm(a, "$", "$");
checkOneTerm(a, "a", "aa");
checkOneTerm(a, "b", "bb");
assertAnalyzesTo(a, "a $",
new String[]{"aa", "$"},
new int[]{1, 1});
assertAnalyzesTo(a, "$ a",
new String[]{"$", "aa"},
new int[]{1, 1});
assertAnalyzesTo(a, "a a",
new String[]{"aa", "aa"},
new int[]{1, 1});
assertAnalyzesTo(a, "z x c v",
new String[]{"zxcv"},
new int[]{1});
assertAnalyzesTo(a, "z x c $",
new String[]{"z", "xc", "$"},
new int[]{1, 1, 1});
a.close();
}
public void testBasic1() throws Exception {
SynonymMap.Builder b = new SynonymMap.Builder(true);
add(b, "a", "foo", true);
add(b, "a b", "bar fee", true);
add(b, "b c", "dog collar", true);
add(b, "c d", "dog harness holder extras", true);
add(b, "m c e", "dog barks loudly", false);
add(b, "i j k", "feep", true);
add(b, "e f", "foo bar", false);
add(b, "e f", "baz bee", false);
add(b, "z", "boo", false);
add(b, "y", "bee", true);
Analyzer a = getFlattenAnalyzer(b, true);
assertAnalyzesTo(a, "a b c",
new String[] {"bar", "a", "fee", "b", "c"},
new int[] {1, 0, 1, 0, 1});
assertAnalyzesTo(a, "x a b c d",
new String[] {"x", "bar", "a", "fee", "b", "dog", "c", "harness", "d", "holder", "extras"},
new int[] {1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1});
assertAnalyzesTo(a, "a b a",
new String[] {"bar", "a", "fee", "b", "foo", "a"},
new int[] {1, 0, 1, 0, 1, 0});
// outputs no longer add to one another:
assertAnalyzesTo(a, "c d c d",
new String[] {"dog", "c", "harness", "d", "holder", "extras", "dog", "c", "harness", "d", "holder", "extras"},
new int[] {1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1});
// two outputs for same input
assertAnalyzesTo(a, "e f",
new String[] {"foo", "baz", "bar", "bee"},
new int[] {1, 0, 1, 0});
// verify multi-word / single-output offsets:
assertAnalyzesTo(a, "g i j k g",
new String[] {"g", "feep", "i", "j", "k", "g"},
new int[] {1, 1, 0, 1, 1, 1});
// mixed keepOrig true/false:
assertAnalyzesTo(a, "a m c e x",
new String[] {"foo", "a", "dog", "barks", "loudly", "x"},
new int[] {1, 0, 1, 1, 1, 1});
assertAnalyzesTo(a, "c d m c e x",
new String[] {"dog", "c", "harness", "d", "holder", "extras", "dog", "barks", "loudly","x"},
new int[] {1, 0, 1, 0, 1, 1, 1, 1, 1, 1});
assertTrue(synFilter.getCaptureCount() > 0);
// no captureStates when no syns matched
assertAnalyzesTo(a, "p q r s t",
new String[] {"p", "q", "r", "s", "t"},
new int[] {1, 1, 1, 1, 1});
assertEquals(0, synFilter.getCaptureCount());
// captureStates are necessary for the single-token syn case:
assertAnalyzesTo(a, "p q z y t",
new String[] {"p", "q", "boo", "bee", "y", "t"},
new int[] {1, 1, 1, 1, 0, 1});
assertTrue(synFilter.getCaptureCount() > 0);
a.close();
}
public void testBasic2() throws Exception {
boolean keepOrig = false; // flipped at the top of each do/while pass, so both the true and false cases run
do {
keepOrig = !keepOrig;
SynonymMap.Builder b = new SynonymMap.Builder(true);
add(b,"aaa", "aaaa1 aaaa2 aaaa3", keepOrig);
add(b, "bbb", "bbbb1 bbbb2", keepOrig);
Analyzer a = getFlattenAnalyzer(b, true);
if (keepOrig) {
assertAnalyzesTo(a, "xyzzy bbb pot of gold",
new String[] {"xyzzy", "bbbb1", "bbb", "bbbb2", "pot", "of", "gold"},
new int[] {1, 1, 0, 1, 1, 1, 1});
assertAnalyzesTo(a, "xyzzy aaa pot of gold",
new String[] {"xyzzy", "aaaa1", "aaa", "aaaa2", "aaaa3", "pot", "of", "gold"},
new int[] {1, 1, 0, 1, 1, 1, 1, 1});
} else {
assertAnalyzesTo(a, "xyzzy bbb pot of gold",
new String[] {"xyzzy", "bbbb1", "bbbb2", "pot", "of", "gold"},
new int[] {1, 1, 1, 1, 1, 1});
assertAnalyzesTo(a, "xyzzy aaa pot of gold",
new String[] {"xyzzy", "aaaa1", "aaaa2", "aaaa3", "pot", "of", "gold"},
new int[] {1, 1, 1, 1, 1, 1, 1});
}
a.close();
} while (keepOrig);
}
/** If we expand synonyms during indexing, it's a bit better than
* SynonymFilter is today, but it still necessarily produces false-positive
* and false-negative PhraseQuery matches, because we do not index posLength
* and so we lose information. */
public void testFlattenedGraph() throws Exception {
SynonymMap.Builder b = new SynonymMap.Builder();
add(b, "wtf", "what the fudge", true);
Analyzer a = getFlattenAnalyzer(b, true);
assertAnalyzesTo(a, "wtf happened",
new String[] {"what", "wtf", "the", "fudge", "happened"},
new int[] { 0, 0, 0, 0, 4},
new int[] { 3, 3, 3, 3, 12},
null,
new int[] { 1, 0, 1, 1, 1},
new int[] { 1, 3, 1, 1, 1},
true);
Directory dir = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random(), dir, a);
Document doc = new Document();
doc.add(newTextField("field", "wtf happened", Field.Store.NO));
w.addDocument(doc);
IndexReader r = w.getReader();
w.close();
IndexSearcher s = newSearcher(r);
// Good (this should not match, and doesn't):
assertEquals(0, s.count(new PhraseQuery("field", "what", "happened")));
// Bad (this should match, but doesn't):
assertEquals(0, s.count(new PhraseQuery("field", "wtf", "happened")));
// Good (this should match, and does):
assertEquals(1, s.count(new PhraseQuery("field", "what", "the", "fudge", "happened")));
// Bad (this should not match, but does):
assertEquals(1, s.count(new PhraseQuery("field", "wtf", "the")));
IOUtils.close(r, dir);
}
// Needs TermAutomatonQuery, which is in sandbox still:
/*
public void testAccurateGraphQuery1() throws Exception {
Directory dir = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random(), dir);
Document doc = new Document();
doc.add(newTextField("field", "wtf happened", Field.Store.NO));
w.addDocument(doc);
IndexReader r = w.getReader();
w.close();
IndexSearcher s = newSearcher(r);
SynonymMap.Builder b = new SynonymMap.Builder();
add(b, "what the fudge", "wtf", true);
SynonymMap map = b.build();
TokenStreamToTermAutomatonQuery ts2q = new TokenStreamToTermAutomatonQuery();
TokenStream in = new CannedTokenStream(0, 23, new Token[] {
token("what", 1, 1, 0, 4),
token("the", 1, 1, 5, 8),
token("fudge", 1, 1, 9, 14),
token("happened", 1, 1, 15, 23),
});
assertEquals(1, s.count(ts2q.toQuery("field", new SynonymGraphFilter(in, map, true))));
in = new CannedTokenStream(0, 12, new Token[] {
token("wtf", 1, 1, 0, 3),
token("happened", 1, 1, 4, 12),
});
assertEquals(1, s.count(ts2q.toQuery("field", new SynonymGraphFilter(in, map, true))));
// "what happened" should NOT match:
in = new CannedTokenStream(0, 13, new Token[] {
token("what", 1, 1, 0, 4),
token("happened", 1, 1, 5, 13),
});
assertEquals(0, s.count(ts2q.toQuery("field", new SynonymGraphFilter(in, map, true))));
IOUtils.close(r, dir);
}
*/
/** If we expand synonyms at search time, the results are correct. */
// Needs TermAutomatonQuery, which is in sandbox still:
/*
public void testAccurateGraphQuery2() throws Exception {
Directory dir = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random(), dir);
Document doc = new Document();
doc.add(newTextField("field", "say wtf happened", Field.Store.NO));
w.addDocument(doc);
IndexReader r = w.getReader();
w.close();
IndexSearcher s = newSearcher(r);
SynonymMap.Builder b = new SynonymMap.Builder();
add(b, "what the fudge", "wtf", true);
SynonymMap map = b.build();
TokenStream in = new CannedTokenStream(0, 26, new Token[] {
token("say", 1, 1, 0, 3),
token("what", 1, 1, 3, 7),
token("the", 1, 1, 8, 11),
token("fudge", 1, 1, 12, 17),
token("happened", 1, 1, 18, 26),
});
TokenStreamToTermAutomatonQuery ts2q = new TokenStreamToTermAutomatonQuery();
assertEquals(1, s.count(ts2q.toQuery("field", new SynonymGraphFilter(in, map, true))));
// "what happened" should NOT match:
in = new CannedTokenStream(0, 13, new Token[] {
token("what", 1, 1, 0, 4),
token("happened", 1, 1, 5, 13),
});
assertEquals(0, s.count(ts2q.toQuery("field", new SynonymGraphFilter(in, map, true))));
IOUtils.close(r, dir);
}
*/
// Needs TermAutomatonQuery, which is in sandbox still:
/*
public void testAccurateGraphQuery3() throws Exception {
Directory dir = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random(), dir);
Document doc = new Document();
doc.add(newTextField("field", "say what the fudge happened", Field.Store.NO));
w.addDocument(doc);
IndexReader r = w.getReader();
w.close();
IndexSearcher s = newSearcher(r);
SynonymMap.Builder b = new SynonymMap.Builder();
add(b, "wtf", "what the fudge", true);
SynonymMap map = b.build();
TokenStream in = new CannedTokenStream(0, 15, new Token[] {
token("say", 1, 1, 0, 3),
token("wtf", 1, 1, 3, 6),
token("happened", 1, 1, 7, 15),
});
TokenStreamToTermAutomatonQuery ts2q = new TokenStreamToTermAutomatonQuery();
assertEquals(1, s.count(ts2q.toQuery("field", new SynonymGraphFilter(in, map, true))));
// "what happened" should NOT match:
in = new CannedTokenStream(0, 13, new Token[] {
token("what", 1, 1, 0, 4),
token("happened", 1, 1, 5, 13),
});
assertEquals(0, s.count(ts2q.toQuery("field", new SynonymGraphFilter(in, map, true))));
IOUtils.close(r, dir);
}
private static Token token(String term, int posInc, int posLength, int startOffset, int endOffset) {
final Token t = new Token(term, startOffset, endOffset);
t.setPositionIncrement(posInc);
t.setPositionLength(posLength);
return t;
}
*/
private String randomNonEmptyString() {
while(true) {
String s = TestUtil.randomUnicodeString(random()).trim();
//String s = TestUtil.randomSimpleString(random()).trim();
if (s.length() != 0 && s.indexOf('\u0000') == -1) {
return s;
}
}
}
// Adds MockGraphTokenFilter after SynFilter:
public void testRandomGraphAfter() throws Exception {
final int numIters = atLeast(3);
for (int i = 0; i < numIters; i++) {
SynonymMap.Builder b = new SynonymMap.Builder(random().nextBoolean());
final int numEntries = atLeast(10);
for (int j = 0; j < numEntries; j++) {
add(b, randomNonEmptyString(), randomNonEmptyString(), random().nextBoolean());
}
final SynonymMap map = b.build();
final boolean ignoreCase = random().nextBoolean();
final boolean doFlatten = random().nextBoolean();
final Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
TokenStream syns = new SynonymGraphFilter(tokenizer, map, ignoreCase);
TokenStream graph = new MockGraphTokenFilter(random(), syns);
if (doFlatten) {
graph = new FlattenGraphFilter(graph);
}
return new TokenStreamComponents(tokenizer, graph);
}
};
checkRandomData(random(), analyzer, 100);
analyzer.close();
}
}
public void testEmptyStringInput() throws IOException {
final int numIters = atLeast(10);
for (int i = 0; i < numIters; i++) {
SynonymMap.Builder b = new SynonymMap.Builder(random().nextBoolean());
final int numEntries = atLeast(10);
for (int j = 0; j < numEntries; j++) {
add(b, randomNonEmptyString(), randomNonEmptyString(), random().nextBoolean());
}
final boolean ignoreCase = random().nextBoolean();
Analyzer analyzer = getAnalyzer(b, ignoreCase);
checkAnalysisConsistency(random(), analyzer, random().nextBoolean(), "");
analyzer.close();
}
}
/** Simple random test; doesn't verify correctness, but does verify that
* analysis doesn't throw exceptions and that the stream doesn't misbehave.
*/
public void testRandom2() throws Exception {
final int numIters = atLeast(3);
for (int i = 0; i < numIters; i++) {
SynonymMap.Builder b = new SynonymMap.Builder(random().nextBoolean());
final int numEntries = atLeast(10);
for (int j = 0; j < numEntries; j++) {
add(b, randomNonEmptyString(), randomNonEmptyString(), random().nextBoolean());
}
final boolean ignoreCase = random().nextBoolean();
final boolean doFlatten = random().nextBoolean();
Analyzer analyzer;
if (doFlatten) {
analyzer = getFlattenAnalyzer(b, ignoreCase);
} else {
analyzer = getAnalyzer(b, ignoreCase);
}
checkRandomData(random(), analyzer, 100);
analyzer.close();
}
}
/** Simple random test like testRandom2, but for larger docs. */
public void testRandomHuge() throws Exception {
final int numIters = atLeast(1);
for (int i = 0; i < numIters; i++) {
SynonymMap.Builder b = new SynonymMap.Builder(random().nextBoolean());
final int numEntries = atLeast(10);
if (VERBOSE) {
System.out.println("TEST: iter=" + i + " numEntries=" + numEntries);
}
for (int j = 0; j < numEntries; j++) {
add(b, randomNonEmptyString(), randomNonEmptyString(), random().nextBoolean());
}
final boolean ignoreCase = random().nextBoolean();
final boolean doFlatten = random().nextBoolean();
Analyzer analyzer;
if (doFlatten) {
analyzer = getFlattenAnalyzer(b, ignoreCase);
} else {
analyzer = getAnalyzer(b, ignoreCase);
}
checkRandomData(random(), analyzer, 100, 1024);
analyzer.close();
}
}
public void testEmptyTerm() throws IOException {
final int numIters = atLeast(10);
for (int i = 0; i < numIters; i++) {
SynonymMap.Builder b = new SynonymMap.Builder(random().nextBoolean());
final int numEntries = atLeast(10);
for (int j = 0; j < numEntries; j++) {
add(b, randomNonEmptyString(), randomNonEmptyString(), random().nextBoolean());
}
final boolean ignoreCase = random().nextBoolean();
final Analyzer analyzer = getAnalyzer(b, ignoreCase);
checkAnalysisConsistency(random(), analyzer, random().nextBoolean(), "");
analyzer.close();
}
}
// LUCENE-3375
public void testVanishingTermsNoFlatten() throws Exception {
String testFile =
"aaa => aaaa1 aaaa2 aaaa3\n" +
"bbb => bbbb1 bbbb2\n";
Analyzer analyzer = solrSynsToAnalyzer(testFile);
assertAnalyzesTo(analyzer, "xyzzy bbb pot of gold",
new String[] { "xyzzy", "bbbb1", "bbbb2", "pot", "of", "gold" });
// xyzzy aaa pot of gold -> xyzzy aaaa1 aaaa2 aaaa3 gold
assertAnalyzesTo(analyzer, "xyzzy aaa pot of gold",
new String[] { "xyzzy", "aaaa1", "aaaa2", "aaaa3", "pot", "of", "gold" });
analyzer.close();
}
// LUCENE-3375
public void testVanishingTermsWithFlatten() throws Exception {
String testFile =
"aaa => aaaa1 aaaa2 aaaa3\n" +
"bbb => bbbb1 bbbb2\n";
Analyzer analyzer = solrSynsToAnalyzer(testFile);
assertAnalyzesTo(analyzer, "xyzzy bbb pot of gold",
new String[] { "xyzzy", "bbbb1", "bbbb2", "pot", "of", "gold" });
// xyzzy aaa pot of gold -> xyzzy aaaa1 aaaa2 aaaa3 gold
assertAnalyzesTo(analyzer, "xyzzy aaa pot of gold",
new String[] { "xyzzy", "aaaa1", "aaaa2", "aaaa3", "pot", "of", "gold" });
analyzer.close();
}
public void testBuilderDedup() throws Exception {
SynonymMap.Builder b = new SynonymMap.Builder(true);
final boolean keepOrig = false;
add(b, "a b", "ab", keepOrig);
add(b, "a b", "ab", keepOrig);
add(b, "a b", "ab", keepOrig);
Analyzer a = getAnalyzer(b, true);
assertAnalyzesTo(a, "a b",
new String[] { "ab" },
new int[] { 1 });
a.close();
}
public void testBuilderNoDedup() throws Exception {
SynonymMap.Builder b = new SynonymMap.Builder(false);
final boolean keepOrig = false;
add(b, "a b", "ab", keepOrig);
add(b, "a b", "ab", keepOrig);
add(b, "a b", "ab", keepOrig);
Analyzer a = getAnalyzer(b, true);
assertAnalyzesTo(a, "a b",
new String[] { "ab", "ab", "ab" },
new int[] { 1, 0, 0 });
a.close();
}
public void testRecursion1() throws Exception {
SynonymMap.Builder b = new SynonymMap.Builder(true);
final boolean keepOrig = false;
add(b, "zoo", "zoo", keepOrig);
Analyzer a = getAnalyzer(b, true);
assertAnalyzesTo(a, "zoo zoo $ zoo",
new String[] { "zoo", "zoo", "$", "zoo" },
new int[] { 1, 1, 1, 1 });
a.close();
}
public void testRecursion2() throws Exception {
SynonymMap.Builder b = new SynonymMap.Builder(true);
final boolean keepOrig = false;
add(b, "zoo", "zoo", keepOrig);
add(b, "zoo", "zoo zoo", keepOrig);
Analyzer a = getAnalyzer(b, true);
// verify("zoo zoo $ zoo", "zoo/zoo zoo/zoo/zoo $/zoo zoo/zoo zoo");
assertAnalyzesTo(a, "zoo zoo $ zoo",
new String[] { "zoo", "zoo", "zoo", "zoo", "zoo", "zoo", "$", "zoo", "zoo", "zoo" },
new int[] { 1, 0, 1, 1, 0, 1, 1, 1, 0, 1 });
a.close();
}
public void testRecursion3() throws Exception {
SynonymMap.Builder b = new SynonymMap.Builder(true);
final boolean keepOrig = true;
add(b, "zoo zoo", "zoo", keepOrig);
Analyzer a = getFlattenAnalyzer(b, true);
assertAnalyzesTo(a, "zoo zoo $ zoo",
new String[]{"zoo", "zoo", "zoo", "$", "zoo"},
new int[]{1, 0, 1, 1, 1});
a.close();
}
public void testRecursion4() throws Exception {
SynonymMap.Builder b = new SynonymMap.Builder(true);
final boolean keepOrig = true;
add(b, "zoo zoo", "zoo", keepOrig);
add(b, "zoo", "zoo zoo", keepOrig);
Analyzer a = getFlattenAnalyzer(b, true);
assertAnalyzesTo(a, "zoo zoo $ zoo",
new String[]{"zoo", "zoo", "zoo", "$", "zoo", "zoo", "zoo"},
new int[]{1, 0, 1, 1, 1, 0, 1});
a.close();
}
public void testKeepOrig() throws Exception {
SynonymMap.Builder b = new SynonymMap.Builder(true);
final boolean keepOrig = true;
add(b, "a b", "ab", keepOrig);
add(b, "a c", "ac", keepOrig);
add(b, "a", "aa", keepOrig);
add(b, "b", "bb", keepOrig);
add(b, "z x c v", "zxcv", keepOrig);
add(b, "x c", "xc", keepOrig);
Analyzer a = getAnalyzer(b, true);
assertAnalyzesTo(a, "$",
new String[] { "$" },
new int[] { 1 });
assertAnalyzesTo(a, "a",
new String[] { "aa", "a" },
new int[] { 1, 0 });
assertAnalyzesTo(a, "a",
new String[] { "aa", "a" },
new int[] { 1, 0 });
assertAnalyzesTo(a, "$ a",
new String[] { "$", "aa", "a" },
new int[] { 1, 1, 0 });
assertAnalyzesTo(a, "a $",
new String[] { "aa", "a", "$" },
new int[] { 1, 0, 1 });
assertAnalyzesTo(a, "$ a !",
new String[] { "$", "aa", "a", "!" },
new int[] { 1, 1, 0, 1 });
assertAnalyzesTo(a, "a a",
new String[] { "aa", "a", "aa", "a" },
new int[] { 1, 0, 1, 0 });
assertAnalyzesTo(a, "b",
new String[] { "bb", "b" },
new int[] { 1, 0 });
assertAnalyzesTo(a, "z x c v",
new String[] { "zxcv", "z", "x", "c", "v" },
new int[] { 1, 0, 1, 1, 1 });
assertAnalyzesTo(a, "z x c $",
new String[] { "z", "xc", "x", "c", "$" },
new int[] { 1, 1, 0, 1, 1 });
a.close();
}
/**
* Verify token types and positionLengths on synonyms of different word counts, with non-preserving, explicit rules.
*/
public void testNonPreservingMultiwordSynonyms() throws Exception {
String testFile =
"aaa => two words\n" +
"bbb => one two, very many multiple words\n" +
"ee ff, gg, h i j k, h i => one\n" +
"cc dd => usa,united states,u s a,united states of america";
Analyzer analyzer = solrSynsToAnalyzer(testFile);
assertAnalyzesTo(analyzer, "aaa",
new String[]{"two", "words"},
new int[]{0, 0},
new int[]{3, 3},
new String[]{"SYNONYM", "SYNONYM"},
new int[]{1, 1},
new int[]{1, 1});
assertAnalyzesToPositions(analyzer, "amazing aaa",
new String[]{"amazing", "two", "words"},
new String[]{"word", "SYNONYM", "SYNONYM"},
new int[]{1, 1, 1},
new int[]{1, 1, 1});
assertAnalyzesTo(analyzer, "p bbb s",
new String[]{"p", "one", "very", "two", "many", "multiple", "words", "s"},
new int[]{0, 2, 2, 2, 2, 2, 2, 6},
new int[]{1, 5, 5, 5, 5, 5, 5, 7},
new String[]{"word", "SYNONYM", "SYNONYM", "SYNONYM", "SYNONYM", "SYNONYM", "SYNONYM", "word"},
new int[]{1, 1, 0, 1, 0, 1, 1, 1},
new int[]{1, 1, 1, 3, 1, 1, 1, 1});
assertAnalyzesTo(analyzer, "p ee ff s",
new String[]{"p", "one", "s"},
new int[]{0, 2, 8},
new int[]{1, 7, 9},
new String[]{"word", "SYNONYM", "word"},
new int[]{1, 1, 1},
new int[]{1, 1, 1});
assertAnalyzesTo(analyzer, "p h i j s",
new String[]{"p", "one", "j", "s"},
new int[]{0, 2, 6, 8},
new int[]{1, 5, 7, 9},
new String[]{"word", "SYNONYM", "word", "word"},
new int[]{1, 1, 1, 1},
new int[]{1, 1, 1, 1});
analyzer.close();
}
private Analyzer getAnalyzer(SynonymMap.Builder b, final boolean ignoreCase) throws IOException {
final SynonymMap map = b.build();
return new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
// Make a local variable so testRandomHuge doesn't share it across threads!
SynonymGraphFilter synFilter = new SynonymGraphFilter(tokenizer, map, ignoreCase);
TestSynonymGraphFilter.this.flattenFilter = null;
TestSynonymGraphFilter.this.synFilter = synFilter;
return new TokenStreamComponents(tokenizer, synFilter);
}
};
}
/** Appends FlattenGraphFilter too */
private Analyzer getFlattenAnalyzer(SynonymMap.Builder b, boolean ignoreCase) throws IOException {
final SynonymMap map = b.build();
return new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
// Make a local variable so testRandomHuge doesn't share it across threads!
SynonymGraphFilter synFilter = new SynonymGraphFilter(tokenizer, map, ignoreCase);
FlattenGraphFilter flattenFilter = new FlattenGraphFilter(synFilter);
TestSynonymGraphFilter.this.synFilter = synFilter;
TestSynonymGraphFilter.this.flattenFilter = flattenFilter;
return new TokenStreamComponents(tokenizer, flattenFilter);
}
};
}
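// Both factories above publish the filters they create into the test-instance
// fields synFilter / flattenFilter, so tests can inspect side state such as
// getCaptureCount() and getMaxLookaheadUsed() after analysis runs.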
private void add(SynonymMap.Builder b, String input, String output, boolean keepOrig) {
if (VERBOSE) {
System.out.println("  add input=" + input + " output=" + output + " keepOrig=" + keepOrig);
}
CharsRefBuilder inputCharsRef = new CharsRefBuilder();
SynonymMap.Builder.join(input.split(" +"), inputCharsRef);
CharsRefBuilder outputCharsRef = new CharsRefBuilder();
SynonymMap.Builder.join(output.split(" +"), outputCharsRef);
b.add(inputCharsRef.get(), outputCharsRef.get(), keepOrig);
}
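// The add helper above uses SynonymMap.Builder.join, which concatenates the words
// with SynonymMap.WORD_SEPARATOR (the \u0000 char); multi-token rules are keyed
// that way, so e.g. add(b, "a b", "x", true) registers the two-token input
// "a\u0000b" -> "x". This is also why randomNonEmptyString rejects \u0000.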
private char[] randomBinaryChars(int minLen, int maxLen, double bias, char base) {
int len = TestUtil.nextInt(random(), minLen, maxLen);
char[] chars = new char[len];
for(int i=0;i<len;i++) {
char ch;
if (random().nextDouble() < bias) {
ch = base;
} else {
ch = (char) (base+1);
}
chars[i] = ch;
}
return chars;
}
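// For example, randomBinaryChars(1, 5, 0.8, 'a') yields 1-5 chars drawn from
// {'a', 'b'} with 'a' chosen ~80% of the time; base 'x' gives {'x', 'y'}. Each
// char becomes one single-character token via toTokenString below.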
private static String toTokenString(char[] chars) {
StringBuilder b = new StringBuilder();
for(char c : chars) {
if (b.length() > 0) {
b.append(' ');
}
b.append(c);
}
return b.toString();
}
private static class OneSyn {
char[] in;
char[] out;
boolean keepOrig;
@Override
public String toString() {
return toTokenString(in) + " --> " + toTokenString(out) + " (keepOrig=" + keepOrig + ")";
}
}
public void testRandomSyns() throws Exception {
int synCount = atLeast(10);
double bias = random().nextDouble();
boolean dedup = random().nextBoolean();
boolean flatten = random().nextBoolean();
SynonymMap.Builder b = new SynonymMap.Builder(dedup);
List<OneSyn> syns = new ArrayList<>();
// Makes random syns from random a / b tokens, mapping to random x / y tokens
if (VERBOSE) {
System.out.println("TEST: make " + synCount + " syns");
System.out.println(" bias for a over b=" + bias);
System.out.println(" dedup=" + dedup);
System.out.println(" flatten=" + flatten);
}
int maxSynLength = 0;
for(int i=0;i<synCount;i++) {
OneSyn syn = new OneSyn();
syn.in = randomBinaryChars(1, 5, bias, 'a');
syn.out = randomBinaryChars(1, 5, 0.5, 'x');
syn.keepOrig = random().nextBoolean();
syns.add(syn);
maxSynLength = Math.max(maxSynLength, syn.in.length);
if (VERBOSE) {
System.out.println(" " + syn);
}
add(b, toTokenString(syn.in), toTokenString(syn.out), syn.keepOrig);
}
// Compute max allowed lookahead for flatten filter:
int maxFlattenLookahead = 0;
if (flatten) {
for(int i=0;i<synCount;i++) {
OneSyn syn1 = syns.get(i);
int count = syn1.out.length;
boolean keepOrig = syn1.keepOrig;
for(int j=0;j<synCount;j++) {
OneSyn syn2 = syns.get(j);
keepOrig |= syn2.keepOrig;
if (Arrays.equals(syn1.in, syn2.in)) { // char[] must be compared by content, not reference
count += syn2.out.length;
}
}
if (keepOrig) {
count += syn1.in.length;
}
maxFlattenLookahead = Math.max(maxFlattenLookahead, count);
}
}
// Only used w/ VERBOSE:
Analyzer aNoFlattened;
if (VERBOSE) {
aNoFlattened = getAnalyzer(b, true);
} else {
aNoFlattened = null;
}
Analyzer a;
if (flatten) {
a = getFlattenAnalyzer(b, true);
} else {
a = getAnalyzer(b, true);
}
int iters = atLeast(1);
for(int iter=0;iter<iters;iter++) {
String doc = toTokenString(randomBinaryChars(50, 100, bias, 'a'));
//String doc = toTokenString(randomBinaryChars(10, 50, bias, 'a'));
if (VERBOSE) {
System.out.println("TEST: iter="+ iter + " doc=" + doc);
}
Automaton expected = slowSynFilter(doc, syns, flatten);
if (VERBOSE) {
System.out.println(" expected:\n" + expected.toDot());
if (flatten) {
Automaton unflattened = toAutomaton(aNoFlattened.tokenStream("field", new StringReader(doc)));
System.out.println(" actual unflattened:\n" + unflattened.toDot());
}
}
Automaton actual = toAutomaton(a.tokenStream("field", new StringReader(doc)));
if (VERBOSE) {
System.out.println(" actual:\n" + actual.toDot());
}
assertTrue("maxLookaheadUsed=" + synFilter.getMaxLookaheadUsed() + " maxSynLength=" + maxSynLength,
synFilter.getMaxLookaheadUsed() <= maxSynLength);
if (flatten) {
assertTrue("flatten maxLookaheadUsed=" + flattenFilter.getMaxLookaheadUsed() + " maxFlattenLookahead=" + maxFlattenLookahead,
flattenFilter.getMaxLookaheadUsed() <= maxFlattenLookahead);
}
checkAnalysisConsistency(random(), a, random().nextBoolean(), doc);
// We can easily have a non-deterministic automaton at this point, e.g. if
// more than one syn matched at given point, or if the syn mapped to an
// output token that also happens to be in the input:
try {
actual = Operations.determinize(actual, 50000);
} catch (TooComplexToDeterminizeException tctde) {
// Unfortunately the syns can easily create difficult-to-determinize graphs:
assertTrue(approxEquals(actual, expected));
continue;
}
try {
expected = Operations.determinize(expected, 50000);
} catch (TooComplexToDeterminizeException tctde) {
// Unfortunately the syns can easily create difficult-to-determinize graphs:
assertTrue(approxEquals(actual, expected));
continue;
}
assertTrue(approxEquals(actual, expected));
assertTrue(Operations.sameLanguage(actual, expected));
}
// aNoFlattened is only non-null when VERBOSE; IOUtils.close ignores null arguments:
IOUtils.close(a, aNoFlattened);
}
/** Only used when true equality is too costly to check! */
private boolean approxEquals(Automaton actual, Automaton expected) {
// Don't collapse these into one line else the thread stack won't say which direction failed!:
boolean b1 = approxSubsetOf(actual, expected);
boolean b2 = approxSubsetOf(expected, actual);
return b1 && b2;
}
private boolean approxSubsetOf(Automaton a1, Automaton a2) {
AutomatonTestUtil.RandomAcceptedStrings ras = new AutomatonTestUtil.RandomAcceptedStrings(a1);
for(int i=0;i<2000;i++) {
int[] ints = ras.getRandomAcceptedString(random());
IntsRef path = new IntsRef(ints, 0, ints.length);
if (accepts(a2, path) == false) {
throw new RuntimeException("a2 does not accept " + path);
}
}
// Presumed true
return true;
}
/** Like {@link Operations#run} except the incoming automaton is allowed to be non-deterministic. */
private static boolean accepts(Automaton a, IntsRef path) {
Set<Integer> states = new HashSet<>();
states.add(0);
Transition t = new Transition();
for(int i=0;i<path.length;i++) {
int digit = path.ints[path.offset+i];
Set<Integer> nextStates = new HashSet<>();
for(int state : states) {
int count = a.initTransition(state, t);
for(int j=0;j<count;j++) {
a.getNextTransition(t);
if (digit >= t.min && digit <= t.max) {
nextStates.add(t.dest);
}
}
}
states = nextStates;
if (states.isEmpty()) {
return false;
}
}
for(int state : states) {
if (a.isAccept(state)) {
return true;
}
}
return false;
}
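// accepts above is a plain NFA simulation: keep the set of reachable states and
// advance it one input label at a time. E.g. for an automaton with the single
// transition 0 --'a'--> 1 and state 1 accepting, accepts(a, [97]) walks
// {0} -> {1} and returns true.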
/** Stupid, slow brute-force, yet hopefully bug-free, synonym filter. */
private Automaton slowSynFilter(String doc, List<OneSyn> syns, boolean flatten) {
String[] tokens = doc.split(" +");
if (VERBOSE) {
System.out.println(" doc has " + tokens.length + " tokens");
}
int i=0;
Automaton.Builder a = new Automaton.Builder();
int lastState = a.createState();
while (i<tokens.length) {
// Consider all possible syn matches starting at this point:
assert tokens[i].length() == 1;
if (VERBOSE) {
System.out.println(" i=" + i);
}
List<OneSyn> matches = new ArrayList<>();
for(OneSyn syn : syns) {
if (i + syn.in.length <= tokens.length) {
boolean match = true;
for(int j=0;j<syn.in.length;j++) {
if (tokens[i+j].charAt(0) != syn.in[j]) {
match = false;
break;
}
}
if (match) {
if (matches.isEmpty() == false) {
if (syn.in.length < matches.get(0).in.length) {
// Greedy matching: we already found longer syns matching here
continue;
} else if (syn.in.length > matches.get(0).in.length) {
// Greedy matching: all previous matches were shorter, so we drop them
matches.clear();
} else {
// Keep the current matches: we allow multiple synonyms matching the same input string
}
}
matches.add(syn);
}
}
}
int nextState = a.createState();
if (matches.isEmpty() == false) {
// We have match(es) starting at this token
if (VERBOSE) {
System.out.println(" matches @ i=" + i + ": " + matches);
}
// We keepOrig if any of the matches said to:
boolean keepOrig = false;
for(OneSyn syn : matches) {
keepOrig |= syn.keepOrig;
}
List<Integer> flatStates;
if (flatten) {
flatStates = new ArrayList<>();
} else {
flatStates = null;
}
if (keepOrig) {
// Add path for the original tokens
addSidePath(a, lastState, nextState, matches.get(0).in, flatStates);
}
for(OneSyn syn : matches) {
addSidePath(a, lastState, nextState, syn.out, flatStates);
}
i += matches.get(0).in.length;
} else {
a.addTransition(lastState, nextState, tokens[i].charAt(0));
i++;
}
lastState = nextState;
}
a.setAccept(lastState, true);
return topoSort(a.finish());
}
/** Just creates a side path from startState to endState with the provided tokens. */
private static void addSidePath(Automaton.Builder a, int startState, int endState, char[] tokens, List<Integer> flatStates) {
int lastState = startState;
for(int i=0;i<tokens.length;i++) {
int nextState;
if (i == tokens.length-1) {
nextState = endState;
} else if (flatStates == null || i >= flatStates.size()) {
nextState = a.createState();
if (flatStates != null) {
assert i == flatStates.size();
flatStates.add(nextState);
}
} else {
nextState = flatStates.get(i);
}
a.addTransition(lastState, nextState, tokens[i]);
lastState = nextState;
}
}
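// In addSidePath above, a non-null flatStates makes parallel side paths reuse the
// same intermediate states (position slots), mimicking what FlattenGraphFilter
// does: stacked outputs share positions instead of forking fresh states per path.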
private Automaton toAutomaton(TokenStream ts) throws IOException {
PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
PositionLengthAttribute posLenAtt = ts.addAttribute(PositionLengthAttribute.class);
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
ts.reset();
Automaton a = new Automaton();
int srcNode = -1;
int destNode = -1;
int state = a.createState();
while (ts.incrementToken()) {
assert termAtt.length() == 1;
char c = termAtt.charAt(0);
int posInc = posIncAtt.getPositionIncrement();
if (posInc != 0) {
srcNode += posInc;
while (state < srcNode) {
state = a.createState();
}
}
destNode = srcNode + posLenAtt.getPositionLength();
while (state < destNode) {
state = a.createState();
}
a.addTransition(srcNode, destNode, c);
}
ts.end();
ts.close();
a.finishState();
a.setAccept(destNode, true);
return a;
}
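// The position-to-state mapping in toAutomaton above: each token position becomes
// one automaton state, posInc advances the source state and posLength selects the
// destination, so e.g. "wtf" with posLength=3 becomes a single transition spanning
// three positions.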
/*
private String toDot(TokenStream ts) throws IOException {
PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
PositionLengthAttribute posLenAtt = ts.addAttribute(PositionLengthAttribute.class);
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
ts.reset();
int srcNode = -1;
int destNode = -1;
StringBuilder b = new StringBuilder();
b.append("digraph Automaton {\n");
b.append(" rankdir = LR\n");
b.append(" node [width=0.2, height=0.2, fontsize=8]\n");
b.append(" initial [shape=plaintext,label=\"\"]\n");
b.append(" initial -> 0\n");
while (ts.incrementToken()) {
int posInc = posIncAtt.getPositionIncrement();
if (posInc != 0) {
srcNode += posInc;
b.append(" ");
b.append(srcNode);
b.append(" [shape=circle,label=\"" + srcNode + "\"]\n");
}
destNode = srcNode + posLenAtt.getPositionLength();
b.append(" ");
b.append(srcNode);
b.append(" -> ");
b.append(destNode);
b.append(" [label=\"");
b.append(termAtt);
b.append("\"");
if (typeAtt.type().equals("word") == false) {
b.append(" color=red");
}
b.append("]\n");
}
ts.end();
ts.close();
b.append('}');
return b.toString();
}
*/
/** Renumbers nodes according to their topo sort */
private Automaton topoSort(Automaton in) {
int[] newToOld = Operations.topoSortStates(in);
int[] oldToNew = new int[newToOld.length];
Automaton.Builder a = new Automaton.Builder();
//System.out.println("remap:");
for(int i=0;i<newToOld.length;i++) {
a.createState();
oldToNew[newToOld[i]] = i;
//System.out.println(" " + newToOld[i] + " -> " + i);
if (in.isAccept(newToOld[i])) {
a.setAccept(i, true);
//System.out.println(" **");
}
}
Transition t = new Transition();
for(int i=0;i<newToOld.length;i++) {
int count = in.initTransition(newToOld[i], t);
for(int j=0;j<count;j++) {
in.getNextTransition(t);
a.addTransition(i, oldToNew[t.dest], t.min, t.max);
}
}
return a.finish();
}
/**
* Verify token types and positionLengths on synonyms of different word counts.
*/
public void testPositionLengthAndType() throws Exception {
String testFile =
"spider man, spiderman\n" +
"usa,united states,u s a,united states of america";
Analyzer analyzer = new MockAnalyzer(random());
SolrSynonymParser parser = new SolrSynonymParser(true, true, analyzer);
parser.parse(new StringReader(testFile));
analyzer.close();
SynonymMap map = parser.build();
analyzer = getFlattenAnalyzer(parser, true);
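// Decode the FST output for "usa" by hand; in SynonymMap's encoding, the first
// vInt packs the synonym count in its upper bits (count = code >>> 1; the low bit
// flags keepOrig), followed by one vInt ordinal per synonym into map.words.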
BytesRef value = Util.get(map.fst, Util.toUTF32(new CharsRef("usa"), new IntsRefBuilder()));
ByteArrayDataInput bytesReader = new ByteArrayDataInput(value.bytes, value.offset, value.length);
final int code = bytesReader.readVInt();
final int count = code >>> 1;
final int[] synonymsIdxs = new int[count];
for (int i = 0; i < count; i++) {
synonymsIdxs[i] = bytesReader.readVInt();
}
BytesRef scratchBytes = new BytesRef();
map.words.get(synonymsIdxs[2], scratchBytes);
int synonymLength = 1;
for (int i = scratchBytes.offset; i < scratchBytes.offset + scratchBytes.length; i++) {
if (scratchBytes.bytes[i] == SynonymMap.WORD_SEPARATOR) {
synonymLength++;
}
}
assertEquals(3, count);
assertEquals(4, synonymLength);
assertAnalyzesTo(analyzer, "spider man",
new String[]{"spiderman", "spider", "man"},
new int[]{0, 0, 7},
new int[]{10, 6, 10},
new String[]{"SYNONYM", "word", "word"},
new int[]{1, 0, 1},
new int[]{2, 1, 1});
assertAnalyzesToPositions(analyzer, "amazing spider man",
new String[]{"amazing", "spiderman", "spider", "man"},
new String[]{"word", "SYNONYM", "word", "word"},
new int[]{1, 1, 0, 1},
new int[]{1, 2, 1, 1});
// System.out.println(toDot(getAnalyzer(parser, true).tokenStream("field", new StringReader("the usa is wealthy"))));
assertAnalyzesTo(analyzer, "the united states of america is wealthy",
new String[]{"the", "usa", "united", "u", "united", "states", "s", "states", "a", "of", "america", "is", "wealthy"},
new int[] {0, 4, 4, 4, 4, 11, 11, 11, 18, 18, 21, 29, 32},
new int[] {3, 28, 10, 10, 10, 28, 17, 17, 28, 20, 28, 31, 39},
new String[]{"word", "SYNONYM", "SYNONYM", "SYNONYM", "word", "SYNONYM", "SYNONYM", "word", "SYNONYM", "word", "word", "word", "word"},
new int[] {1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1},
new int[] {1, 4, 1, 1, 1, 3, 1, 1, 2, 1, 1, 1, 1});
assertAnalyzesToPositions(analyzer, "spiderman",
new String[]{"spider", "spiderman", "man"},
new String[]{"SYNONYM", "word", "SYNONYM"},
new int[]{1, 0, 1},
new int[]{1, 2, 1});
assertAnalyzesTo(analyzer, "spiderman enemies",
new String[]{"spider", "spiderman", "man", "enemies"},
new int[]{0, 0, 0, 10},
new int[]{9, 9, 9, 17},
new String[]{"SYNONYM", "word", "SYNONYM", "word"},
new int[]{1, 0, 1, 1},
new int[]{1, 2, 1, 1});
assertAnalyzesTo(analyzer, "the usa is wealthy",
new String[]{"the", "united", "u", "united", "usa", "states", "s", "states", "a", "of", "america", "is", "wealthy"},
new int[] {0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 8, 11},
new int[] {3, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 10, 18},
new String[]{"word", "SYNONYM", "SYNONYM", "SYNONYM", "word", "SYNONYM", "SYNONYM", "SYNONYM", "SYNONYM", "SYNONYM", "SYNONYM", "word", "word"},
new int[] {1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1},
new int[] {1, 1, 1, 1, 4, 3, 1, 1, 2, 1, 1, 1, 1});
assertGraphStrings(analyzer, "the usa is wealthy", new String[] {
"the usa is wealthy",
"the united states is wealthy",
"the u s a is wealthy",
"the united states of america is wealthy",
// Wrong. Here only due to "sausagization" of the multi-word synonyms.
"the u states is wealthy",
"the u states a is wealthy",
"the u s of america is wealthy",
"the u states of america is wealthy",
"the united s a is wealthy",
"the united states a is wealthy",
"the united s of america is wealthy"});
assertAnalyzesTo(analyzer, "the united states is wealthy",
new String[]{"the", "usa", "u", "united", "united", "s", "states", "states", "a", "of", "america", "is", "wealthy"},
new int[] {0, 4, 4, 4, 4, 11, 11, 11, 11, 11, 11, 18, 21},
new int[] {3, 17, 10, 10, 10, 17, 17, 17, 17, 17, 17, 20, 28},
new String[]{"word", "SYNONYM", "SYNONYM", "SYNONYM", "word", "SYNONYM", "SYNONYM", "word", "SYNONYM", "SYNONYM", "SYNONYM", "word", "word"},
new int[] {1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1},
new int[] {1, 4, 1, 1, 1, 1, 1, 3, 2, 1, 1, 1, 1},
false);
assertAnalyzesTo(analyzer, "the united states of balance",
new String[]{"the", "usa", "u", "united", "united", "s", "states", "states", "a", "of", "america", "of", "balance"},
new int[] {0, 4, 4, 4, 4, 11, 11, 11, 11, 11, 11, 18, 21},
new int[] {3, 17, 10, 10, 10, 17, 17, 17, 17, 17, 17, 20, 28},
new String[]{"word", "SYNONYM", "SYNONYM", "SYNONYM", "word", "SYNONYM", "SYNONYM", "word", "SYNONYM", "SYNONYM", "SYNONYM", "word", "word"},
new int[] {1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1},
new int[] {1, 4, 1, 1, 1, 1, 1, 3, 2, 1, 1, 1, 1});
analyzer.close();
}
public void testMultiwordOffsets() throws Exception {
SynonymMap.Builder b = new SynonymMap.Builder(true);
final boolean keepOrig = true;
add(b, "national hockey league", "nhl", keepOrig);
Analyzer a = getFlattenAnalyzer(b, true);
assertAnalyzesTo(a, "national hockey league",
new String[]{"nhl", "national", "hockey", "league"},
new int[]{0, 0, 9, 16},
new int[]{22, 8, 15, 22},
new int[]{1, 0, 1, 1});
a.close();
}
public void testIncludeOrig() throws Exception {
SynonymMap.Builder b = new SynonymMap.Builder(true);
final boolean keepOrig = true;
add(b, "a b", "ab", keepOrig);
add(b, "a c", "ac", keepOrig);
add(b, "a", "aa", keepOrig);
add(b, "b", "bb", keepOrig);
add(b, "z x c v", "zxcv", keepOrig);
add(b, "x c", "xc", keepOrig);
Analyzer a = getFlattenAnalyzer(b, true);
assertAnalyzesTo(a, "$",
new String[]{"$"},
new int[]{1});
assertAnalyzesTo(a, "a",
new String[]{"aa", "a"},
new int[]{1, 0});
assertAnalyzesTo(a, "a",
new String[]{"aa", "a"},
new int[]{1, 0});
assertAnalyzesTo(a, "$ a",
new String[]{"$", "aa", "a"},
new int[]{1, 1, 0});
assertAnalyzesTo(a, "a $",
new String[]{"aa", "a", "$"},
new int[]{1, 0, 1});
assertAnalyzesTo(a, "$ a !",
new String[]{"$", "aa", "a", "!"},
new int[]{1, 1, 0, 1});
assertAnalyzesTo(a, "a a",
new String[]{"aa", "a", "aa", "a"},
new int[]{1, 0, 1, 0});
assertAnalyzesTo(a, "b",
new String[]{"bb", "b"},
new int[]{1, 0});
assertAnalyzesTo(a, "z x c v",
new String[]{"zxcv", "z", "x", "c", "v"},
new int[]{1, 0, 1, 1, 1});
assertAnalyzesTo(a, "z x c $",
new String[]{"z", "xc", "x", "c", "$"},
new int[]{1, 1, 0, 1, 1});
a.close();
}
public void testUpperCase() throws IOException {
assertMapping("word", "synonym");
assertMapping("word".toUpperCase(Locale.ROOT), "synonym");
}
private void assertMapping(String inputString, String outputString) throws IOException {
SynonymMap.Builder builder = new SynonymMap.Builder(false);
// the rules must be lowercased up front, but the incoming tokens will be case insensitive:
CharsRef input = SynonymMap.Builder.join(inputString.toLowerCase(Locale.ROOT).split(" "), new CharsRefBuilder());
CharsRef output = SynonymMap.Builder.join(outputString.split(" "), new CharsRefBuilder());
builder.add(input, output, true);
Analyzer analyzer = new CustomAnalyzer(builder.build());
TokenStream tokenStream = analyzer.tokenStream("field", inputString);
assertTokenStreamContents(tokenStream, new String[]{
outputString, inputString
});
analyzer.close();
}
static class CustomAnalyzer extends Analyzer {
private final SynonymMap synonymMap;
CustomAnalyzer(SynonymMap synonymMap) {
this.synonymMap = synonymMap;
}
@Override
protected TokenStreamComponents createComponents(String s) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
TokenStream tokenStream = new SynonymGraphFilter(tokenizer, synonymMap, true); // ignoreCase=true
return new TokenStreamComponents(tokenizer, tokenStream);
}
}
}