| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.lucene.analysis.synonym; |
| |
| import java.io.IOException; |
| import java.io.StringReader; |
| import java.text.ParseException; |
| import java.util.ArrayList; |
| import java.util.HashSet; |
| import java.util.List; |
| import java.util.Locale; |
| import java.util.Set; |
| |
| import org.apache.lucene.analysis.Analyzer; |
| import org.apache.lucene.analysis.BaseTokenStreamTestCase; |
| import org.apache.lucene.analysis.MockAnalyzer; |
| import org.apache.lucene.analysis.MockGraphTokenFilter; |
| import org.apache.lucene.analysis.MockTokenizer; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.Tokenizer; |
| import org.apache.lucene.analysis.core.FlattenGraphFilter; |
| import org.apache.lucene.analysis.tokenattributes.*; |
| import org.apache.lucene.document.Document; |
| import org.apache.lucene.document.Field; |
| import org.apache.lucene.index.IndexReader; |
| import org.apache.lucene.index.RandomIndexWriter; |
| import org.apache.lucene.search.IndexSearcher; |
| import org.apache.lucene.search.PhraseQuery; |
| import org.apache.lucene.store.ByteArrayDataInput; |
| import org.apache.lucene.store.Directory; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.CharsRef; |
| import org.apache.lucene.util.CharsRefBuilder; |
| import org.apache.lucene.util.IOUtils; |
| import org.apache.lucene.util.IntsRef; |
| import org.apache.lucene.util.IntsRefBuilder; |
| import org.apache.lucene.util.TestUtil; |
| import org.apache.lucene.util.automaton.Automaton; |
| import org.apache.lucene.util.automaton.AutomatonTestUtil; |
| import org.apache.lucene.util.automaton.Operations; |
| import org.apache.lucene.util.automaton.TooComplexToDeterminizeException; |
| import org.apache.lucene.util.automaton.Transition; |
| import org.apache.lucene.util.fst.Util; |
| |
| public class TestSynonymGraphFilter extends BaseTokenStreamTestCase { |
| |
| /** Set as a side effect by {@link #getAnalyzer} and {@link #getFlattenAnalyzer}. */ |
| private SynonymGraphFilter synFilter; |
| private FlattenGraphFilter flattenFilter; |
| |
| public void testBasicKeepOrigOneOutput() throws Exception { |
| SynonymMap.Builder b = new SynonymMap.Builder(); |
| add(b, "a b", "x", true); |
| |
| Analyzer a = getAnalyzer(b, true); |
| assertAnalyzesTo(a, |
| "c a b", |
| new String[] {"c", "x", "a", "b"}, |
| new int[] { 0, 2, 2, 4}, |
| new int[] { 1, 5, 3, 5}, |
| new String[] {"word", "SYNONYM", "word", "word"}, |
| new int[] { 1, 1, 0, 1}, |
| new int[] { 1, 2, 1, 1}); |
| a.close(); |
| } |
| |
| public void testMixedKeepOrig() throws Exception { |
| SynonymMap.Builder b = new SynonymMap.Builder(); |
| add(b, "a b", "x", true); |
| add(b, "e f", "y", false); |
| |
| Analyzer a = getAnalyzer(b, true); |
| assertAnalyzesTo(a, |
| "c a b c e f g", |
| new String[] {"c", "x", "a", "b", "c", "y", "g"}, |
| new int[] { 0, 2, 2, 4, 6, 8, 12}, |
| new int[] { 1, 5, 3, 5, 7, 11, 13}, |
| new String[] {"word", "SYNONYM", "word", "word", "word", "SYNONYM", "word"}, |
| new int[] { 1, 1, 0, 1, 1, 1, 1}, |
| new int[] { 1, 2, 1, 1, 1, 1, 1}); |
| a.close(); |
| } |
| |
| public void testNoParseAfterBuffer() throws Exception { |
| SynonymMap.Builder b = new SynonymMap.Builder(); |
| add(b, "b a", "x", true); |
| |
| Analyzer a = getAnalyzer(b, true); |
| assertAnalyzesTo(a, |
| "b b b", |
| new String[] {"b", "b", "b"}, |
| new int[] { 0, 2, 4}, |
| new int[] { 1, 3, 5}, |
| new String[] {"word", "word", "word"}, |
| new int[] { 1, 1, 1}, |
| new int[] { 1, 1, 1}); |
| a.close(); |
| } |
| |
| public void testOneInputMultipleOutputKeepOrig() throws Exception { |
| SynonymMap.Builder b = new SynonymMap.Builder(); |
| add(b, "a b", "x", true); |
| add(b, "a b", "y", true); |
| |
| Analyzer a = getAnalyzer(b, true); |
| assertAnalyzesTo(a, |
| "c a b c", |
| new String[] {"c", "x", "y", "a", "b", "c"}, |
| new int[] { 0, 2, 2, 2, 4, 6}, |
| new int[] { 1, 5, 5, 3, 5, 7}, |
| new String[] {"word", "SYNONYM", "SYNONYM", "word", "word", "word"}, |
new int[] { 1, 1, 0, 0, 1, 1},
new int[] { 1, 2, 2, 1, 1, 1});
| a.close(); |
| } |
| |
| /** |
* Verifies token type and positionLength produced by the analyzer.
| */ |
| public void testPositionLengthAndTypeSimple() throws Exception { |
| String testFile = |
| "spider man, spiderman"; |
| |
| Analyzer analyzer = solrSynsToAnalyzer(testFile); |
| |
| assertAnalyzesToPositions(analyzer, "spider man", |
| new String[]{"spiderman", "spider", "man"}, |
| new String[]{"SYNONYM", "word", "word"}, |
| new int[]{1, 0, 1}, |
| new int[]{2, 1, 1}); |
| } |
| |
| /** |
* Parses a synonyms file with some escaped syntax characters.
| */ |
| public void testEscapedStuff() throws Exception { |
| String testFile = |
| "a\\=>a => b\\=>b\n" + |
| "a\\,a => b\\,b"; |
| Analyzer analyzer = solrSynsToAnalyzer(testFile); |
| |
| assertAnalyzesTo(analyzer, "ball", |
| new String[]{"ball"}, |
| new int[]{1}); |
| |
| assertAnalyzesTo(analyzer, "a=>a", |
| new String[]{"b=>b"}, |
| new int[]{1}); |
| |
| assertAnalyzesTo(analyzer, "a,a", |
| new String[]{"b,b"}, |
| new int[]{1}); |
| analyzer.close(); |
| } |
| |
| /** |
* Parses a synonyms file with bad syntax.
| */ |
| public void testInvalidAnalyzesToNothingOutput() throws Exception { |
| String testFile = "a => 1"; |
| Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, false); |
| SolrSynonymParser parser = new SolrSynonymParser(true, true, analyzer); |
| expectThrows(ParseException.class, () -> parser.parse(new StringReader(testFile))); |
| analyzer.close(); |
| } |
| |
| /** |
* Parses a synonyms file with bad syntax.
| */ |
| public void testInvalidDoubleMap() throws Exception { |
| String testFile = "a => b => c"; |
| Analyzer analyzer = new MockAnalyzer(random()); |
| SolrSynonymParser parser = new SolrSynonymParser(true, true, analyzer); |
| expectThrows(ParseException.class, () -> parser.parse(new StringReader(testFile))); |
| analyzer.close(); |
| } |
| |
| /** |
* Tests some simple examples from the Solr wiki.
| */ |
| public void testSimple() throws Exception { |
| String testFile = |
| "i-pod, ipod, ipoooood\n" + |
| "foo => foo bar\n" + |
| "foo => baz\n" + |
| "this test, that testing"; |
| |
| Analyzer analyzer = solrSynsToAnalyzer(testFile); |
| |
| assertAnalyzesTo(analyzer, "ball", |
| new String[]{"ball"}, |
| new int[]{1}); |
| |
| assertAnalyzesTo(analyzer, "i-pod", |
| new String[]{"ipod", "ipoooood", "i-pod"}, |
| new int[]{1, 0, 0}); |
| |
| assertAnalyzesTo(analyzer, "foo", |
| new String[]{"foo", "baz", "bar"}, |
| new int[]{1, 0, 1}); |
| |
| assertAnalyzesTo(analyzer, "this test", |
| new String[]{"that", "this", "testing", "test"}, |
| new int[]{1, 0, 1, 0}); |
| analyzer.close(); |
| } |
| |
| public void testBufferLength() throws Exception { |
| String testFile = |
| "c => 8 2 5 6 7\n" + |
| "f c e d f, 1\n" + |
| "c g a f d, 6 5 5\n" + |
| "e c => 4\n" + |
| "g => 5\n" + |
| "a g b f e => 5 0 7 7\n" + |
| "b => 1"; |
| Analyzer analyzer = solrSynsToAnalyzer(testFile); |
| |
| String doc = "b c g a f b d"; |
| String[] expected = new String[]{"1", "8", "2", "5", "6", "7", "5", "a", "f", "1", "d"}; |
| assertAnalyzesTo(analyzer, doc, expected); |
| } |
| |
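/** Parses Solr-format synonym rules and returns an analyzer applying SynonymGraphFilter followed by FlattenGraphFilter. */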
| private Analyzer solrSynsToAnalyzer(String syns) throws IOException, ParseException { |
| Analyzer analyzer = new MockAnalyzer(random()); |
| SolrSynonymParser parser = new SolrSynonymParser(true, true, analyzer); |
| parser.parse(new StringReader(syns)); |
| analyzer.close(); |
| return getFlattenAnalyzer(parser, true); |
| } |
| |
| public void testMoreThanOneLookAhead() throws Exception { |
| SynonymMap.Builder b = new SynonymMap.Builder(); |
| add(b, "a b c d", "x", true); |
| |
| Analyzer a = getAnalyzer(b, true); |
| assertAnalyzesTo(a, |
| "a b c e", |
| new String[] {"a", "b", "c", "e"}, |
| new int[] { 0, 2, 4, 6}, |
| new int[] { 1, 3, 5, 7}, |
| new String[] {"word", "word", "word", "word"}, |
| new int[] { 1, 1, 1, 1}, |
| new int[] { 1, 1, 1, 1}); |
| a.close(); |
| } |
| |
| public void testLookaheadAfterParse() throws Exception { |
| SynonymMap.Builder b = new SynonymMap.Builder(); |
| add(b, "b b", "x", true); |
| add(b, "b", "y", true); |
| |
| Analyzer a = getAnalyzer(b, true); |
| |
| assertAnalyzesTo(a, "b a b b", |
| new String[] {"y", "b", "a", "x", "b", "b"}, |
| new int[] {0, 0, 2, 4, 4, 6}, |
| new int[] {1, 1, 3, 7, 5, 7}, |
| null, |
| new int[] {1, 0, 1, 1, 0, 1}, |
| new int[] {1, 1, 1, 2, 1, 1}, |
| true); |
| } |
| |
| public void testLookaheadSecondParse() throws Exception { |
| SynonymMap.Builder b = new SynonymMap.Builder(); |
| add(b, "b b b", "x", true); |
| add(b, "b", "y", true); |
| |
| Analyzer a = getAnalyzer(b, true); |
| |
| assertAnalyzesTo(a, "b b", |
| new String[] {"y", "b", "y", "b"}, |
| new int[] { 0, 0, 2, 2}, |
| new int[] { 1, 1, 3, 3}, |
| null, |
| new int[] { 1, 0, 1, 0}, |
| new int[] { 1, 1, 1, 1}, |
| true); |
| } |
| |
| public void testOneInputMultipleOutputNoKeepOrig() throws Exception { |
| SynonymMap.Builder b = new SynonymMap.Builder(); |
| add(b, "a b", "x", false); |
| add(b, "a b", "y", false); |
| |
| Analyzer a = getAnalyzer(b, true); |
| assertAnalyzesTo(a, |
| "c a b c", |
| new String[] {"c", "x", "y", "c"}, |
| new int[] { 0, 2, 2, 6}, |
| new int[] { 1, 5, 5, 7}, |
| new String[] {"word", "SYNONYM", "SYNONYM", "word"}, |
| new int[] { 1, 1, 0, 1}, |
| new int[] { 1, 1, 1, 1}); |
| a.close(); |
| } |
| |
| public void testOneInputMultipleOutputMixedKeepOrig() throws Exception { |
| SynonymMap.Builder b = new SynonymMap.Builder(); |
| add(b, "a b", "x", true); |
| add(b, "a b", "y", false); |
| |
| Analyzer a = getAnalyzer(b, true); |
| assertAnalyzesTo(a, |
| "c a b c", |
| new String[] {"c", "x", "y", "a", "b", "c"}, |
| new int[] { 0, 2, 2, 2, 4, 6}, |
| new int[] { 1, 5, 5, 3, 5, 7}, |
| new String[] {"word", "SYNONYM", "SYNONYM", "word", "word", "word"}, |
new int[] { 1, 1, 0, 0, 1, 1},
new int[] { 1, 2, 2, 1, 1, 1});
| a.close(); |
| } |
| |
| public void testSynAtEnd() throws Exception { |
| SynonymMap.Builder b = new SynonymMap.Builder(); |
| add(b, "a b", "x", true); |
| |
| Analyzer a = getAnalyzer(b, true); |
| assertAnalyzesTo(a, |
| "c d e a b", |
| new String[] {"c", "d", "e", "x", "a", "b"}, |
| new int[] { 0, 2, 4, 6, 6, 8}, |
| new int[] { 1, 3, 5, 9, 7, 9}, |
| new String[] {"word", "word", "word", "SYNONYM", "word", "word"}, |
| new int[] { 1, 1, 1, 1, 0, 1}, |
| new int[] { 1, 1, 1, 2, 1, 1}); |
| a.close(); |
| } |
| |
| public void testTwoSynsInARow() throws Exception { |
| SynonymMap.Builder b = new SynonymMap.Builder(); |
| add(b, "a", "x", false); |
| |
| Analyzer a = getAnalyzer(b, true); |
| assertAnalyzesTo(a, |
| "c a a b", |
| new String[] {"c", "x", "x", "b"}, |
| new int[] { 0, 2, 4, 6}, |
| new int[] { 1, 3, 5, 7}, |
| new String[] {"word", "SYNONYM", "SYNONYM", "word"}, |
| new int[] { 1, 1, 1, 1}, |
| new int[] { 1, 1, 1, 1}); |
| a.close(); |
| } |
| |
| public void testBasicKeepOrigTwoOutputs() throws Exception { |
| SynonymMap.Builder b = new SynonymMap.Builder(); |
| add(b, "a b", "x y", true); |
| add(b, "a b", "m n o", true); |
| |
| Analyzer a = getAnalyzer(b, true); |
| assertAnalyzesTo(a, |
| "c a b d", |
| new String[] {"c", "x", "m", "a", "y", "n", "o", "b", "d"}, |
| new int[] { 0, 2, 2, 2, 2, 2, 2, 4, 6}, |
| new int[] { 1, 5, 5, 3, 5, 5, 5, 5, 7}, |
| new String[] {"word", "SYNONYM", "SYNONYM", "word", "SYNONYM", "SYNONYM", "SYNONYM", "word", "word"}, |
| new int[] { 1, 1, 0, 0, 1, 1, 1, 1, 1}, |
| new int[] { 1, 1, 2, 4, 4, 1, 2, 1, 1}); |
| a.close(); |
| } |
| |
| public void testNoCaptureIfNoMatch() throws Exception { |
| SynonymMap.Builder b = new SynonymMap.Builder(); |
| add(b, "a b", "x y", true); |
| |
| Analyzer a = getAnalyzer(b, true); |
| |
| assertAnalyzesTo(a, |
| "c d d", |
| new String[] {"c", "d", "d"}, |
| new int[] { 0, 2, 4}, |
| new int[] { 1, 3, 5}, |
| new String[] {"word", "word", "word"}, |
| new int[] { 1, 1, 1}, |
| new int[] { 1, 1, 1}); |
| assertEquals(0, synFilter.getCaptureCount()); |
| a.close(); |
| } |
| |
| public void testBasicNotKeepOrigOneOutput() throws Exception { |
| SynonymMap.Builder b = new SynonymMap.Builder(); |
| add(b, "a b", "x", false); |
| |
| Analyzer a = getAnalyzer(b, true); |
| assertAnalyzesTo(a, |
| "c a b", |
| new String[] {"c", "x"}, |
| new int[] {0, 2}, |
| new int[] {1, 5}, |
| new String[] {"word", "SYNONYM"}, |
| new int[] {1, 1}, |
| new int[] {1, 1}); |
| a.close(); |
| } |
| |
| public void testBasicNoKeepOrigTwoOutputs() throws Exception { |
| SynonymMap.Builder b = new SynonymMap.Builder(); |
| add(b, "a b", "x y", false); |
| add(b, "a b", "m n o", false); |
| |
| Analyzer a = getAnalyzer(b, true); |
| assertAnalyzesTo(a, |
| "c a b d", |
| new String[] {"c", "x", "m", "y", "n", "o", "d"}, |
| new int[] { 0, 2, 2, 2, 2, 2, 6}, |
| new int[] { 1, 5, 5, 5, 5, 5, 7}, |
| new String[] {"word", "SYNONYM", "SYNONYM", "SYNONYM", "SYNONYM", "SYNONYM", "word"}, |
| new int[] { 1, 1, 0, 1, 1, 1, 1}, |
| new int[] { 1, 1, 2, 3, 1, 1, 1}); |
| a.close(); |
| } |
| |
| public void testIgnoreCase() throws Exception { |
| SynonymMap.Builder b = new SynonymMap.Builder(); |
| add(b, "a b", "x y", false); |
| add(b, "a b", "m n o", false); |
| |
| Analyzer a = getAnalyzer(b, true); |
| assertAnalyzesTo(a, |
| "c A B D", |
| new String[] {"c", "x", "m", "y", "n", "o", "D"}, |
| new int[] { 0, 2, 2, 2, 2, 2, 6}, |
| new int[] { 1, 5, 5, 5, 5, 5, 7}, |
| new String[] {"word", "SYNONYM", "SYNONYM", "SYNONYM", "SYNONYM", "SYNONYM", "word"}, |
| new int[] { 1, 1, 0, 1, 1, 1, 1}, |
| new int[] { 1, 1, 2, 3, 1, 1, 1}); |
| a.close(); |
| } |
| |
| public void testDoNotIgnoreCase() throws Exception { |
| SynonymMap.Builder b = new SynonymMap.Builder(); |
| add(b, "a b", "x y", false); |
| add(b, "a b", "m n o", false); |
| |
| Analyzer a = getAnalyzer(b, false); |
| assertAnalyzesTo(a, |
| "c A B D", |
| new String[] {"c", "A", "B", "D"}, |
| new int[] { 0, 2, 4, 6}, |
| new int[] { 1, 3, 5, 7}, |
| new String[] {"word", "word", "word", "word"}, |
| new int[] { 1, 1, 1, 1}, |
| new int[] { 1, 1, 1, 1}); |
| a.close(); |
| } |
| |
| public void testBufferedFinish1() throws Exception { |
| SynonymMap.Builder b = new SynonymMap.Builder(); |
| add(b, "a b c", "m n o", false); |
| |
| Analyzer a = getAnalyzer(b, true); |
| assertAnalyzesTo(a, |
| "c a b", |
| new String[] {"c", "a", "b"}, |
| new int[] { 0, 2, 4}, |
| new int[] { 1, 3, 5}, |
| new String[] {"word", "word", "word"}, |
| new int[] { 1, 1, 1}, |
| new int[] { 1, 1, 1}); |
| a.close(); |
| } |
| |
| public void testBufferedFinish2() throws Exception { |
| SynonymMap.Builder b = new SynonymMap.Builder(); |
| add(b, "a b", "m n o", false); |
| add(b, "d e", "m n o", false); |
| |
| Analyzer a = getAnalyzer(b, true); |
| assertAnalyzesTo(a, |
| "c a d", |
| new String[] {"c", "a", "d"}, |
| new int[] { 0, 2, 4}, |
| new int[] { 1, 3, 5}, |
| new String[] {"word", "word", "word"}, |
| new int[] { 1, 1, 1}, |
| new int[] { 1, 1, 1}); |
| a.close(); |
| } |
| |
| public void testCanReuse() throws Exception { |
| SynonymMap.Builder b = new SynonymMap.Builder(); |
| add(b, "a b", "x", true); |
| Analyzer a = getAnalyzer(b, true); |
| for(int i=0;i<10;i++) { |
| assertAnalyzesTo(a, |
| "c a b", |
| new String[] {"c", "x", "a", "b"}, |
| new int[] { 0, 2, 2, 4}, |
| new int[] { 1, 5, 3, 5}, |
| new String[] {"word", "SYNONYM", "word", "word"}, |
| new int[] { 1, 1, 0, 1}, |
| new int[] { 1, 2, 1, 1}); |
| } |
| a.close(); |
| } |
| |
| /** Multiple input tokens map to a single output token */ |
| public void testManyToOne() throws Exception { |
| SynonymMap.Builder b = new SynonymMap.Builder(); |
| add(b, "a b c", "z", true); |
| |
| Analyzer a = getAnalyzer(b, true); |
| assertAnalyzesTo(a, |
| "a b c d", |
| new String[] {"z", "a", "b", "c", "d"}, |
| new int[] { 0, 0, 2, 4, 6}, |
| new int[] { 5, 1, 3, 5, 7}, |
| new String[] {"SYNONYM", "word", "word", "word", "word"}, |
| new int[] { 1, 0, 1, 1, 1}, |
| new int[] { 3, 1, 1, 1, 1}); |
| a.close(); |
| } |
| |
| public void testBufferAfterMatch() throws Exception { |
| SynonymMap.Builder b = new SynonymMap.Builder(); |
| add(b, "a b c d", "x", true); |
| add(b, "a b", "y", false); |
| |
// The 'c' token has to be buffered because SynonymGraphFilter
// needs to know whether "a b c d -> x" matches:
| Analyzer a = getAnalyzer(b, true); |
| assertAnalyzesTo(a, |
| "f a b c e", |
| new String[] {"f", "y", "c", "e"}, |
| new int[] { 0, 2, 6, 8}, |
| new int[] { 1, 5, 7, 9}, |
| new String[] {"word", "SYNONYM", "word", "word"}, |
| new int[] { 1, 1, 1, 1}, |
| new int[] { 1, 1, 1, 1}); |
| a.close(); |
| } |
| |
| public void testZeroSyns() throws Exception { |
| Tokenizer tokenizer = new MockTokenizer(); |
| tokenizer.setReader(new StringReader("aa bb")); |
| |
| IllegalArgumentException ex = expectThrows(IllegalArgumentException.class, () -> |
| new SynonymGraphFilter(tokenizer, new SynonymMap.Builder(true).build(), true)); |
| assertEquals("fst must be non-null", ex.getMessage()); |
| } |
| |
| public void testOutputHangsOffEnd() throws Exception { |
| SynonymMap.Builder b = new SynonymMap.Builder(true); |
| final boolean keepOrig = false; |
| // b hangs off the end (no input token under it): |
| add(b, "a", "a b", keepOrig); |
| Analyzer a = getFlattenAnalyzer(b, true); |
| assertAnalyzesTo(a, "a", |
| new String[] {"a", "b"}, |
| new int[] { 0, 0}, |
| new int[] { 1, 1}, |
| null, |
| new int[] { 1, 1}, |
| new int[] { 1, 1}, |
| true); |
| a.close(); |
| } |
| |
| public void testDedup() throws Exception { |
| SynonymMap.Builder b = new SynonymMap.Builder(true); |
| final boolean keepOrig = false; |
| add(b, "a b", "ab", keepOrig); |
| add(b, "a b", "ab", keepOrig); |
| add(b, "a b", "ab", keepOrig); |
| Analyzer a = getFlattenAnalyzer(b, true); |
| |
| assertAnalyzesTo(a, "a b", |
| new String[]{"ab"}, |
| new int[]{1}); |
| a.close(); |
| } |
| |
| public void testNoDedup() throws Exception { |
| // dedup is false: |
| SynonymMap.Builder b = new SynonymMap.Builder(false); |
| final boolean keepOrig = false; |
| add(b, "a b", "ab", keepOrig); |
| add(b, "a b", "ab", keepOrig); |
| add(b, "a b", "ab", keepOrig); |
| Analyzer a = getFlattenAnalyzer(b, true); |
| |
| assertAnalyzesTo(a, "a b", |
| new String[]{"ab", "ab", "ab"}, |
| new int[]{1, 0, 0}); |
| a.close(); |
| } |
| |
| public void testMatching() throws Exception { |
| SynonymMap.Builder b = new SynonymMap.Builder(true); |
| final boolean keepOrig = false; |
| add(b, "a b", "ab", keepOrig); |
| add(b, "a c", "ac", keepOrig); |
| add(b, "a", "aa", keepOrig); |
| add(b, "b", "bb", keepOrig); |
| add(b, "z x c v", "zxcv", keepOrig); |
| add(b, "x c", "xc", keepOrig); |
| |
| Analyzer a = getFlattenAnalyzer(b, true); |
| |
| checkOneTerm(a, "$", "$"); |
| checkOneTerm(a, "a", "aa"); |
| checkOneTerm(a, "b", "bb"); |
| |
| assertAnalyzesTo(a, "a $", |
| new String[]{"aa", "$"}, |
| new int[]{1, 1}); |
| |
| assertAnalyzesTo(a, "$ a", |
| new String[]{"$", "aa"}, |
| new int[]{1, 1}); |
| |
| assertAnalyzesTo(a, "a a", |
| new String[]{"aa", "aa"}, |
| new int[]{1, 1}); |
| |
| assertAnalyzesTo(a, "z x c v", |
| new String[]{"zxcv"}, |
| new int[]{1}); |
| |
| assertAnalyzesTo(a, "z x c $", |
| new String[]{"z", "xc", "$"}, |
| new int[]{1, 1, 1}); |
| a.close(); |
| } |
| |
| public void testBasic1() throws Exception { |
| SynonymMap.Builder b = new SynonymMap.Builder(true); |
| add(b, "a", "foo", true); |
| add(b, "a b", "bar fee", true); |
| add(b, "b c", "dog collar", true); |
| add(b, "c d", "dog harness holder extras", true); |
| add(b, "m c e", "dog barks loudly", false); |
| add(b, "i j k", "feep", true); |
| |
| add(b, "e f", "foo bar", false); |
| add(b, "e f", "baz bee", false); |
| |
| add(b, "z", "boo", false); |
| add(b, "y", "bee", true); |
| Analyzer a = getFlattenAnalyzer(b, true); |
| |
| assertAnalyzesTo(a, "a b c", |
| new String[] {"bar", "a", "fee", "b", "c"}, |
| new int[] {1, 0, 1, 0, 1}); |
| |
| assertAnalyzesTo(a, "x a b c d", |
| new String[] {"x", "bar", "a", "fee", "b", "dog", "c", "harness", "d", "holder", "extras"}, |
| new int[] {1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1}); |
| |
| assertAnalyzesTo(a, "a b a", |
| new String[] {"bar", "a", "fee", "b", "foo", "a"}, |
| new int[] {1, 0, 1, 0, 1, 0}); |
| |
| // outputs no longer add to one another: |
| assertAnalyzesTo(a, "c d c d", |
| new String[] {"dog", "c", "harness", "d", "holder", "extras", "dog", "c", "harness", "d", "holder", "extras"}, |
| new int[] {1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1}); |
| |
| // two outputs for same input |
| assertAnalyzesTo(a, "e f", |
| new String[] {"foo", "baz", "bar", "bee"}, |
| new int[] {1, 0, 1, 0}); |
| |
| // verify multi-word / single-output offsets: |
| assertAnalyzesTo(a, "g i j k g", |
| new String[] {"g", "feep", "i", "j", "k", "g"}, |
| new int[] {1, 1, 0, 1, 1, 1}); |
| |
| // mixed keepOrig true/false: |
| assertAnalyzesTo(a, "a m c e x", |
| new String[] {"foo", "a", "dog", "barks", "loudly", "x"}, |
| new int[] {1, 0, 1, 1, 1, 1}); |
| assertAnalyzesTo(a, "c d m c e x", |
| new String[] {"dog", "c", "harness", "d", "holder", "extras", "dog", "barks", "loudly","x"}, |
| new int[] {1, 0, 1, 0, 1, 1, 1, 1, 1, 1}); |
| assertTrue(synFilter.getCaptureCount() > 0); |
| |
| // no captureStates when no syns matched |
| assertAnalyzesTo(a, "p q r s t", |
| new String[] {"p", "q", "r", "s", "t"}, |
| new int[] {1, 1, 1, 1, 1}); |
| assertEquals(0, synFilter.getCaptureCount()); |
| |
| // captureStates are necessary for the single-token syn case: |
| assertAnalyzesTo(a, "p q z y t", |
| new String[] {"p", "q", "boo", "bee", "y", "t"}, |
| new int[] {1, 1, 1, 1, 0, 1}); |
| assertTrue(synFilter.getCaptureCount() > 0); |
| } |
| |
| public void testBasic2() throws Exception { |
boolean keepOrig = false;
| do { |
| keepOrig = !keepOrig; |
| |
| SynonymMap.Builder b = new SynonymMap.Builder(true); |
| add(b,"aaa", "aaaa1 aaaa2 aaaa3", keepOrig); |
| add(b, "bbb", "bbbb1 bbbb2", keepOrig); |
| Analyzer a = getFlattenAnalyzer(b, true); |
| |
| if (keepOrig) { |
| assertAnalyzesTo(a, "xyzzy bbb pot of gold", |
| new String[] {"xyzzy", "bbbb1", "bbb", "bbbb2", "pot", "of", "gold"}, |
| new int[] {1, 1, 0, 1, 1, 1, 1}); |
| assertAnalyzesTo(a, "xyzzy aaa pot of gold", |
new String[] {"xyzzy", "aaaa1", "aaa", "aaaa2", "aaaa3", "pot", "of", "gold"},
| new int[] {1, 1, 0, 1, 1, 1, 1, 1}); |
| } else { |
| assertAnalyzesTo(a, "xyzzy bbb pot of gold", |
| new String[] {"xyzzy", "bbbb1", "bbbb2", "pot", "of", "gold"}, |
| new int[] {1, 1, 1, 1, 1, 1}); |
| assertAnalyzesTo(a, "xyzzy aaa pot of gold", |
| new String[] {"xyzzy", "aaaa1", "aaaa2", "aaaa3", "pot", "of", "gold"}, |
| new int[] {1, 1, 1, 1, 1, 1, 1}); |
| } |
| } while (keepOrig); |
| } |
| |
| /** If we expand synonyms during indexing, it's a bit better than |
| * SynonymFilter is today, but still necessarily has false |
| * positive and negative PhraseQuery matches because we do not |
| * index posLength, so we lose information. */ |
| public void testFlattenedGraph() throws Exception { |
| |
| SynonymMap.Builder b = new SynonymMap.Builder(); |
| add(b, "wtf", "what the fudge", true); |
| |
| Analyzer a = getFlattenAnalyzer(b, true); |
| |
| assertAnalyzesTo(a, "wtf happened", |
| new String[] {"what", "wtf", "the", "fudge", "happened"}, |
| new int[] { 0, 0, 0, 0, 4}, |
| new int[] { 3, 3, 3, 3, 12}, |
| null, |
| new int[] { 1, 0, 1, 1, 1}, |
| new int[] { 1, 3, 1, 1, 1}, |
| true); |
| |
| Directory dir = newDirectory(); |
| RandomIndexWriter w = new RandomIndexWriter(random(), dir, a); |
| Document doc = new Document(); |
| doc.add(newTextField("field", "wtf happened", Field.Store.NO)); |
| w.addDocument(doc); |
| IndexReader r = w.getReader(); |
| w.close(); |
| |
| IndexSearcher s = newSearcher(r); |
| |
| // Good (this should not match, and doesn't): |
| assertEquals(0, s.count(new PhraseQuery("field", "what", "happened"))); |
| |
| // Bad (this should match, but doesn't): |
| assertEquals(0, s.count(new PhraseQuery("field", "wtf", "happened"))); |
| |
| // Good (this should match, and does): |
| assertEquals(1, s.count(new PhraseQuery("field", "what", "the", "fudge", "happened"))); |
| |
| // Bad (this should not match, but does): |
| assertEquals(1, s.count(new PhraseQuery("field", "wtf", "the"))); |
| |
| IOUtils.close(r, dir); |
| } |
| |
| // Needs TermAutomatonQuery, which is in sandbox still: |
| /* |
| public void testAccurateGraphQuery1() throws Exception { |
| Directory dir = newDirectory(); |
| RandomIndexWriter w = new RandomIndexWriter(random(), dir); |
| Document doc = new Document(); |
| doc.add(newTextField("field", "wtf happened", Field.Store.NO)); |
| w.addDocument(doc); |
| IndexReader r = w.getReader(); |
| w.close(); |
| |
| IndexSearcher s = newSearcher(r); |
| |
| SynonymMap.Builder b = new SynonymMap.Builder(); |
| add(b, "what the fudge", "wtf", true); |
| |
| SynonymMap map = b.build(); |
| |
| TokenStreamToTermAutomatonQuery ts2q = new TokenStreamToTermAutomatonQuery(); |
| |
| TokenStream in = new CannedTokenStream(0, 23, new Token[] { |
| token("what", 1, 1, 0, 4), |
| token("the", 1, 1, 5, 8), |
| token("fudge", 1, 1, 9, 14), |
| token("happened", 1, 1, 15, 23), |
| }); |
| |
| assertEquals(1, s.count(ts2q.toQuery("field", new SynonymGraphFilter(in, map, true)))); |
| |
| in = new CannedTokenStream(0, 12, new Token[] { |
| token("wtf", 1, 1, 0, 3), |
| token("happened", 1, 1, 4, 12), |
| }); |
| |
| assertEquals(1, s.count(ts2q.toQuery("field", new SynonymGraphFilter(in, map, true)))); |
| |
| // "what happened" should NOT match: |
| in = new CannedTokenStream(0, 13, new Token[] { |
| token("what", 1, 1, 0, 4), |
| token("happened", 1, 1, 5, 13), |
| }); |
| assertEquals(0, s.count(ts2q.toQuery("field", new SynonymGraphFilter(in, map, true)))); |
| |
| IOUtils.close(r, dir); |
| } |
| */ |
| |
| /** If we expand synonyms at search time, the results are correct. */ |
| // Needs TermAutomatonQuery, which is in sandbox still: |
| /* |
| public void testAccurateGraphQuery2() throws Exception { |
| Directory dir = newDirectory(); |
| RandomIndexWriter w = new RandomIndexWriter(random(), dir); |
| Document doc = new Document(); |
| doc.add(newTextField("field", "say wtf happened", Field.Store.NO)); |
| w.addDocument(doc); |
| IndexReader r = w.getReader(); |
| w.close(); |
| |
| IndexSearcher s = newSearcher(r); |
| |
| SynonymMap.Builder b = new SynonymMap.Builder(); |
| add(b, "what the fudge", "wtf", true); |
| |
| SynonymMap map = b.build(); |
| |
| TokenStream in = new CannedTokenStream(0, 26, new Token[] { |
| token("say", 1, 1, 0, 3), |
| token("what", 1, 1, 3, 7), |
| token("the", 1, 1, 8, 11), |
| token("fudge", 1, 1, 12, 17), |
| token("happened", 1, 1, 18, 26), |
| }); |
| |
| TokenStreamToTermAutomatonQuery ts2q = new TokenStreamToTermAutomatonQuery(); |
| |
| assertEquals(1, s.count(ts2q.toQuery("field", new SynonymGraphFilter(in, map, true)))); |
| |
| // "what happened" should NOT match: |
| in = new CannedTokenStream(0, 13, new Token[] { |
| token("what", 1, 1, 0, 4), |
| token("happened", 1, 1, 5, 13), |
| }); |
| assertEquals(0, s.count(ts2q.toQuery("field", new SynonymGraphFilter(in, map, true)))); |
| |
| IOUtils.close(r, dir); |
| } |
| */ |
| |
| // Needs TermAutomatonQuery, which is in sandbox still: |
| /* |
| public void testAccurateGraphQuery3() throws Exception { |
| Directory dir = newDirectory(); |
| RandomIndexWriter w = new RandomIndexWriter(random(), dir); |
| Document doc = new Document(); |
| doc.add(newTextField("field", "say what the fudge happened", Field.Store.NO)); |
| w.addDocument(doc); |
| IndexReader r = w.getReader(); |
| w.close(); |
| |
| IndexSearcher s = newSearcher(r); |
| |
| SynonymMap.Builder b = new SynonymMap.Builder(); |
| add(b, "wtf", "what the fudge", true); |
| |
| SynonymMap map = b.build(); |
| |
| TokenStream in = new CannedTokenStream(0, 15, new Token[] { |
| token("say", 1, 1, 0, 3), |
| token("wtf", 1, 1, 3, 6), |
| token("happened", 1, 1, 7, 15), |
| }); |
| |
| TokenStreamToTermAutomatonQuery ts2q = new TokenStreamToTermAutomatonQuery(); |
| |
| assertEquals(1, s.count(ts2q.toQuery("field", new SynonymGraphFilter(in, map, true)))); |
| |
| // "what happened" should NOT match: |
| in = new CannedTokenStream(0, 13, new Token[] { |
| token("what", 1, 1, 0, 4), |
| token("happened", 1, 1, 5, 13), |
| }); |
| assertEquals(0, s.count(ts2q.toQuery("field", new SynonymGraphFilter(in, map, true)))); |
| |
| IOUtils.close(r, dir); |
| } |
| |
| private static Token token(String term, int posInc, int posLength, int startOffset, int endOffset) { |
| final Token t = new Token(term, startOffset, endOffset); |
| t.setPositionIncrement(posInc); |
| t.setPositionLength(posLength); |
| return t; |
| } |
| */ |
| |
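/** Returns a random non-empty unicode string that contains no U+0000 characters. */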
| private String randomNonEmptyString() { |
| while(true) { |
| String s = TestUtil.randomUnicodeString(random()).trim(); |
| //String s = TestUtil.randomSimpleString(random()).trim(); |
| if (s.length() != 0 && s.indexOf('\u0000') == -1) { |
| return s; |
| } |
| } |
| } |
| |
// Adds MockGraphTokenFilter after the SynonymGraphFilter:
| public void testRandomGraphAfter() throws Exception { |
| final int numIters = atLeast(3); |
| for (int i = 0; i < numIters; i++) { |
| SynonymMap.Builder b = new SynonymMap.Builder(random().nextBoolean()); |
| final int numEntries = atLeast(10); |
| for (int j = 0; j < numEntries; j++) { |
| add(b, randomNonEmptyString(), randomNonEmptyString(), random().nextBoolean()); |
| } |
| final SynonymMap map = b.build(); |
| final boolean ignoreCase = random().nextBoolean(); |
| final boolean doFlatten = random().nextBoolean(); |
| |
| final Analyzer analyzer = new Analyzer() { |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName) { |
| Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true); |
| TokenStream syns = new SynonymGraphFilter(tokenizer, map, ignoreCase); |
| TokenStream graph = new MockGraphTokenFilter(random(), syns); |
| if (doFlatten) { |
| graph = new FlattenGraphFilter(graph); |
| } |
| return new TokenStreamComponents(tokenizer, graph); |
| } |
| }; |
| |
| checkRandomData(random(), analyzer, 100); |
| analyzer.close(); |
| } |
| } |
| |
| public void testEmptyStringInput() throws IOException { |
| final int numIters = atLeast(10); |
| for (int i = 0; i < numIters; i++) { |
| SynonymMap.Builder b = new SynonymMap.Builder(random().nextBoolean()); |
| final int numEntries = atLeast(10); |
| for (int j = 0; j < numEntries; j++) { |
| add(b, randomNonEmptyString(), randomNonEmptyString(), random().nextBoolean()); |
| } |
| final boolean ignoreCase = random().nextBoolean(); |
| |
| Analyzer analyzer = getAnalyzer(b, ignoreCase); |
| |
| checkAnalysisConsistency(random(), analyzer, random().nextBoolean(), ""); |
| analyzer.close(); |
| } |
| } |
| |
/** Simple random test; it doesn't verify correctness,
* but it does verify that no exceptions are thrown and that the stream doesn't misbehave.
| */ |
| public void testRandom2() throws Exception { |
| final int numIters = atLeast(3); |
| for (int i = 0; i < numIters; i++) { |
| SynonymMap.Builder b = new SynonymMap.Builder(random().nextBoolean()); |
| final int numEntries = atLeast(10); |
| for (int j = 0; j < numEntries; j++) { |
| add(b, randomNonEmptyString(), randomNonEmptyString(), random().nextBoolean()); |
| } |
| final boolean ignoreCase = random().nextBoolean(); |
| final boolean doFlatten = random().nextBoolean(); |
| |
| Analyzer analyzer; |
| if (doFlatten) { |
| analyzer = getFlattenAnalyzer(b, ignoreCase); |
| } else { |
| analyzer = getAnalyzer(b, ignoreCase); |
| } |
| |
| checkRandomData(random(), analyzer, 100); |
| analyzer.close(); |
| } |
| } |
| |
/** Simple random test like testRandom2, but for larger docs.
| */ |
| public void testRandomHuge() throws Exception { |
| final int numIters = atLeast(1); |
| for (int i = 0; i < numIters; i++) { |
| SynonymMap.Builder b = new SynonymMap.Builder(random().nextBoolean()); |
| final int numEntries = atLeast(10); |
| if (VERBOSE) { |
| System.out.println("TEST: iter=" + i + " numEntries=" + numEntries); |
| } |
| for (int j = 0; j < numEntries; j++) { |
| add(b, randomNonEmptyString(), randomNonEmptyString(), random().nextBoolean()); |
| } |
| final boolean ignoreCase = random().nextBoolean(); |
| final boolean doFlatten = random().nextBoolean(); |
| |
| Analyzer analyzer; |
| if (doFlatten) { |
| analyzer = getFlattenAnalyzer(b, ignoreCase); |
| } else { |
| analyzer = getAnalyzer(b, ignoreCase); |
| } |
| |
| checkRandomData(random(), analyzer, 100, 1024); |
| analyzer.close(); |
| } |
| } |
| |
| public void testEmptyTerm() throws IOException { |
| final int numIters = atLeast(10); |
| for (int i = 0; i < numIters; i++) { |
| SynonymMap.Builder b = new SynonymMap.Builder(random().nextBoolean()); |
| final int numEntries = atLeast(10); |
| for (int j = 0; j < numEntries; j++) { |
| add(b, randomNonEmptyString(), randomNonEmptyString(), random().nextBoolean()); |
| } |
| final boolean ignoreCase = random().nextBoolean(); |
| |
| final Analyzer analyzer = getAnalyzer(b, ignoreCase); |
| |
| checkAnalysisConsistency(random(), analyzer, random().nextBoolean(), ""); |
| analyzer.close(); |
| } |
| } |
| |
| // LUCENE-3375 |
| public void testVanishingTermsNoFlatten() throws Exception { |
| String testFile = |
| "aaa => aaaa1 aaaa2 aaaa3\n" + |
| "bbb => bbbb1 bbbb2\n"; |
| Analyzer analyzer = solrSynsToAnalyzer(testFile); |
| |
| assertAnalyzesTo(analyzer, "xyzzy bbb pot of gold", |
| new String[] { "xyzzy", "bbbb1", "bbbb2", "pot", "of", "gold" }); |
| |
| // xyzzy aaa pot of gold -> xyzzy aaaa1 aaaa2 aaaa3 gold |
| assertAnalyzesTo(analyzer, "xyzzy aaa pot of gold", |
| new String[] { "xyzzy", "aaaa1", "aaaa2", "aaaa3", "pot", "of", "gold" }); |
| analyzer.close(); |
| } |
| |
| // LUCENE-3375 |
| public void testVanishingTermsWithFlatten() throws Exception { |
| String testFile = |
| "aaa => aaaa1 aaaa2 aaaa3\n" + |
| "bbb => bbbb1 bbbb2\n"; |
| |
| Analyzer analyzer = solrSynsToAnalyzer(testFile); |
| |
| assertAnalyzesTo(analyzer, "xyzzy bbb pot of gold", |
| new String[] { "xyzzy", "bbbb1", "bbbb2", "pot", "of", "gold" }); |
| |
| // xyzzy aaa pot of gold -> xyzzy aaaa1 aaaa2 aaaa3 gold |
| assertAnalyzesTo(analyzer, "xyzzy aaa pot of gold", |
| new String[] { "xyzzy", "aaaa1", "aaaa2", "aaaa3", "pot", "of", "gold" }); |
| analyzer.close(); |
| } |
| |
| public void testBuilderDedup() throws Exception { |
| SynonymMap.Builder b = new SynonymMap.Builder(true); |
| final boolean keepOrig = false; |
| add(b, "a b", "ab", keepOrig); |
| add(b, "a b", "ab", keepOrig); |
| add(b, "a b", "ab", keepOrig); |
| Analyzer a = getAnalyzer(b, true); |
| |
| assertAnalyzesTo(a, "a b", |
| new String[] { "ab" }, |
| new int[] { 1 }); |
| a.close(); |
| } |
| |
| public void testBuilderNoDedup() throws Exception { |
| SynonymMap.Builder b = new SynonymMap.Builder(false); |
| final boolean keepOrig = false; |
| add(b, "a b", "ab", keepOrig); |
| add(b, "a b", "ab", keepOrig); |
| add(b, "a b", "ab", keepOrig); |
| Analyzer a = getAnalyzer(b, true); |
| |
| assertAnalyzesTo(a, "a b", |
| new String[] { "ab", "ab", "ab" }, |
| new int[] { 1, 0, 0 }); |
| a.close(); |
| } |
| |
| public void testRecursion1() throws Exception { |
| SynonymMap.Builder b = new SynonymMap.Builder(true); |
| final boolean keepOrig = false; |
| add(b, "zoo", "zoo", keepOrig); |
| Analyzer a = getAnalyzer(b, true); |
| |
| assertAnalyzesTo(a, "zoo zoo $ zoo", |
| new String[] { "zoo", "zoo", "$", "zoo" }, |
| new int[] { 1, 1, 1, 1 }); |
| a.close(); |
| } |
| |
| public void testRecursion2() throws Exception { |
| SynonymMap.Builder b = new SynonymMap.Builder(true); |
| final boolean keepOrig = false; |
| add(b, "zoo", "zoo", keepOrig); |
| add(b, "zoo", "zoo zoo", keepOrig); |
| Analyzer a = getAnalyzer(b, true); |
| |
| // verify("zoo zoo $ zoo", "zoo/zoo zoo/zoo/zoo $/zoo zoo/zoo zoo"); |
| assertAnalyzesTo(a, "zoo zoo $ zoo", |
| new String[] { "zoo", "zoo", "zoo", "zoo", "zoo", "zoo", "$", "zoo", "zoo", "zoo" }, |
| new int[] { 1, 0, 1, 1, 0, 1, 1, 1, 0, 1 }); |
| a.close(); |
| } |
| |
| public void testRecursion3() throws Exception { |
| SynonymMap.Builder b = new SynonymMap.Builder(true); |
| final boolean keepOrig = true; |
| add(b, "zoo zoo", "zoo", keepOrig); |
| Analyzer a = getFlattenAnalyzer(b, true); |
| |
| assertAnalyzesTo(a, "zoo zoo $ zoo", |
| new String[]{"zoo", "zoo", "zoo", "$", "zoo"}, |
| new int[]{1, 0, 1, 1, 1}); |
| a.close(); |
| } |
| |
| public void testRecursion4() throws Exception { |
| SynonymMap.Builder b = new SynonymMap.Builder(true); |
| final boolean keepOrig = true; |
| add(b, "zoo zoo", "zoo", keepOrig); |
| add(b, "zoo", "zoo zoo", keepOrig); |
| Analyzer a = getFlattenAnalyzer(b, true); |
| assertAnalyzesTo(a, "zoo zoo $ zoo", |
| new String[]{"zoo", "zoo", "zoo", "$", "zoo", "zoo", "zoo"}, |
| new int[]{1, 0, 1, 1, 1, 0, 1}); |
| a.close(); |
| } |
| |
| public void testKeepOrig() throws Exception { |
| SynonymMap.Builder b = new SynonymMap.Builder(true); |
| final boolean keepOrig = true; |
| add(b, "a b", "ab", keepOrig); |
| add(b, "a c", "ac", keepOrig); |
| add(b, "a", "aa", keepOrig); |
| add(b, "b", "bb", keepOrig); |
| add(b, "z x c v", "zxcv", keepOrig); |
| add(b, "x c", "xc", keepOrig); |
| Analyzer a = getAnalyzer(b, true); |
| |
| assertAnalyzesTo(a, "$", |
| new String[] { "$" }, |
| new int[] { 1 }); |
| assertAnalyzesTo(a, "a", |
| new String[] { "aa", "a" }, |
| new int[] { 1, 0 }); |
| assertAnalyzesTo(a, "a", |
| new String[] { "aa", "a" }, |
| new int[] { 1, 0 }); |
| assertAnalyzesTo(a, "$ a", |
| new String[] { "$", "aa", "a" }, |
| new int[] { 1, 1, 0 }); |
| assertAnalyzesTo(a, "a $", |
| new String[] { "aa", "a", "$" }, |
| new int[] { 1, 0, 1 }); |
| assertAnalyzesTo(a, "$ a !", |
| new String[] { "$", "aa", "a", "!" }, |
| new int[] { 1, 1, 0, 1 }); |
| assertAnalyzesTo(a, "a a", |
| new String[] { "aa", "a", "aa", "a" }, |
| new int[] { 1, 0, 1, 0 }); |
| assertAnalyzesTo(a, "b", |
| new String[] { "bb", "b" }, |
| new int[] { 1, 0 }); |
| assertAnalyzesTo(a, "z x c v", |
| new String[] { "zxcv", "z", "x", "c", "v" }, |
| new int[] { 1, 0, 1, 1, 1 }); |
| assertAnalyzesTo(a, "z x c $", |
| new String[] { "z", "xc", "x", "c", "$" }, |
| new int[] { 1, 1, 0, 1, 1 }); |
| a.close(); |
| } |
| |
| /** |
* Verifies token types and positionLengths on synonyms of different word counts, using non-preserving, explicit rules.
| */ |
| public void testNonPreservingMultiwordSynonyms() throws Exception { |
| String testFile = |
| "aaa => two words\n" + |
| "bbb => one two, very many multiple words\n" + |
| "ee ff, gg, h i j k, h i => one\n" + |
| "cc dd => usa,united states,u s a,united states of america"; |
| |
| Analyzer analyzer = solrSynsToAnalyzer(testFile); |
| |
| assertAnalyzesTo(analyzer, "aaa", |
| new String[]{"two", "words"}, |
| new int[]{0, 0}, |
| new int[]{3, 3}, |
| new String[]{"SYNONYM", "SYNONYM"}, |
| new int[]{1, 1}, |
| new int[]{1, 1}); |
| |
| assertAnalyzesToPositions(analyzer, "amazing aaa", |
| new String[]{"amazing", "two", "words"}, |
| new String[]{"word", "SYNONYM", "SYNONYM"}, |
| new int[]{1, 1, 1}, |
| new int[]{1, 1, 1}); |
| |
| assertAnalyzesTo(analyzer, "p bbb s", |
| new String[]{"p", "one", "very", "two", "many", "multiple", "words", "s"}, |
| new int[]{0, 2, 2, 2, 2, 2, 2, 6}, |
| new int[]{1, 5, 5, 5, 5, 5, 5, 7}, |
| new String[]{"word", "SYNONYM", "SYNONYM", "SYNONYM", "SYNONYM", "SYNONYM", "SYNONYM", "word"}, |
| new int[]{1, 1, 0, 1, 0, 1, 1, 1}, |
| new int[]{1, 1, 1, 3, 1, 1, 1, 1}); |
| |
| assertAnalyzesTo(analyzer, "p ee ff s", |
| new String[]{"p", "one", "s"}, |
| new int[]{0, 2, 8}, |
| new int[]{1, 7, 9}, |
| new String[]{"word", "SYNONYM", "word"}, |
| new int[]{1, 1, 1}, |
| new int[]{1, 1, 1}); |
| |
| assertAnalyzesTo(analyzer, "p h i j s", |
| new String[]{"p", "one", "j", "s"}, |
| new int[]{0, 2, 6, 8}, |
| new int[]{1, 5, 7, 9}, |
| new String[]{"word", "SYNONYM", "word", "word"}, |
| new int[]{1, 1, 1, 1}, |
| new int[]{1, 1, 1, 1}); |
| |
| analyzer.close(); |
| } |
| |
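/** Creates an analyzer that applies only SynonymGraphFilter (no flattening), recording the filter in {@link #synFilter}. */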
| private Analyzer getAnalyzer(SynonymMap.Builder b, final boolean ignoreCase) throws IOException { |
| final SynonymMap map = b.build(); |
| return new Analyzer() { |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName) { |
| Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); |
| // Make a local variable so testRandomHuge doesn't share it across threads! |
| SynonymGraphFilter synFilter = new SynonymGraphFilter(tokenizer, map, ignoreCase); |
| TestSynonymGraphFilter.this.flattenFilter = null; |
| TestSynonymGraphFilter.this.synFilter = synFilter; |
| return new TokenStreamComponents(tokenizer, synFilter); |
| } |
| }; |
| } |
| |
| /** Appends FlattenGraphFilter too */ |
| private Analyzer getFlattenAnalyzer(SynonymMap.Builder b, boolean ignoreCase) throws IOException { |
| final SynonymMap map = b.build(); |
| return new Analyzer() { |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName) { |
| Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true); |
| // Make a local variable so testRandomHuge doesn't share it across threads! |
| SynonymGraphFilter synFilter = new SynonymGraphFilter(tokenizer, map, ignoreCase); |
| FlattenGraphFilter flattenFilter = new FlattenGraphFilter(synFilter); |
| TestSynonymGraphFilter.this.synFilter = synFilter; |
| TestSynonymGraphFilter.this.flattenFilter = flattenFilter; |
| return new TokenStreamComponents(tokenizer, flattenFilter); |
| } |
| }; |
| } |
| |
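/** Adds one rule to the builder; multi-token inputs and outputs are given as space-separated strings. */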
| private void add(SynonymMap.Builder b, String input, String output, boolean keepOrig) { |
if (VERBOSE) {
System.out.println("  add input=" + input + " output=" + output + " keepOrig=" + keepOrig);
}
| CharsRefBuilder inputCharsRef = new CharsRefBuilder(); |
| SynonymMap.Builder.join(input.split(" +"), inputCharsRef); |
| |
| CharsRefBuilder outputCharsRef = new CharsRefBuilder(); |
| SynonymMap.Builder.join(output.split(" +"), outputCharsRef); |
| |
| b.add(inputCharsRef.get(), outputCharsRef.get(), keepOrig); |
| } |
| |
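/** Returns a random-length array of chars, each either {@code base} (with probability {@code bias}) or {@code base+1}. */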
| private char[] randomBinaryChars(int minLen, int maxLen, double bias, char base) { |
| int len = TestUtil.nextInt(random(), minLen, maxLen); |
| char[] chars = new char[len]; |
| for(int i=0;i<len;i++) { |
| char ch; |
| if (random().nextDouble() < bias) { |
| ch = base; |
| } else { |
| ch = (char) (base+1); |
| } |
| chars[i] = ch; |
| } |
| |
| return chars; |
| } |
| |
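/** Joins the chars with single spaces so each char becomes its own token. */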
| private static String toTokenString(char[] chars) { |
| StringBuilder b = new StringBuilder(); |
| for(char c : chars) { |
| if (b.length() > 0) { |
| b.append(' '); |
| } |
| b.append(c); |
| } |
| return b.toString(); |
| } |
| |
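/** One randomly generated synonym rule, used by {@link #testRandomSyns} and {@link #slowSynFilter}. */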
| private static class OneSyn { |
| char[] in; |
| char[] out; |
| boolean keepOrig; |
| |
| @Override |
| public String toString() { |
| return toTokenString(in) + " --> " + toTokenString(out) + " (keepOrig=" + keepOrig + ")"; |
| } |
| } |
| |
| public void testRandomSyns() throws Exception { |
| int synCount = atLeast(10); |
| double bias = random().nextDouble(); |
| boolean dedup = random().nextBoolean(); |
| |
| boolean flatten = random().nextBoolean(); |
| |
| SynonymMap.Builder b = new SynonymMap.Builder(dedup); |
| List<OneSyn> syns = new ArrayList<>(); |
| // Makes random syns from random a / b tokens, mapping to random x / y tokens |
| if (VERBOSE) { |
| System.out.println("TEST: make " + synCount + " syns"); |
| System.out.println(" bias for a over b=" + bias); |
| System.out.println(" dedup=" + dedup); |
| System.out.println(" flatten=" + flatten); |
| } |
| |
| int maxSynLength = 0; |
| |
| for(int i=0;i<synCount;i++) { |
| OneSyn syn = new OneSyn(); |
| syn.in = randomBinaryChars(1, 5, bias, 'a'); |
| syn.out = randomBinaryChars(1, 5, 0.5, 'x'); |
| syn.keepOrig = random().nextBoolean(); |
| syns.add(syn); |
| |
| maxSynLength = Math.max(maxSynLength, syn.in.length); |
| |
| if (VERBOSE) { |
| System.out.println(" " + syn); |
| } |
| add(b, toTokenString(syn.in), toTokenString(syn.out), syn.keepOrig); |
| } |
| |
| // Compute max allowed lookahead for flatten filter: |
| int maxFlattenLookahead = 0; |
| if (flatten) { |
| for(int i=0;i<synCount;i++) { |
| OneSyn syn1 = syns.get(i); |
| int count = syn1.out.length; |
| boolean keepOrig = syn1.keepOrig; |
| for(int j=0;j<synCount;j++) { |
| OneSyn syn2 = syns.get(i); |
| keepOrig |= syn2.keepOrig; |
| if (syn1.in.equals(syn2.in)) { |
| count += syn2.out.length; |
| } |
| } |
| |
| if (keepOrig) { |
| count += syn1.in.length; |
| } |
| |
| maxFlattenLookahead = Math.max(maxFlattenLookahead, count); |
| } |
| } |
| |
| // Only used w/ VERBOSE: |
| Analyzer aNoFlattened; |
| if (VERBOSE) { |
| aNoFlattened = getAnalyzer(b, true); |
| } else { |
| aNoFlattened = null; |
| } |
| |
| Analyzer a; |
| if (flatten) { |
| a = getFlattenAnalyzer(b, true); |
| } else { |
| a = getAnalyzer(b, true); |
| } |
| |
| int iters = atLeast(1); |
| for(int iter=0;iter<iters;iter++) { |
| |
| String doc = toTokenString(randomBinaryChars(50, 100, bias, 'a')); |
| //String doc = toTokenString(randomBinaryChars(10, 50, bias, 'a')); |
| |
| if (VERBOSE) { |
| System.out.println("TEST: iter="+ iter + " doc=" + doc); |
| } |
| Automaton expected = slowSynFilter(doc, syns, flatten); |
| if (VERBOSE) { |
| System.out.println(" expected:\n" + expected.toDot()); |
| if (flatten) { |
| Automaton unflattened = toAutomaton(aNoFlattened.tokenStream("field", new StringReader(doc))); |
| System.out.println(" actual unflattened:\n" + unflattened.toDot()); |
| } |
| } |
| Automaton actual = toAutomaton(a.tokenStream("field", new StringReader(doc))); |
| if (VERBOSE) { |
| System.out.println(" actual:\n" + actual.toDot()); |
| } |
| |
| assertTrue("maxLookaheadUsed=" + synFilter.getMaxLookaheadUsed() + " maxSynLength=" + maxSynLength, |
| synFilter.getMaxLookaheadUsed() <= maxSynLength); |
| if (flatten) { |
| assertTrue("flatten maxLookaheadUsed=" + flattenFilter.getMaxLookaheadUsed() + " maxFlattenLookahead=" + maxFlattenLookahead, |
| flattenFilter.getMaxLookaheadUsed() <= maxFlattenLookahead); |
| } |
| |
| checkAnalysisConsistency(random(), a, random().nextBoolean(), doc); |
| // We can easily have a non-deterministic automaton at this point, e.g. if |
| // more than one syn matched at given point, or if the syn mapped to an |
| // output token that also happens to be in the input: |
| try { |
| actual = Operations.determinize(actual, 50000); |
| } catch (TooComplexToDeterminizeException tctde) { |
| // Unfortunately the syns can easily create difficult-to-determinize graphs: |
| assertTrue(approxEquals(actual, expected)); |
| continue; |
| } |
| |
| try { |
| expected = Operations.determinize(expected, 50000); |
| } catch (TooComplexToDeterminizeException tctde) { |
| // Unfortunately the syns can easily create difficult-to-determinize graphs: |
| assertTrue(approxEquals(actual, expected)); |
| continue; |
| } |
| |
| assertTrue(approxEquals(actual, expected)); |
| assertTrue(Operations.sameLanguage(actual, expected)); |
| } |
| |
| a.close(); |
| } |
| |
| /** Only used when true equality is too costly to check! */ |
| private boolean approxEquals(Automaton actual, Automaton expected) { |
// Don't collapse these into one line, else the stack trace won't say which direction failed:
| boolean b1 = approxSubsetOf(actual, expected); |
| boolean b2 = approxSubsetOf(expected, actual); |
| return b1 && b2; |
| } |
| |
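/** Randomly samples 2000 strings accepted by a1 and checks that a2 accepts each of them; throws (rather than returning false) so the offending path is reported. */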
| private boolean approxSubsetOf(Automaton a1, Automaton a2) { |
| AutomatonTestUtil.RandomAcceptedStrings ras = new AutomatonTestUtil.RandomAcceptedStrings(a1); |
| for(int i=0;i<2000;i++) { |
| int[] ints = ras.getRandomAcceptedString(random()); |
| IntsRef path = new IntsRef(ints, 0, ints.length); |
| if (accepts(a2, path) == false) { |
| throw new RuntimeException("a2 does not accept " + path); |
| } |
| } |
| |
| // Presumed true |
| return true; |
| } |
| |
| /** Like {@link Operations#run} except the incoming automaton is allowed to be non-deterministic. */ |
| private static boolean accepts(Automaton a, IntsRef path) { |
| Set<Integer> states = new HashSet<>(); |
| states.add(0); |
| Transition t = new Transition(); |
| for(int i=0;i<path.length;i++) { |
| int digit = path.ints[path.offset+i]; |
| Set<Integer> nextStates = new HashSet<>(); |
| for(int state : states) { |
| int count = a.initTransition(state, t); |
| for(int j=0;j<count;j++) { |
| a.getNextTransition(t); |
| if (digit >= t.min && digit <= t.max) { |
| nextStates.add(t.dest); |
| } |
| } |
| } |
| states = nextStates; |
| if (states.isEmpty()) { |
| return false; |
| } |
| } |
| |
| for(int state : states) { |
| if (a.isAccept(state)) { |
| return true; |
| } |
| } |
| |
| return false; |
| } |
| |
| /** Stupid, slow brute-force, yet hopefully bug-free, synonym filter. */ |
| private Automaton slowSynFilter(String doc, List<OneSyn> syns, boolean flatten) { |
| String[] tokens = doc.split(" +"); |
| if (VERBOSE) { |
| System.out.println(" doc has " + tokens.length + " tokens"); |
| } |
| int i=0; |
| Automaton.Builder a = new Automaton.Builder(); |
| int lastState = a.createState(); |
| while (i<tokens.length) { |
| // Consider all possible syn matches starting at this point: |
| assert tokens[i].length() == 1; |
| if (VERBOSE) { |
| System.out.println(" i=" + i); |
| } |
| |
| List<OneSyn> matches = new ArrayList<>(); |
| for(OneSyn syn : syns) { |
| if (i + syn.in.length <= tokens.length) { |
| boolean match = true; |
| for(int j=0;j<syn.in.length;j++) { |
| if (tokens[i+j].charAt(0) != syn.in[j]) { |
| match = false; |
| break; |
| } |
| } |
| |
| if (match) { |
| if (matches.isEmpty() == false) { |
| if (syn.in.length < matches.get(0).in.length) { |
| // Greedy matching: we already found longer syns matching here |
| continue; |
| } else if (syn.in.length > matches.get(0).in.length) { |
| // Greedy matching: all previous matches were shorter, so we drop them |
| matches.clear(); |
| } else { |
| // Keep the current matches: we allow multiple synonyms matching the same input string |
| } |
| } |
| |
| matches.add(syn); |
| } |
| } |
| } |
| |
| int nextState = a.createState(); |
| |
| if (matches.isEmpty() == false) { |
| // We have match(es) starting at this token |
| if (VERBOSE) { |
| System.out.println(" matches @ i=" + i + ": " + matches); |
| } |
| // We keepOrig if any of the matches said to: |
| boolean keepOrig = false; |
| for(OneSyn syn : matches) { |
| keepOrig |= syn.keepOrig; |
| } |
| |
| List<Integer> flatStates; |
| if (flatten) { |
| flatStates = new ArrayList<>(); |
| } else { |
| flatStates = null; |
| } |
| |
| if (keepOrig) { |
| // Add path for the original tokens |
| addSidePath(a, lastState, nextState, matches.get(0).in, flatStates); |
| } |
| |
| for(OneSyn syn : matches) { |
| addSidePath(a, lastState, nextState, syn.out, flatStates); |
| } |
| |
| i += matches.get(0).in.length; |
| } else { |
| a.addTransition(lastState, nextState, tokens[i].charAt(0)); |
| i++; |
| } |
| |
| lastState = nextState; |
| } |
| |
| a.setAccept(lastState, true); |
| |
| return topoSort(a.finish()); |
| } |
| |
| /** Just creates a side path from startState to endState with the provided tokens. */ |
| private static void addSidePath(Automaton.Builder a, int startState, int endState, char[] tokens, List<Integer> flatStates) { |
| int lastState = startState; |
| for(int i=0;i<tokens.length;i++) { |
| int nextState; |
| if (i == tokens.length-1) { |
| nextState = endState; |
| } else if (flatStates == null || i >= flatStates.size()) { |
| nextState = a.createState(); |
| if (flatStates != null) { |
| assert i == flatStates.size(); |
| flatStates.add(nextState); |
| } |
| } else { |
| nextState = flatStates.get(i); |
| } |
| a.addTransition(lastState, nextState, tokens[i]); |
| |
| lastState = nextState; |
| } |
| } |
| |
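/** Converts a token stream into an automaton: token positions become states and each token becomes a transition spanning its positionLength; assumes single-character terms. */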
| private Automaton toAutomaton(TokenStream ts) throws IOException { |
| PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class); |
| PositionLengthAttribute posLenAtt = ts.addAttribute(PositionLengthAttribute.class); |
| CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); |
| ts.reset(); |
| Automaton a = new Automaton(); |
| int srcNode = -1; |
| int destNode = -1; |
| int state = a.createState(); |
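// Token positions map to automaton states; create states lazily as positions advance: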
| while (ts.incrementToken()) { |
| assert termAtt.length() == 1; |
| char c = termAtt.charAt(0); |
| int posInc = posIncAtt.getPositionIncrement(); |
| if (posInc != 0) { |
| srcNode += posInc; |
| while (state < srcNode) { |
| state = a.createState(); |
| } |
| } |
| destNode = srcNode + posLenAtt.getPositionLength(); |
| while (state < destNode) { |
| state = a.createState(); |
| } |
| a.addTransition(srcNode, destNode, c); |
| } |
| ts.end(); |
| ts.close(); |
| a.finishState(); |
| a.setAccept(destNode, true); |
| return a; |
| } |
| |
| /* |
| private String toDot(TokenStream ts) throws IOException { |
| PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class); |
| PositionLengthAttribute posLenAtt = ts.addAttribute(PositionLengthAttribute.class); |
| CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); |
| TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class); |
| ts.reset(); |
| int srcNode = -1; |
| int destNode = -1; |
| |
| StringBuilder b = new StringBuilder(); |
| b.append("digraph Automaton {\n"); |
| b.append(" rankdir = LR\n"); |
| b.append(" node [width=0.2, height=0.2, fontsize=8]\n"); |
| b.append(" initial [shape=plaintext,label=\"\"]\n"); |
| b.append(" initial -> 0\n"); |
| |
| while (ts.incrementToken()) { |
| int posInc = posIncAtt.getPositionIncrement(); |
| if (posInc != 0) { |
| srcNode += posInc; |
| b.append(" "); |
| b.append(srcNode); |
| b.append(" [shape=circle,label=\"" + srcNode + "\"]\n"); |
| } |
| destNode = srcNode + posLenAtt.getPositionLength(); |
| b.append(" "); |
| b.append(srcNode); |
| b.append(" -> "); |
| b.append(destNode); |
| b.append(" [label=\""); |
| b.append(termAtt); |
| b.append("\""); |
| if (typeAtt.type().equals("word") == false) { |
| b.append(" color=red"); |
| } |
| b.append("]\n"); |
| } |
| ts.end(); |
| ts.close(); |
| |
| b.append('}'); |
| return b.toString(); |
| } |
| */ |
| |
| /** Renumbers nodes according to their topo sort */ |
| private Automaton topoSort(Automaton in) { |
| int[] newToOld = Operations.topoSortStates(in); |
| int[] oldToNew = new int[newToOld.length]; |
| |
| Automaton.Builder a = new Automaton.Builder(); |
| //System.out.println("remap:"); |
| for(int i=0;i<newToOld.length;i++) { |
| a.createState(); |
| oldToNew[newToOld[i]] = i; |
| //System.out.println(" " + newToOld[i] + " -> " + i); |
| if (in.isAccept(newToOld[i])) { |
| a.setAccept(i, true); |
| //System.out.println(" **"); |
| } |
| } |
| |
| Transition t = new Transition(); |
| for(int i=0;i<newToOld.length;i++) { |
| int count = in.initTransition(newToOld[i], t); |
| for(int j=0;j<count;j++) { |
| in.getNextTransition(t); |
| a.addTransition(i, oldToNew[t.dest], t.min, t.max); |
| } |
| } |
| |
| return a.finish(); |
| } |
| |
| /** |
* Verifies token types and positionLengths on synonyms of different word counts.
| */ |
| public void testPositionLengthAndType() throws Exception { |
| String testFile = |
| "spider man, spiderman\n" + |
| "usa,united states,u s a,united states of america"; |
| Analyzer analyzer = new MockAnalyzer(random()); |
| SolrSynonymParser parser = new SolrSynonymParser(true, true, analyzer); |
| |
| parser.parse(new StringReader(testFile)); |
| analyzer.close(); |
| |
| SynonymMap map = parser.build(); |
| analyzer = getFlattenAnalyzer(parser, true); |
| |
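// Decode the FST output for "usa": the header vInt packs the synonym count in its upper bits (the low bit is the keepOrig flag), followed by one word id per synonym: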
| // Look up the FST output for "usa" directly and decode it by hand: |
| BytesRef value = Util.get(map.fst, Util.toUTF32(new CharsRef("usa"), new IntsRefBuilder())); |
| ByteArrayDataInput bytesReader = new ByteArrayDataInput(value.bytes, value.offset, value.length); |
| // The vInt header packs the synonym count in its upper bits; the low bit is the keepOrig flag: |
| final int code = bytesReader.readVInt(); |
| final int count = code >>> 1; |
| |
| // Each following vInt is an ordinal into map.words: |
| final int[] synonymsIdxs = new int[count]; |
| for (int i = 0; i < count; i++) { |
| synonymsIdxs[i] = bytesReader.readVInt(); |
| } |
| |
| BytesRef scratchBytes = new BytesRef(); |
| map.words.get(synonymsIdxs[2], scratchBytes); |
| |
| // Count the words in the third synonym by counting WORD_SEPARATOR bytes: |
| int synonymLength = 1; |
| for (int i = scratchBytes.offset; i < scratchBytes.offset + scratchBytes.length; i++) { |
| if (scratchBytes.bytes[i] == SynonymMap.WORD_SEPARATOR) { |
| synonymLength++; |
| } |
| } |
| |
| assertEquals(3, count); |
| assertEquals(4, synonymLength); |
| |
| assertAnalyzesTo(analyzer, "spider man", |
| new String[]{"spiderman", "spider", "man"}, |
| new int[]{0, 0, 7}, |
| new int[]{10, 6, 10}, |
| new String[]{"SYNONYM", "word", "word"}, |
| new int[]{1, 0, 1}, |
| new int[]{2, 1, 1}); |
| |
| assertAnalyzesToPositions(analyzer, "amazing spider man", |
| new String[]{"amazing", "spiderman", "spider", "man"}, |
| new String[]{"word", "SYNONYM", "word", "word"}, |
| new int[]{1, 1, 0, 1}, |
| new int[]{1, 2, 1, 1}); |
| |
| // System.out.println(toDot(getAnalyzer(parser, true).tokenStream("field", new StringReader("the usa is wealthy")))); |
| |
| assertAnalyzesTo(analyzer, "the united states of america is wealthy", |
| new String[]{"the", "usa", "united", "u", "united", "states", "s", "states", "a", "of", "america", "is", "wealthy"}, |
| new int[] {0, 4, 4, 4, 4, 11, 11, 11, 18, 18, 21, 29, 32}, |
| new int[] {3, 28, 10, 10, 10, 28, 17, 17, 28, 20, 28, 31, 39}, |
| new String[]{"word", "SYNONYM", "SYNONYM", "SYNONYM", "word", "SYNONYM", "SYNONYM", "word", "SYNONYM", "word", "word", "word", "word"}, |
| new int[] {1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1}, |
| new int[] {1, 4, 1, 1, 1, 3, 1, 1, 2, 1, 1, 1, 1}); |
| |
| assertAnalyzesToPositions(analyzer, "spiderman", |
| new String[]{"spider", "spiderman", "man"}, |
| new String[]{"SYNONYM", "word", "SYNONYM"}, |
| new int[]{1, 0, 1}, |
| new int[]{1, 2, 1}); |
| |
| assertAnalyzesTo(analyzer, "spiderman enemies", |
| new String[]{"spider", "spiderman", "man", "enemies"}, |
| new int[]{0, 0, 0, 10}, |
| new int[]{9, 9, 9, 17}, |
| new String[]{"SYNONYM", "word", "SYNONYM", "word"}, |
| new int[]{1, 0, 1, 1}, |
| new int[]{1, 2, 1, 1}); |
| |
| assertAnalyzesTo(analyzer, "the usa is wealthy", |
| new String[]{"the", "united", "u", "united", "usa", "states", "s", "states", "a", "of", "america", "is", "wealthy"}, |
| new int[] {0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 8, 11}, |
| new int[] {3, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 10, 18}, |
| new String[]{"word", "SYNONYM", "SYNONYM", "SYNONYM", "word", "SYNONYM", "SYNONYM", "SYNONYM", "SYNONYM", "SYNONYM", "SYNONYM", "word", "word"}, |
| new int[] {1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1}, |
| new int[] {1, 1, 1, 1, 4, 3, 1, 1, 2, 1, 1, 1, 1}); |
| |
| assertGraphStrings(analyzer, "the usa is wealthy", new String[] { |
| "the usa is wealthy", |
| "the united states is wealthy", |
| "the u s a is wealthy", |
| "the united states of america is wealthy", |
| // Wrong: these paths exist only because of the "sausagization" (flattening) of the multi-word synonyms. |
| "the u states is wealthy", |
| "the u states a is wealthy", |
| "the u s of america is wealthy", |
| "the u states of america is wealthy", |
| "the united s a is wealthy", |
| "the united states a is wealthy", |
| "the united s of america is wealthy"}); |
| |
| assertAnalyzesTo(analyzer, "the united states is wealthy", |
| new String[]{"the", "usa", "u", "united", "united", "s", "states", "states", "a", "of", "america", "is", "wealthy"}, |
| new int[] {0, 4, 4, 4, 4, 11, 11, 11, 11, 11, 11, 18, 21}, |
| new int[] {3, 17, 10, 10, 10, 17, 17, 17, 17, 17, 17, 20, 28}, |
| new String[]{"word", "SYNONYM", "SYNONYM", "SYNONYM", "word", "SYNONYM", "SYNONYM", "word", "SYNONYM", "SYNONYM", "SYNONYM", "word", "word"}, |
| new int[] {1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1}, |
| new int[] {1, 4, 1, 1, 1, 1, 1, 3, 2, 1, 1, 1, 1}, |
| false); |
| |
| assertAnalyzesTo(analyzer, "the united states of balance", |
| new String[]{"the", "usa", "u", "united", "united", "s", "states", "states", "a", "of", "america", "of", "balance"}, |
| new int[] {0, 4, 4, 4, 4, 11, 11, 11, 11, 11, 11, 18, 21}, |
| new int[] {3, 17, 10, 10, 10, 17, 17, 17, 17, 17, 17, 20, 28}, |
| new String[]{"word", "SYNONYM", "SYNONYM", "SYNONYM", "word", "SYNONYM", "SYNONYM", "word", "SYNONYM", "SYNONYM", "SYNONYM", "word", "word"}, |
| new int[] {1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1}, |
| new int[] {1, 4, 1, 1, 1, 1, 1, 3, 2, 1, 1, 1, 1}); |
| |
| analyzer.close(); |
| } |
| |
| public void testMultiwordOffsets() throws Exception { |
| SynonymMap.Builder b = new SynonymMap.Builder(true); |
| final boolean keepOrig = true; |
| add(b, "national hockey league", "nhl", keepOrig); |
| Analyzer a = getFlattenAnalyzer(b, true); |
| |
| assertAnalyzesTo(a, "national hockey league", |
| new String[]{"nhl", "national", "hockey", "league"}, |
| new int[]{0, 0, 9, 16}, |
| new int[]{22, 8, 15, 22}, |
| new int[]{1, 0, 1, 1}); |
| a.close(); |
| } |
| |
| public void testIncludeOrig() throws Exception { |
| SynonymMap.Builder b = new SynonymMap.Builder(true); |
| final boolean keepOrig = true; |
| add(b, "a b", "ab", keepOrig); |
| add(b, "a c", "ac", keepOrig); |
| add(b, "a", "aa", keepOrig); |
| add(b, "b", "bb", keepOrig); |
| add(b, "z x c v", "zxcv", keepOrig); |
| add(b, "x c", "xc", keepOrig); |
| |
| Analyzer a = getFlattenAnalyzer(b, true); |
| |
| assertAnalyzesTo(a, "$", |
| new String[]{"$"}, |
| new int[]{1}); |
| assertAnalyzesTo(a, "a", |
| new String[]{"aa", "a"}, |
| new int[]{1, 0}); |
| assertAnalyzesTo(a, "a", |
| new String[]{"aa", "a"}, |
| new int[]{1, 0}); |
| assertAnalyzesTo(a, "$ a", |
| new String[]{"$", "aa", "a"}, |
| new int[]{1, 1, 0}); |
| assertAnalyzesTo(a, "a $", |
| new String[]{"aa", "a", "$"}, |
| new int[]{1, 0, 1}); |
| assertAnalyzesTo(a, "$ a !", |
| new String[]{"$", "aa", "a", "!"}, |
| new int[]{1, 1, 0, 1}); |
| assertAnalyzesTo(a, "a a", |
| new String[]{"aa", "a", "aa", "a"}, |
| new int[]{1, 0, 1, 0}); |
| assertAnalyzesTo(a, "b", |
| new String[]{"bb", "b"}, |
| new int[]{1, 0}); |
| assertAnalyzesTo(a, "z x c v", |
| new String[]{"zxcv", "z", "x", "c", "v"}, |
| new int[]{1, 0, 1, 1, 1}); |
| assertAnalyzesTo(a, "z x c $", |
| new String[]{"z", "xc", "x", "c", "$"}, |
| new int[]{1, 1, 0, 1, 1}); |
| a.close(); |
| } |
| |
| public void testUpperCase() throws IOException { |
| assertMapping("word", "synonym"); |
| assertMapping("word".toUpperCase(Locale.ROOT), "synonym"); |
| } |
| |
| private void assertMapping(String inputString, String outputString) throws IOException { |
| SynonymMap.Builder builder = new SynonymMap.Builder(false); |
| // The rules must be lowercased up front; incoming tokens are then matched case-insensitively (ignoreCase=true below): |
| CharsRef input = SynonymMap.Builder.join(inputString.toLowerCase(Locale.ROOT).split(" "), new CharsRefBuilder()); |
| CharsRef output = SynonymMap.Builder.join(outputString.split(" "), new CharsRefBuilder()); |
| builder.add(input, output, true); |
| Analyzer analyzer = new CustomAnalyzer(builder.build()); |
| TokenStream tokenStream = analyzer.tokenStream("field", inputString); |
| assertTokenStreamContents(tokenStream, new String[]{ |
| outputString, inputString |
| }); |
| } |
| |
| static class CustomAnalyzer extends Analyzer { |
| private final SynonymMap synonymMap; |
| |
| CustomAnalyzer(SynonymMap synonymMap) { |
| this.synonymMap = synonymMap; |
| } |
| |
| @Override |
| protected TokenStreamComponents createComponents(String s) { |
| Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); |
| TokenStream tokenStream = new SynonymGraphFilter(tokenizer, synonymMap, true); // ignoreCase = true |
| return new TokenStreamComponents(tokenizer, tokenStream); |
| } |
| } |
| } |