| using J2N.Text; |
| using Lucene.Net.Analysis.Core; |
| using Lucene.Net.Analysis.TokenAttributes; |
| using Lucene.Net.Support; |
| using Lucene.Net.Util; |
| using NUnit.Framework; |
| using System; |
| using System.Collections.Generic; |
| using System.Diagnostics; |
| using System.Globalization; |
| using System.IO; |
| using System.Text; |
| using System.Text.RegularExpressions; |
| using JCG = J2N.Collections.Generic; |
| using Console = Lucene.Net.Support.SystemConsole; |
| |
| namespace Lucene.Net.Analysis.Synonym |
| { |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| public class TestSynonymMapFilter : BaseTokenStreamTestCase |
| { |
| |
        // Builder each test uses to register synonym rules (via Add) before calling Build().
        private SynonymMap.Builder b;
        // Manually-driven stream pair used by Verify(): tokensIn feeds tokensOut
        // (the SynonymFilter under test) via SetReader.
        private Tokenizer tokensIn;
        private SynonymFilter tokensOut;
        // Attributes of tokensOut, captured once so Verify() can inspect every token.
        private ICharTermAttribute termAtt;
        private IPositionIncrementAttribute posIncrAtt;
        private IPositionLengthAttribute posLenAtt;
        private IOffsetAttribute offsetAtt;

        // Splits rule strings on runs of spaces (multi-word synonym inputs/outputs).
        private static Regex space = new Regex(" +", RegexOptions.Compiled);
| |
| private void Add(string input, string output, bool keepOrig) |
| { |
| if (VERBOSE) |
| { |
| Console.WriteLine(" add input=" + input + " output=" + output + " keepOrig=" + keepOrig); |
| } |
| |
| CharsRef inputCharsRef = new CharsRef(); |
| SynonymMap.Builder.Join(space.Split(input).TrimEnd(), inputCharsRef); |
| |
| CharsRef outputCharsRef = new CharsRef(); |
| SynonymMap.Builder.Join(space.Split(output).TrimEnd(), outputCharsRef); |
| |
| b.Add(inputCharsRef, outputCharsRef, keepOrig); |
| } |
| |
| private void AssertEquals(CharTermAttribute term, string expected) |
| { |
| assertEquals(expected.Length, term.Length); |
| char[] buffer = term.Buffer; |
| for (int chIDX = 0; chIDX < expected.Length; chIDX++) |
| { |
| assertEquals(expected[chIDX], buffer[chIDX]); |
| } |
| } |
| |
| // For the output string: separate positions with a space, |
| // and separate multiple tokens at each position with a |
| // /. If a token should have end offset != the input |
| // token's end offset then add :X to it: |
| |
| // TODO: we should probably refactor this guy to use/take analyzer, |
| // the tests are a little messy |
        private void Verify(string input, string output)
        {
            // Drives tokensIn/tokensOut by hand and checks the produced tokens
            // against the compact expectation string (see the comment above):
            // positions separated by spaces, stacked tokens at one position
            // separated by '/', optional ":endOffset" and "_posLength" suffixes.
            if (VERBOSE)
            {
                Console.WriteLine("TEST: verify input=" + input + " expectedOutput=" + output);
            }

            tokensIn.SetReader(new StringReader(input));
            tokensOut.Reset();
            // One entry per expected position:
            string[] expected = output.Split(' ').TrimEnd();
            int expectedUpto = 0;
            while (tokensOut.IncrementToken())
            {

                if (VERBOSE)
                {
                    Console.WriteLine(" incr token=" + termAtt.ToString() + " posIncr=" + posIncrAtt.PositionIncrement + " startOff=" + offsetAtt.StartOffset + " endOff=" + offsetAtt.EndOffset);
                }

                assertTrue(expectedUpto < expected.Length);
                int startOffset = offsetAtt.StartOffset;
                int endOffset = offsetAtt.EndOffset;

                // All tokens expected stacked at this position:
                string[] expectedAtPos = expected[expectedUpto++].Split('/').TrimEnd();
                for (int atPos = 0; atPos < expectedAtPos.Length; atPos++)
                {
                    if (atPos > 0)
                    {
                        // Pull the next stacked token; its posIncr must be 0 (asserted below).
                        assertTrue(tokensOut.IncrementToken());
                        if (VERBOSE)
                        {
                            Console.WriteLine(" incr token=" + termAtt.ToString() + " posIncr=" + posIncrAtt.PositionIncrement + " startOff=" + offsetAtt.StartOffset + " endOff=" + offsetAtt.EndOffset);
                        }
                    }
                    // Parse the optional ":endOffset" and "_posLen" suffixes:
                    int colonIndex = expectedAtPos[atPos].IndexOf(':');
                    int underbarIndex = expectedAtPos[atPos].IndexOf('_');
                    string expectedToken;
                    int expectedEndOffset;
                    int expectedPosLen;
                    if (colonIndex != -1)
                    {
                        expectedToken = expectedAtPos[atPos].Substring(0, colonIndex - 0);
                        if (underbarIndex != -1)
                        {
                            expectedEndOffset = int.Parse(expectedAtPos[atPos].Substring(1 + colonIndex, underbarIndex - (1 + colonIndex)), CultureInfo.InvariantCulture);
                            expectedPosLen = int.Parse(expectedAtPos[atPos].Substring(1 + underbarIndex), CultureInfo.InvariantCulture);
                        }
                        else
                        {
                            expectedEndOffset = int.Parse(expectedAtPos[atPos].Substring(1 + colonIndex), CultureInfo.InvariantCulture);
                            expectedPosLen = 1;
                        }
                    }
                    else
                    {
                        // No suffix: end offset defaults to the actual token's
                        // end offset, position length defaults to 1.
                        expectedToken = expectedAtPos[atPos];
                        expectedEndOffset = endOffset;
                        expectedPosLen = 1;
                    }
                    assertEquals(expectedToken, termAtt.ToString());
                    assertEquals(atPos == 0 ? 1 : 0, posIncrAtt.PositionIncrement);
                    // start/end offset of all tokens at same pos should
                    // be the same:
                    assertEquals(startOffset, offsetAtt.StartOffset);
                    assertEquals(expectedEndOffset, offsetAtt.EndOffset);
                    assertEquals(expectedPosLen, posLenAtt.PositionLength);
                }
            }
            tokensOut.End();
            tokensOut.Dispose();
            if (VERBOSE)
            {
                Console.WriteLine(" incr: END");
            }
            // Every expected position must have been consumed:
            assertEquals(expectedUpto, expected.Length);
        }
| |
| [Test] |
| public virtual void TestDontKeepOrig() |
| { |
| b = new SynonymMap.Builder(true); |
| Add("a b", "foo", false); |
| |
| SynonymMap map = b.Build(); |
| Analyzer analyzer = new AnalyzerAnonymousInnerClassHelper(this, map); |
| |
| AssertAnalyzesTo(analyzer, "a b c", |
| new string[] { "foo", "c" }, |
| new int[] { 0, 4 }, |
| new int[] { 3, 5 }, |
| null, |
| new int[] { 1, 1 }, |
| new int[] { 1, 1 }, |
| true); |
| CheckAnalysisConsistency(Random, analyzer, false, "a b c"); |
| } |
| |
| private class AnalyzerAnonymousInnerClassHelper : Analyzer |
| { |
| private readonly TestSynonymMapFilter outerInstance; |
| |
| private SynonymMap map; |
| |
| public AnalyzerAnonymousInnerClassHelper(TestSynonymMapFilter outerInstance, SynonymMap map) |
| { |
| this.outerInstance = outerInstance; |
| this.map = map; |
| } |
| |
| protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader) |
| { |
| Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true); |
| return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, false)); |
| } |
| } |
| |
| [Test] |
| public virtual void TestDoKeepOrig() |
| { |
| b = new SynonymMap.Builder(true); |
| Add("a b", "foo", true); |
| |
| SynonymMap map = b.Build(); |
| |
| Analyzer analyzer = new AnalyzerAnonymousInnerClassHelper2(this, map); |
| |
| AssertAnalyzesTo(analyzer, "a b c", |
| new string[] { "a", "foo", "b", "c" }, |
| new int[] { 0, 0, 2, 4 }, |
| new int[] { 1, 3, 3, 5 }, |
| null, |
| new int[] { 1, 0, 1, 1 }, |
| new int[] { 1, 2, 1, 1 }, |
| true); |
| CheckAnalysisConsistency(Random, analyzer, false, "a b c"); |
| } |
| |
| private class AnalyzerAnonymousInnerClassHelper2 : Analyzer |
| { |
| private readonly TestSynonymMapFilter outerInstance; |
| |
| private SynonymMap map; |
| |
| public AnalyzerAnonymousInnerClassHelper2(TestSynonymMapFilter outerInstance, SynonymMap map) |
| { |
| this.outerInstance = outerInstance; |
| this.map = map; |
| } |
| |
| protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader) |
| { |
| Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true); |
| return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, false)); |
| } |
| } |
| |
        [Test]
        public virtual void TestBasic()
        {
            // Exercises overlapping, multi-word, and multi-output rules against
            // the compact expectation format parsed by Verify().
            b = new SynonymMap.Builder(true);
            Add("a", "foo", true);
            Add("a b", "bar fee", true);
            Add("b c", "dog collar", true);
            Add("c d", "dog harness holder extras", true);
            Add("m c e", "dog barks loudly", false);
            Add("i j k", "feep", true);

            // Two outputs registered for the same input:
            Add("e f", "foo bar", false);
            Add("e f", "baz bee", false);

            Add("z", "boo", false);
            Add("y", "bee", true);

            // Exhaust a throwaway tokenizer; Verify() re-uses it via SetReader.
            tokensIn = new MockTokenizer(new StringReader("a"), MockTokenizer.WHITESPACE, true);
            tokensIn.Reset();
            assertTrue(tokensIn.IncrementToken());
            assertFalse(tokensIn.IncrementToken());
            tokensIn.End();
            tokensIn.Dispose();

            tokensOut = new SynonymFilter(tokensIn, b.Build(), true);
            termAtt = tokensOut.AddAttribute<ICharTermAttribute>();
            posIncrAtt = tokensOut.AddAttribute<IPositionIncrementAttribute>();
            posLenAtt = tokensOut.AddAttribute<IPositionLengthAttribute>();
            offsetAtt = tokensOut.AddAttribute<IOffsetAttribute>();

            Verify("a b c", "a/bar b/fee c");

            // syn output extends beyond input tokens
            Verify("x a b c d", "x a/bar b/fee c/dog d/harness holder extras");

            Verify("a b a", "a/bar b/fee a/foo");

            // outputs that add to one another:
            Verify("c d c d", "c/dog d/harness c/holder/dog d/extras/harness holder extras");

            // two outputs for same input
            Verify("e f", "foo/baz bar/bee");

            // verify multi-word / single-output offsets:
            Verify("g i j k g", "g i/feep:7_3 j k g");

            // mixed keepOrig true/false:
            Verify("a m c e x", "a/foo dog barks loudly x");
            Verify("c d m c e x", "c/dog d/harness holder/dog extras/barks loudly x");
            assertTrue(tokensOut.CaptureCount > 0);

            // no captureStates when no syns matched
            Verify("p q r s t", "p q r s t");
            assertEquals(0, tokensOut.CaptureCount);

            // no captureStates when only single-input syns, w/ no
            // lookahead needed, matched
            Verify("p q z y t", "p q boo y/bee t");
            assertEquals(0, tokensOut.CaptureCount);
        }
| |
| private string GetRandomString(char start, int alphabetSize, int length) |
| { |
| Debug.Assert(alphabetSize <= 26); |
| char[] s = new char[2 * length]; |
| for (int charIDX = 0; charIDX < length; charIDX++) |
| { |
| s[2 * charIDX] = (char)(start + Random.Next(alphabetSize)); |
| s[2 * charIDX + 1] = ' '; |
| } |
| return new string(s); |
| } |
| |
        // One synonym rule as seen by the slow reference matcher.
        protected class OneSyn
        {
            internal string @in;         // space-separated input tokens
            internal IList<string> @out; // every output registered for this input
            internal bool keepOrig;      // whether the original tokens are kept
        }
| |
        // Reference (oracle) matcher: applies syns to doc (one char + one space
        // per input token) and renders the expectation string that Verify()
        // parses.  maxOutputLength bounds how far synonym outputs may hang past
        // the end of the input.
        protected virtual string SlowSynMatcher(string doc, IList<OneSyn> syns, int maxOutputLength)
        {
            assertTrue(doc.Length % 2 == 0);
            int numInputs = doc.Length / 2;
            bool[] keepOrigs = new bool[numInputs];
            bool[] hasMatch = new bool[numInputs];
            Arrays.Fill(keepOrigs, false);
            string[] outputs = new string[numInputs + maxOutputLength];
            OneSyn[] matches = new OneSyn[numInputs];
            // Record, per input position, which syn matches there; on conflict
            // the rule with the longer input wins (greedy):
            foreach (OneSyn syn in syns)
            {
                int idx = -1;
                while (true)
                {
                    idx = doc.IndexOf(syn.@in, 1 + idx, StringComparison.Ordinal);
                    if (idx == -1)
                    {
                        break;
                    }
                    // Matches must start on a token boundary (even char index):
                    assertTrue(idx % 2 == 0);
                    int matchIDX = idx / 2;
                    assertTrue(syn.@in.Length % 2 == 1);
                    if (matches[matchIDX] == null)
                    {
                        matches[matchIDX] = syn;
                    }
                    else if (syn.@in.Length > matches[matchIDX].@in.Length)
                    {
                        // Greedy conflict resolution: longer match wins:
                        matches[matchIDX] = syn;
                    }
                    else
                    {
                        assertTrue(syn.@in.Length < matches[matchIDX].@in.Length);
                    }
                }
            }

            // Greedy conflict resolution: if syn matches a range of inputs,
            // it prevents other syns from matching that range
            for (int inputIDX = 0; inputIDX < numInputs; inputIDX++)
            {
                OneSyn match = matches[inputIDX];
                if (match != null)
                {
                    int synInLength = (1 + match.@in.Length) / 2;
                    for (int nextInputIDX = inputIDX + 1; nextInputIDX < numInputs && nextInputIDX < (inputIDX + synInLength); nextInputIDX++)
                    {
                        matches[nextInputIDX] = null;
                    }
                }
            }

            // Fill overlapping outputs:
            for (int inputIDX = 0; inputIDX < numInputs; inputIDX++)
            {
                OneSyn syn = matches[inputIDX];
                if (syn == null)
                {
                    continue;
                }
                // Mark the matched input range and remember whether any
                // matching rule keeps the original tokens:
                for (int idx = 0; idx < (1 + syn.@in.Length) / 2; idx++)
                {
                    hasMatch[inputIDX + idx] = true;
                    keepOrigs[inputIDX + idx] |= syn.keepOrig;
                }
                foreach (string synOut in syn.@out)
                {
                    string[] synOutputs = synOut.Split(' ').TrimEnd();
                    assertEquals(synOutputs.Length, (1 + synOut.Length) / 2);
                    int matchEnd = inputIDX + synOutputs.Length;
                    int synUpto = 0;
                    for (int matchIDX = inputIDX; matchIDX < matchEnd; matchIDX++)
                    {
                        // Stack multiple outputs at one position with '/':
                        if (outputs[matchIDX] == null)
                        {
                            outputs[matchIDX] = synOutputs[synUpto++];
                        }
                        else
                        {
                            outputs[matchIDX] = outputs[matchIDX] + "/" + synOutputs[synUpto++];
                        }
                        int endOffset;
                        if (matchIDX < numInputs)
                        {
                            int posLen;
                            if (synOutputs.Length == 1)
                            {
                                // Add full endOffset
                                endOffset = (inputIDX * 2) + syn.@in.Length;
                                posLen = syn.keepOrig ? (1 + syn.@in.Length) / 2 : 1;
                            }
                            else
                            {
                                // Add endOffset matching input token's
                                endOffset = (matchIDX * 2) + 1;
                                posLen = 1;
                            }
                            outputs[matchIDX] = outputs[matchIDX] + ":" + endOffset + "_" + posLen;
                        }
                    }
                }
            }

            // Render: original tokens (when unmatched or kept) plus stacked outputs.
            StringBuilder sb = new StringBuilder();
            string[] inputTokens = doc.Split(' ').TrimEnd();
            int limit = inputTokens.Length + maxOutputLength;
            for (int inputIDX = 0; inputIDX < limit; inputIDX++)
            {
                bool posHasOutput = false;
                if (inputIDX >= numInputs && outputs[inputIDX] == null)
                {
                    break;
                }
                if (inputIDX < numInputs && (!hasMatch[inputIDX] || keepOrigs[inputIDX]))
                {
                    assertTrue(inputTokens[inputIDX].Length != 0);
                    sb.Append(inputTokens[inputIDX]);
                    posHasOutput = true;
                }

                if (outputs[inputIDX] != null)
                {
                    if (posHasOutput)
                    {
                        sb.Append('/');
                    }
                    sb.Append(outputs[inputIDX]);
                }
                else if (!posHasOutput)
                {
                    // Position produced nothing; don't emit a separator for it.
                    continue;
                }
                if (inputIDX < limit - 1)
                {
                    sb.Append(' ');
                }
            }

            return sb.ToString();
        }
| |
        [Test]
        public virtual void TestRandom()
        {
            // Random document + random rules, verified against the slow
            // reference matcher (SlowSynMatcher).  NOTE: the order of draws
            // from Random below is significant for seed reproducibility.

            int alphabetSize = TestUtil.NextInt32(Random, 2, 7);

            int docLen = AtLeast(3000);
            //final int docLen = 50;

            string document = GetRandomString('a', alphabetSize, docLen);

            if (VERBOSE)
            {
                Console.WriteLine("TEST: doc=" + document);
            }

            int numSyn = AtLeast(5);
            //final int numSyn = 2;

            IDictionary<string, OneSyn> synMap = new Dictionary<string, OneSyn>();
            IList<OneSyn> syns = new List<OneSyn>();
            bool dedup = Random.nextBoolean();
            if (VERBOSE)
            {
                Console.WriteLine(" dedup=" + dedup);
            }
            b = new SynonymMap.Builder(dedup);
            for (int synIDX = 0; synIDX < numSyn; synIDX++)
            {
                // Inputs come from the doc alphabet; outputs from digits so
                // they can't accidentally collide with inputs:
                string synIn = GetRandomString('a', alphabetSize, TestUtil.NextInt32(Random, 1, 5)).Trim();
                if (!synMap.TryGetValue(synIn, out OneSyn s) || s == null)
                {
                    s = new OneSyn();
                    s.@in = synIn;
                    syns.Add(s);
                    s.@out = new List<string>();
                    synMap[synIn] = s;
                    s.keepOrig = Random.nextBoolean();
                }
                string synOut = GetRandomString('0', 10, TestUtil.NextInt32(Random, 1, 5)).Trim();
                s.@out.Add(synOut);
                Add(synIn, synOut, s.keepOrig);
                if (VERBOSE)
                {
                    Console.WriteLine(" syns[" + synIDX + "] = " + s.@in + " -> " + s.@out + " keepOrig=" + s.keepOrig);
                }
            }

            // Exhaust a throwaway tokenizer; Verify() re-uses it via SetReader.
            tokensIn = new MockTokenizer(new StringReader("a"), MockTokenizer.WHITESPACE, true);
            tokensIn.Reset();
            assertTrue(tokensIn.IncrementToken());
            assertFalse(tokensIn.IncrementToken());
            tokensIn.End();
            tokensIn.Dispose();

            tokensOut = new SynonymFilter(tokensIn, b.Build(), true);
            termAtt = tokensOut.AddAttribute<ICharTermAttribute>();
            posIncrAtt = tokensOut.AddAttribute<IPositionIncrementAttribute>();
            posLenAtt = tokensOut.AddAttribute<IPositionLengthAttribute>();
            offsetAtt = tokensOut.AddAttribute<IOffsetAttribute>();

            // The slow matcher must agree with the builder's dedup behavior:
            if (dedup)
            {
                PruneDups(syns);
            }

            string expected = SlowSynMatcher(document, syns, 5);

            if (VERBOSE)
            {
                Console.WriteLine("TEST: expected=" + expected);
            }

            Verify(document, expected);
        }
| |
| private void PruneDups(IList<OneSyn> syns) |
| { |
| ISet<string> seen = new JCG.HashSet<string>(); |
| foreach (OneSyn syn in syns) |
| { |
| int idx = 0; |
| while (idx < syn.@out.Count) |
| { |
| string @out = syn.@out[idx]; |
| if (!seen.Contains(@out)) |
| { |
| seen.Add(@out); |
| idx++; |
| } |
| else |
| { |
| syn.@out.RemoveAt(idx); |
| } |
| } |
| seen.Clear(); |
| } |
| } |
| |
| private string RandomNonEmptyString() |
| { |
| while (true) |
| { |
| string s = TestUtil.RandomUnicodeString(Random).Trim(); |
| if (s.Length != 0 && s.IndexOf('\u0000') == -1) |
| { |
| return s; |
| } |
| } |
| } |
| |
| /// <summary> |
        /// simple random test, doesn't verify correctness.
        /// does verify it doesn't throw exceptions, or that the stream doesn't misbehave
| /// </summary> |
| [Test] |
| public virtual void TestRandom2() |
| { |
| int numIters = AtLeast(3); |
| for (int i = 0; i < numIters; i++) |
| { |
| b = new SynonymMap.Builder(Random.nextBoolean()); |
| int numEntries = AtLeast(10); |
| for (int j = 0; j < numEntries; j++) |
| { |
| Add(RandomNonEmptyString(), RandomNonEmptyString(), Random.nextBoolean()); |
| } |
| SynonymMap map = b.Build(); |
| bool ignoreCase = Random.nextBoolean(); |
| |
| Analyzer analyzer = new AnalyzerAnonymousInnerClassHelper100(this, map, ignoreCase); |
| |
| CheckRandomData(Random, analyzer, 100); |
| } |
| } |
| |
| private class AnalyzerAnonymousInnerClassHelper100 : Analyzer |
| { |
| private readonly TestSynonymMapFilter outerInstance; |
| |
| private SynonymMap map; |
| private bool ignoreCase; |
| |
| public AnalyzerAnonymousInnerClassHelper100(TestSynonymMapFilter outerInstance, SynonymMap map, bool ignoreCase) |
| { |
| this.outerInstance = outerInstance; |
| this.map = map; |
| this.ignoreCase = ignoreCase; |
| } |
| |
| protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader) |
| { |
| Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true); |
| return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, ignoreCase)); |
| } |
| } |
| |
| // NOTE: this is an invalid test... SynFilter today can't |
| // properly consume a graph... we can re-enable this once |
| // we fix that... |
| /* |
| // Adds MockGraphTokenFilter before SynFilter: |
| public void TestRandom2GraphBefore() throws Exception { |
| final int numIters = AtLeast(10); |
| Random random = Random; |
| for (int i = 0; i < numIters; i++) { |
| b = new SynonymMap.Builder(random.nextBoolean()); |
| final int numEntries = AtLeast(10); |
| for (int j = 0; j < numEntries; j++) { |
| add(randomNonEmptyString(), randomNonEmptyString(), random.nextBoolean()); |
| } |
| final SynonymMap map = b.Build(); |
| final boolean ignoreCase = random.nextBoolean(); |
| |
| final Analyzer analyzer = new Analyzer() { |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName, Reader reader) { |
| Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true); |
| TokenStream graph = new MockGraphTokenFilter(Random, tokenizer); |
| return new TokenStreamComponents(tokenizer, new SynonymFilter(graph, map, ignoreCase)); |
| } |
| }; |
| |
| checkRandomData(random, analyzer, 1000*RANDOM_MULTIPLIER); |
| } |
| } |
| */ |
| |
| // Adds MockGraphTokenFilter after SynFilter: |
| [Test] |
| public virtual void TestRandom2GraphAfter() |
| { |
| int numIters = AtLeast(3); |
| Random random = Random; |
| for (int i = 0; i < numIters; i++) |
| { |
| b = new SynonymMap.Builder(random.nextBoolean()); |
| int numEntries = AtLeast(10); |
| for (int j = 0; j < numEntries; j++) |
| { |
| Add(RandomNonEmptyString(), RandomNonEmptyString(), random.nextBoolean()); |
| } |
| SynonymMap map = b.Build(); |
| bool ignoreCase = random.nextBoolean(); |
| |
| Analyzer analyzer = new AnalyzerAnonymousInnerClassHelper101(this, map, ignoreCase); |
| |
| CheckRandomData(random, analyzer, 100); |
| } |
| } |
| |
| private class AnalyzerAnonymousInnerClassHelper101 : Analyzer |
| { |
| private readonly TestSynonymMapFilter outerInstance; |
| |
| private SynonymMap map; |
| private bool ignoreCase; |
| |
| public AnalyzerAnonymousInnerClassHelper101(TestSynonymMapFilter outerInstance, SynonymMap map, bool ignoreCase) |
| { |
| this.outerInstance = outerInstance; |
| this.map = map; |
| this.ignoreCase = ignoreCase; |
| } |
| |
| protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader) |
| { |
| Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true); |
| TokenStream syns = new SynonymFilter(tokenizer, map, ignoreCase); |
| TokenStream graph = new MockGraphTokenFilter(Random, syns); |
| return new TokenStreamComponents(tokenizer, graph); |
| } |
| } |
| |
| [Test] |
| public virtual void TestEmptyTerm() |
| { |
| Random random = Random; |
| int numIters = AtLeast(10); |
| for (int i = 0; i < numIters; i++) |
| { |
| b = new SynonymMap.Builder(random.nextBoolean()); |
| int numEntries = AtLeast(10); |
| for (int j = 0; j < numEntries; j++) |
| { |
| Add(RandomNonEmptyString(), RandomNonEmptyString(), random.nextBoolean()); |
| } |
| SynonymMap map = b.Build(); |
| bool ignoreCase = random.nextBoolean(); |
| |
| Analyzer analyzer = new AnalyzerAnonymousInnerClassHelper3(this, map, ignoreCase); |
| |
| CheckAnalysisConsistency(random, analyzer, random.nextBoolean(), ""); |
| } |
| } |
| |
| private class AnalyzerAnonymousInnerClassHelper3 : Analyzer |
| { |
| private readonly TestSynonymMapFilter outerInstance; |
| |
| private SynonymMap map; |
| private bool ignoreCase; |
| |
| public AnalyzerAnonymousInnerClassHelper3(TestSynonymMapFilter outerInstance, SynonymMap map, bool ignoreCase) |
| { |
| this.outerInstance = outerInstance; |
| this.map = map; |
| this.ignoreCase = ignoreCase; |
| } |
| |
| protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader) |
| { |
| Tokenizer tokenizer = new KeywordTokenizer(reader); |
| return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, ignoreCase)); |
| } |
| } |
| |
| /// <summary> |
| /// simple random test like testRandom2, but for larger docs |
| /// </summary> |
| [Test] |
| public virtual void TestRandomHuge() |
| { |
| Random random = Random; |
| int numIters = AtLeast(3); |
| for (int i = 0; i < numIters; i++) |
| { |
| b = new SynonymMap.Builder(random.nextBoolean()); |
| int numEntries = AtLeast(10); |
| if (VERBOSE) |
| { |
| Console.WriteLine("TEST: iter=" + i + " numEntries=" + numEntries); |
| } |
| for (int j = 0; j < numEntries; j++) |
| { |
| Add(RandomNonEmptyString(), RandomNonEmptyString(), random.nextBoolean()); |
| } |
| SynonymMap map = b.Build(); |
| bool ignoreCase = random.nextBoolean(); |
| |
| Analyzer analyzer = new AnalyzerAnonymousInnerClassHelper4(this, map, ignoreCase); |
| |
| CheckRandomData(random, analyzer, 100, 1024); |
| } |
| } |
| |
| private class AnalyzerAnonymousInnerClassHelper4 : Analyzer |
| { |
| private readonly TestSynonymMapFilter outerInstance; |
| |
| private SynonymMap map; |
| private bool ignoreCase; |
| |
| public AnalyzerAnonymousInnerClassHelper4(TestSynonymMapFilter outerInstance, SynonymMap map, bool ignoreCase) |
| { |
| this.outerInstance = outerInstance; |
| this.map = map; |
| this.ignoreCase = ignoreCase; |
| } |
| |
| protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader) |
| { |
| Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true); |
| return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, ignoreCase)); |
| } |
| } |
| |
| // LUCENE-3375 |
| [Test] |
| public virtual void TestVanishingTerms() |
| { |
| string testFile = "aaa => aaaa1 aaaa2 aaaa3\n" + "bbb => bbbb1 bbbb2\n"; |
| |
| SolrSynonymParser parser = new SolrSynonymParser(true, true, new MockAnalyzer(Random)); |
| parser.Parse(new StringReader(testFile)); |
| SynonymMap map = parser.Build(); |
| |
| Analyzer analyzer = new AnalyzerAnonymousInnerClassHelper5(this, map); |
| |
| // where did my pot go?! |
| AssertAnalyzesTo(analyzer, "xyzzy bbb pot of gold", new string[] { "xyzzy", "bbbb1", "pot", "bbbb2", "of", "gold" }); |
| |
| // this one nukes 'pot' and 'of' |
| // xyzzy aaa pot of gold -> xyzzy aaaa1 aaaa2 aaaa3 gold |
| AssertAnalyzesTo(analyzer, "xyzzy aaa pot of gold", new string[] { "xyzzy", "aaaa1", "pot", "aaaa2", "of", "aaaa3", "gold" }); |
| } |
| |
| private class AnalyzerAnonymousInnerClassHelper5 : Analyzer |
| { |
| private readonly TestSynonymMapFilter outerInstance; |
| |
| private SynonymMap map; |
| |
| public AnalyzerAnonymousInnerClassHelper5(TestSynonymMapFilter outerInstance, SynonymMap map) |
| { |
| this.outerInstance = outerInstance; |
| this.map = map; |
| } |
| |
| protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader) |
| { |
| Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, true); |
| return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true)); |
| } |
| } |
| |
| [Test] |
| public virtual void TestBasic2() |
| { |
| b = new SynonymMap.Builder(true); |
| const bool keepOrig = false; |
| Add("aaa", "aaaa1 aaaa2 aaaa3", keepOrig); |
| Add("bbb", "bbbb1 bbbb2", keepOrig); |
| tokensIn = new MockTokenizer(new StringReader("a"), MockTokenizer.WHITESPACE, true); |
| tokensIn.Reset(); |
| assertTrue(tokensIn.IncrementToken()); |
| assertFalse(tokensIn.IncrementToken()); |
| tokensIn.End(); |
| tokensIn.Dispose(); |
| |
| tokensOut = new SynonymFilter(tokensIn, b.Build(), true); |
| termAtt = tokensOut.AddAttribute<ICharTermAttribute>(); |
| posIncrAtt = tokensOut.AddAttribute<IPositionIncrementAttribute>(); |
| posLenAtt = tokensOut.AddAttribute<IPositionLengthAttribute>(); |
| offsetAtt = tokensOut.AddAttribute<IOffsetAttribute>(); |
| |
| #pragma warning disable 162 |
| if (keepOrig) |
| { |
| Verify("xyzzy bbb pot of gold", "xyzzy bbb/bbbb1 pot/bbbb2 of gold"); |
| Verify("xyzzy aaa pot of gold", "xyzzy aaa/aaaa1 pot/aaaa2 of/aaaa3 gold"); |
| } |
| else |
| { |
| Verify("xyzzy bbb pot of gold", "xyzzy bbbb1 pot/bbbb2 of gold"); |
| Verify("xyzzy aaa pot of gold", "xyzzy aaaa1 pot/aaaa2 of/aaaa3 gold"); |
| } |
| #pragma warning restore 612, 618 |
| } |
| |
| [Test] |
| public virtual void TestMatching() |
| { |
| b = new SynonymMap.Builder(true); |
| const bool keepOrig = false; |
| Add("a b", "ab", keepOrig); |
| Add("a c", "ac", keepOrig); |
| Add("a", "aa", keepOrig); |
| Add("b", "bb", keepOrig); |
| Add("z x c v", "zxcv", keepOrig); |
| Add("x c", "xc", keepOrig); |
| SynonymMap map = b.Build(); |
| Analyzer a = new AnalyzerAnonymousInnerClassHelper6(this, map); |
| |
| CheckOneTerm(a, "$", "$"); |
| CheckOneTerm(a, "a", "aa"); |
| CheckOneTerm(a, "b", "bb"); |
| |
| AssertAnalyzesTo(a, "a $", new string[] { "aa", "$" }, new int[] { 1, 1 }); |
| |
| AssertAnalyzesTo(a, "$ a", new string[] { "$", "aa" }, new int[] { 1, 1 }); |
| |
| AssertAnalyzesTo(a, "a a", new string[] { "aa", "aa" }, new int[] { 1, 1 }); |
| |
| AssertAnalyzesTo(a, "z x c v", new string[] { "zxcv" }, new int[] { 1 }); |
| |
| AssertAnalyzesTo(a, "z x c $", new string[] { "z", "xc", "$" }, new int[] { 1, 1, 1 }); |
| } |
| |
| private class AnalyzerAnonymousInnerClassHelper6 : Analyzer |
| { |
| private readonly TestSynonymMapFilter outerInstance; |
| |
| private SynonymMap map; |
| |
| public AnalyzerAnonymousInnerClassHelper6(TestSynonymMapFilter outerInstance, SynonymMap map) |
| { |
| this.outerInstance = outerInstance; |
| this.map = map; |
| } |
| |
| protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader) |
| { |
| Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); |
| return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true)); |
| } |
| } |
| |
| [Test] |
| public virtual void TestRepeatsOff() |
| { |
| b = new SynonymMap.Builder(true); |
| const bool keepOrig = false; |
| Add("a b", "ab", keepOrig); |
| Add("a b", "ab", keepOrig); |
| Add("a b", "ab", keepOrig); |
| SynonymMap map = b.Build(); |
| Analyzer a = new AnalyzerAnonymousInnerClassHelper7(this, map); |
| |
| AssertAnalyzesTo(a, "a b", new string[] { "ab" }, new int[] { 1 }); |
| } |
| |
| private class AnalyzerAnonymousInnerClassHelper7 : Analyzer |
| { |
| private readonly TestSynonymMapFilter outerInstance; |
| |
| private SynonymMap map; |
| |
| public AnalyzerAnonymousInnerClassHelper7(TestSynonymMapFilter outerInstance, SynonymMap map) |
| { |
| this.outerInstance = outerInstance; |
| this.map = map; |
| } |
| |
| protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader) |
| { |
| Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); |
| return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true)); |
| } |
| } |
| |
| [Test] |
| public virtual void TestRepeatsOn() |
| { |
| b = new SynonymMap.Builder(false); |
| const bool keepOrig = false; |
| Add("a b", "ab", keepOrig); |
| Add("a b", "ab", keepOrig); |
| Add("a b", "ab", keepOrig); |
| SynonymMap map = b.Build(); |
| Analyzer a = new AnalyzerAnonymousInnerClassHelper8(this, map); |
| |
| AssertAnalyzesTo(a, "a b", new string[] { "ab", "ab", "ab" }, new int[] { 1, 0, 0 }); |
| } |
| |
| private class AnalyzerAnonymousInnerClassHelper8 : Analyzer |
| { |
| private readonly TestSynonymMapFilter outerInstance; |
| |
| private SynonymMap map; |
| |
| public AnalyzerAnonymousInnerClassHelper8(TestSynonymMapFilter outerInstance, SynonymMap map) |
| { |
| this.outerInstance = outerInstance; |
| this.map = map; |
| } |
| |
| protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader) |
| { |
| Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); |
| return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true)); |
| } |
| } |
| |
| [Test] |
| public virtual void TestRecursion() |
| { |
| b = new SynonymMap.Builder(true); |
| const bool keepOrig = false; |
| Add("zoo", "zoo", keepOrig); |
| SynonymMap map = b.Build(); |
| Analyzer a = new AnalyzerAnonymousInnerClassHelper9(this, map); |
| |
| AssertAnalyzesTo(a, "zoo zoo $ zoo", new string[] { "zoo", "zoo", "$", "zoo" }, new int[] { 1, 1, 1, 1 }); |
| } |
| |
| private class AnalyzerAnonymousInnerClassHelper9 : Analyzer |
| { |
| private readonly TestSynonymMapFilter outerInstance; |
| |
| private SynonymMap map; |
| |
| public AnalyzerAnonymousInnerClassHelper9(TestSynonymMapFilter outerInstance, SynonymMap map) |
| { |
| this.outerInstance = outerInstance; |
| this.map = map; |
| } |
| |
| protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader) |
| { |
| Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); |
| return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true)); |
| } |
| } |
| |
| [Test] |
| public virtual void TestRecursion2() |
| { |
| b = new SynonymMap.Builder(true); |
| const bool keepOrig = false; |
| Add("zoo", "zoo", keepOrig); |
| Add("zoo", "zoo zoo", keepOrig); |
| SynonymMap map = b.Build(); |
| Analyzer a = new AnalyzerAnonymousInnerClassHelper10(this, map); |
| |
| // verify("zoo zoo $ zoo", "zoo/zoo zoo/zoo/zoo $/zoo zoo/zoo zoo"); |
| AssertAnalyzesTo(a, "zoo zoo $ zoo", new string[] { "zoo", "zoo", "zoo", "zoo", "zoo", "$", "zoo", "zoo", "zoo", "zoo" }, new int[] { 1, 0, 1, 0, 0, 1, 0, 1, 0, 1 }); |
| } |
| |
| private class AnalyzerAnonymousInnerClassHelper10 : Analyzer |
| { |
| private readonly TestSynonymMapFilter outerInstance; |
| |
| private SynonymMap map; |
| |
| public AnalyzerAnonymousInnerClassHelper10(TestSynonymMapFilter outerInstance, SynonymMap map) |
| { |
| this.outerInstance = outerInstance; |
| this.map = map; |
| } |
| |
| protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader) |
| { |
| Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); |
| return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true)); |
| } |
| } |
| |
        [Test]
        public virtual void TestOutputHangsOffEnd()
        {
            b = new SynonymMap.Builder(true);
            const bool keepOrig = false;
            // b hangs off the end (no input token under it):
            Add("a", "a b", keepOrig);
            // Drain the tokenizer once up front so it is in a known state;
            // Verify presumably re-readers tokensIn before each run — TODO confirm.
            tokensIn = new MockTokenizer(new StringReader("a"), MockTokenizer.WHITESPACE, true);
            tokensIn.Reset();
            assertTrue(tokensIn.IncrementToken());
            assertFalse(tokensIn.IncrementToken());
            tokensIn.End();
            tokensIn.Dispose();

            // Wrap the (already consumed) tokenizer and grab the attributes
            // that Verify inspects on each emitted token.
            tokensOut = new SynonymFilter(tokensIn, b.Build(), true);
            termAtt = tokensOut.AddAttribute<ICharTermAttribute>();
            posIncrAtt = tokensOut.AddAttribute<IPositionIncrementAttribute>();
            offsetAtt = tokensOut.AddAttribute<IOffsetAttribute>();
            posLenAtt = tokensOut.AddAttribute<IPositionLengthAttribute>();

            // Make sure endOffset inherits from previous input token:
            Verify("a", "a b:1");
        }
| |
| [Test] |
| public virtual void TestIncludeOrig() |
| { |
| b = new SynonymMap.Builder(true); |
| const bool keepOrig = true; |
| Add("a b", "ab", keepOrig); |
| Add("a c", "ac", keepOrig); |
| Add("a", "aa", keepOrig); |
| Add("b", "bb", keepOrig); |
| Add("z x c v", "zxcv", keepOrig); |
| Add("x c", "xc", keepOrig); |
| SynonymMap map = b.Build(); |
| Analyzer a = new AnalyzerAnonymousInnerClassHelper11(this, map); |
| |
| AssertAnalyzesTo(a, "$", new string[] { "$" }, new int[] { 1 }); |
| AssertAnalyzesTo(a, "a", new string[] { "a", "aa" }, new int[] { 1, 0 }); |
| AssertAnalyzesTo(a, "a", new string[] { "a", "aa" }, new int[] { 1, 0 }); |
| AssertAnalyzesTo(a, "$ a", new string[] { "$", "a", "aa" }, new int[] { 1, 1, 0 }); |
| AssertAnalyzesTo(a, "a $", new string[] { "a", "aa", "$" }, new int[] { 1, 0, 1 }); |
| AssertAnalyzesTo(a, "$ a !", new string[] { "$", "a", "aa", "!" }, new int[] { 1, 1, 0, 1 }); |
| AssertAnalyzesTo(a, "a a", new string[] { "a", "aa", "a", "aa" }, new int[] { 1, 0, 1, 0 }); |
| AssertAnalyzesTo(a, "b", new string[] { "b", "bb" }, new int[] { 1, 0 }); |
| AssertAnalyzesTo(a, "z x c v", new string[] { "z", "zxcv", "x", "c", "v" }, new int[] { 1, 0, 1, 1, 1 }); |
| AssertAnalyzesTo(a, "z x c $", new string[] { "z", "x", "xc", "c", "$" }, new int[] { 1, 1, 0, 1, 1 }); |
| } |
| |
| private class AnalyzerAnonymousInnerClassHelper11 : Analyzer |
| { |
| private readonly TestSynonymMapFilter outerInstance; |
| |
| private SynonymMap map; |
| |
| public AnalyzerAnonymousInnerClassHelper11(TestSynonymMapFilter outerInstance, SynonymMap map) |
| { |
| this.outerInstance = outerInstance; |
| this.map = map; |
| } |
| |
| protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader) |
| { |
| Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); |
| return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true)); |
| } |
| } |
| |
| [Test] |
| public virtual void TestRecursion3() |
| { |
| b = new SynonymMap.Builder(true); |
| const bool keepOrig = true; |
| Add("zoo zoo", "zoo", keepOrig); |
| SynonymMap map = b.Build(); |
| Analyzer a = new AnalyzerAnonymousInnerClassHelper12(this, map); |
| |
| AssertAnalyzesTo(a, "zoo zoo $ zoo", new string[] { "zoo", "zoo", "zoo", "$", "zoo" }, new int[] { 1, 0, 1, 1, 1 }); |
| } |
| |
| private class AnalyzerAnonymousInnerClassHelper12 : Analyzer |
| { |
| private readonly TestSynonymMapFilter outerInstance; |
| |
| private SynonymMap map; |
| |
| public AnalyzerAnonymousInnerClassHelper12(TestSynonymMapFilter outerInstance, SynonymMap map) |
| { |
| this.outerInstance = outerInstance; |
| this.map = map; |
| } |
| |
| protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader) |
| { |
| Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); |
| return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true)); |
| } |
| } |
| |
| [Test] |
| public virtual void TestRecursion4() |
| { |
| b = new SynonymMap.Builder(true); |
| const bool keepOrig = true; |
| Add("zoo zoo", "zoo", keepOrig); |
| Add("zoo", "zoo zoo", keepOrig); |
| SynonymMap map = b.Build(); |
| Analyzer a = new AnalyzerAnonymousInnerClassHelper13(this, map); |
| |
| AssertAnalyzesTo(a, "zoo zoo $ zoo", new string[] { "zoo", "zoo", "zoo", "$", "zoo", "zoo", "zoo" }, new int[] { 1, 0, 1, 1, 1, 0, 1 }); |
| } |
| |
| private class AnalyzerAnonymousInnerClassHelper13 : Analyzer |
| { |
| private readonly TestSynonymMapFilter outerInstance; |
| |
| private SynonymMap map; |
| |
| public AnalyzerAnonymousInnerClassHelper13(TestSynonymMapFilter outerInstance, SynonymMap map) |
| { |
| this.outerInstance = outerInstance; |
| this.map = map; |
| } |
| |
| protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader) |
| { |
| Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); |
| return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true)); |
| } |
| } |
| |
| [Test] |
| public virtual void TestMultiwordOffsets() |
| { |
| b = new SynonymMap.Builder(true); |
| const bool keepOrig = true; |
| Add("national hockey league", "nhl", keepOrig); |
| SynonymMap map = b.Build(); |
| Analyzer a = new AnalyzerAnonymousInnerClassHelper14(this, map); |
| |
| AssertAnalyzesTo(a, "national hockey league", new string[] { "national", "nhl", "hockey", "league" }, new int[] { 0, 0, 9, 16 }, new int[] { 8, 22, 15, 22 }, new int[] { 1, 0, 1, 1 }); |
| } |
| |
| private class AnalyzerAnonymousInnerClassHelper14 : Analyzer |
| { |
| private readonly TestSynonymMapFilter outerInstance; |
| |
| private SynonymMap map; |
| |
| public AnalyzerAnonymousInnerClassHelper14(TestSynonymMapFilter outerInstance, SynonymMap map) |
| { |
| this.outerInstance = outerInstance; |
| this.map = map; |
| } |
| |
| protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader) |
| { |
| Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); |
| return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true)); |
| } |
| } |
| |
| [Test] |
| public virtual void TestEmpty() |
| { |
| Tokenizer tokenizer = new MockTokenizer(new StringReader("aa bb")); |
| try |
| { |
| new SynonymFilter(tokenizer, (new SynonymMap.Builder(true)).Build(), true); |
| fail("did not hit expected exception"); |
| } |
| catch (System.ArgumentException iae) |
| { |
| // expected |
| assertEquals("fst must be non-null", iae.Message); |
| } |
| } |
| } |
| } |