| using Lucene.Net.Attributes; |
| using Lucene.Net.Util; |
| using NUnit.Framework; |
| using System; |
| using System.Collections.Generic; |
| using System.Diagnostics; |
| using System.IO; |
| using System.Text; |
| using JCG = J2N.Collections.Generic; |
| using Console = Lucene.Net.Support.SystemConsole; |
| |
| namespace Lucene.Net.Analysis.CharFilters |
| { |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| public class TestMappingCharFilter : BaseTokenStreamTestCase |
| { |
| |
| internal NormalizeCharMap normMap; |
| |
| public override void SetUp() |
| { |
| base.SetUp(); |
| NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder(); |
| |
| builder.Add("aa", "a"); |
| builder.Add("bbb", "b"); |
| builder.Add("cccc", "cc"); |
| |
| builder.Add("h", "i"); |
| builder.Add("j", "jj"); |
| builder.Add("k", "kkk"); |
| builder.Add("ll", "llll"); |
| |
| builder.Add("empty", ""); |
| |
| // BMP (surrogate pair): |
| builder.Add(UnicodeUtil.NewString(new int[] { 0x1D122 }, 0, 1), "fclef"); |
| |
| builder.Add("\uff01", "full-width-exclamation"); |
| |
| normMap = builder.Build(); |
| } |
| |
| [Test] |
| public virtual void TestReaderReset() |
| { |
| CharFilter cs = new MappingCharFilter(normMap, new StringReader("x")); |
| char[] buf = new char[10]; |
| int len = cs.Read(buf, 0, 10); |
| assertEquals(1, len); |
| assertEquals('x', buf[0]); |
| len = cs.Read(buf, 0, 10); |
| assertEquals(-1, len); |
| |
| // rewind |
| cs.Reset(); |
| len = cs.Read(buf, 0, 10); |
| assertEquals(1, len); |
| assertEquals('x', buf[0]); |
| } |
| |
| [Test] |
| public virtual void TestNothingChange() |
| { |
| CharFilter cs = new MappingCharFilter(normMap, new StringReader("x")); |
| TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false); |
| AssertTokenStreamContents(ts, new string[] { "x" }, new int[] { 0 }, new int[] { 1 }, 1); |
| } |
| |
| [Test] |
| public virtual void Test1to1() |
| { |
| CharFilter cs = new MappingCharFilter(normMap, new StringReader("h")); |
| TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false); |
| AssertTokenStreamContents(ts, new string[] { "i" }, new int[] { 0 }, new int[] { 1 }, 1); |
| } |
| |
| [Test] |
| public virtual void Test1to2() |
| { |
| CharFilter cs = new MappingCharFilter(normMap, new StringReader("j")); |
| TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false); |
| AssertTokenStreamContents(ts, new string[] { "jj" }, new int[] { 0 }, new int[] { 1 }, 1); |
| } |
| |
| [Test] |
| public virtual void Test1to3() |
| { |
| CharFilter cs = new MappingCharFilter(normMap, new StringReader("k")); |
| TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false); |
| AssertTokenStreamContents(ts, new string[] { "kkk" }, new int[] { 0 }, new int[] { 1 }, 1); |
| } |
| |
| [Test] |
| public virtual void Test2to4() |
| { |
| CharFilter cs = new MappingCharFilter(normMap, new StringReader("ll")); |
| TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false); |
| AssertTokenStreamContents(ts, new string[] { "llll" }, new int[] { 0 }, new int[] { 2 }, 2); |
| } |
| |
| [Test] |
| public virtual void Test2to1() |
| { |
| CharFilter cs = new MappingCharFilter(normMap, new StringReader("aa")); |
| TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false); |
| AssertTokenStreamContents(ts, new string[] { "a" }, new int[] { 0 }, new int[] { 2 }, 2); |
| } |
| |
| [Test] |
| public virtual void Test3to1() |
| { |
| CharFilter cs = new MappingCharFilter(normMap, new StringReader("bbb")); |
| TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false); |
| AssertTokenStreamContents(ts, new string[] { "b" }, new int[] { 0 }, new int[] { 3 }, 3); |
| } |
| |
| [Test] |
| public virtual void Test4to2() |
| { |
| CharFilter cs = new MappingCharFilter(normMap, new StringReader("cccc")); |
| TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false); |
| AssertTokenStreamContents(ts, new string[] { "cc" }, new int[] { 0 }, new int[] { 4 }, 4); |
| } |
| |
| [Test] |
| public virtual void Test5to0() |
| { |
| CharFilter cs = new MappingCharFilter(normMap, new StringReader("empty")); |
| TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false); |
| AssertTokenStreamContents(ts, new string[0], new int[] { }, new int[] { }, 5); |
| } |
| |
| [Test] |
| public virtual void TestNonBMPChar() |
| { |
| CharFilter cs = new MappingCharFilter(normMap, new StringReader(UnicodeUtil.NewString(new int[] { 0x1D122 }, 0, 1))); |
| TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false); |
| AssertTokenStreamContents(ts, new string[] { "fclef" }, new int[] { 0 }, new int[] { 2 }, 2); |
| } |
| |
| [Test] |
| public virtual void TestFullWidthChar() |
| { |
| CharFilter cs = new MappingCharFilter(normMap, new StringReader("\uff01")); |
| TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false); |
| AssertTokenStreamContents(ts, new string[] { "full-width-exclamation" }, new int[] { 0 }, new int[] { 1 }, 1); |
| } |
| |
| // |
| // 1111111111222 |
| // 01234567890123456789012 |
| //(in) h i j k ll cccc bbb aa |
| // |
| // 1111111111222 |
| // 01234567890123456789012 |
| //(out) i i jj kkk llll cc b a |
| // |
| // h, 0, 1 => i, 0, 1 |
| // i, 2, 3 => i, 2, 3 |
| // j, 4, 5 => jj, 4, 5 |
| // k, 6, 7 => kkk, 6, 7 |
| // ll, 8,10 => llll, 8,10 |
| // cccc,11,15 => cc,11,15 |
| // bbb,16,19 => b,16,19 |
| // aa,20,22 => a,20,22 |
| // |
| [Test] |
| public virtual void TestTokenStream() |
| { |
| string testString = "h i j k ll cccc bbb aa"; |
| CharFilter cs = new MappingCharFilter(normMap, new StringReader(testString)); |
| TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false); |
| AssertTokenStreamContents(ts, new string[] { "i", "i", "jj", "kkk", "llll", "cc", "b", "a" }, new int[] { 0, 2, 4, 6, 8, 11, 16, 20 }, new int[] { 1, 3, 5, 7, 10, 15, 19, 22 }, testString.Length); |
| } |
| |
| // |
| // |
| // 0123456789 |
| //(in) aaaa ll h |
| //(out-1) aa llll i |
| //(out-2) a llllllll i |
| // |
| // aaaa,0,4 => a,0,4 |
| // ll,5,7 => llllllll,5,7 |
| // h,8,9 => i,8,9 |
| [Test] |
| public virtual void TestChained() |
| { |
| string testString = "aaaa ll h"; |
| CharFilter cs = new MappingCharFilter(normMap, new MappingCharFilter(normMap, new StringReader(testString))); |
| TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false); |
| AssertTokenStreamContents(ts, new string[] { "a", "llllllll", "i" }, new int[] { 0, 5, 8 }, new int[] { 4, 7, 9 }, testString.Length); |
| } |
| |
| [Test] |
| public virtual void TestRandom() |
| { |
| Analyzer analyzer = new AnalyzerAnonymousInnerClassHelper(this); |
| |
| int numRounds = RANDOM_MULTIPLIER * 10000; |
| CheckRandomData(Random, analyzer, numRounds); |
| } |
| |
| private class AnalyzerAnonymousInnerClassHelper : Analyzer |
| { |
| private readonly TestMappingCharFilter outerInstance; |
| |
| public AnalyzerAnonymousInnerClassHelper(TestMappingCharFilter outerInstance) |
| { |
| this.outerInstance = outerInstance; |
| } |
| |
| |
| protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader) |
| { |
| Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); |
| return new TokenStreamComponents(tokenizer, tokenizer); |
| } |
| |
| protected override TextReader InitReader(string fieldName, TextReader reader) |
| { |
| return new MappingCharFilter(outerInstance.normMap, reader); |
| } |
| } |
| |
| [Ignore("wrong finalOffset: https://issues.apache.org/jira/browse/LUCENE-3971")] |
| [Test] |
| public virtual void TestFinalOffsetSpecialCase() |
| { |
| NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder(); |
| builder.Add("t", ""); |
| // even though this below rule has no effect, the test passes if you remove it!! |
| builder.Add("tmakdbl", "c"); |
| |
| NormalizeCharMap map = builder.Build(); |
| |
| Analyzer analyzer = new AnalyzerAnonymousInnerClassHelper2(this, map); |
| |
| string text = "gzw f quaxot"; |
| CheckAnalysisConsistency(Random, analyzer, false, text); |
| } |
| |
| private class AnalyzerAnonymousInnerClassHelper2 : Analyzer |
| { |
| private readonly TestMappingCharFilter outerInstance; |
| |
| private NormalizeCharMap map; |
| |
| public AnalyzerAnonymousInnerClassHelper2(TestMappingCharFilter outerInstance, NormalizeCharMap map) |
| { |
| this.outerInstance = outerInstance; |
| this.map = map; |
| } |
| |
| protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader) |
| { |
| Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); |
| return new TokenStreamComponents(tokenizer, tokenizer); |
| } |
| |
| protected override TextReader InitReader(string fieldName, TextReader reader) |
| { |
| return new MappingCharFilter(map, reader); |
| } |
| } |
| |
| [Ignore("wrong finalOffset: https://issues.apache.org/jira/browse/LUCENE-3971")] |
| [Test] |
| public virtual void TestRandomMaps() |
| { |
| int numIterations = AtLeast(3); |
| for (int i = 0; i < numIterations; i++) |
| { |
| NormalizeCharMap map = RandomMap(); |
| Analyzer analyzer = new AnalyzerAnonymousInnerClassHelper3(this, map); |
| int numRounds = 100; |
| CheckRandomData(Random, analyzer, numRounds); |
| } |
| } |
| |
| private class AnalyzerAnonymousInnerClassHelper3 : Analyzer |
| { |
| private readonly TestMappingCharFilter outerInstance; |
| |
| private NormalizeCharMap map; |
| |
| public AnalyzerAnonymousInnerClassHelper3(TestMappingCharFilter outerInstance, NormalizeCharMap map) |
| { |
| this.outerInstance = outerInstance; |
| this.map = map; |
| } |
| |
| protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader) |
| { |
| Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); |
| return new TokenStreamComponents(tokenizer, tokenizer); |
| } |
| |
| protected override TextReader InitReader(string fieldName, TextReader reader) |
| { |
| return new MappingCharFilter(map, reader); |
| } |
| } |
| |
| private NormalizeCharMap RandomMap() |
| { |
| Random random = Random; |
| NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder(); |
| // we can't add duplicate keys, or NormalizeCharMap gets angry |
| ISet<string> keys = new JCG.HashSet<string>(); |
| int num = random.Next(5); |
| //System.out.println("NormalizeCharMap="); |
| for (int i = 0; i < num; i++) |
| { |
| string key = TestUtil.RandomSimpleString(random); |
| if (!keys.Contains(key) && key.Length != 0) |
| { |
| string value = TestUtil.RandomSimpleString(random); |
| builder.Add(key, value); |
| keys.Add(key); |
| //System.out.println("mapping: '" + key + "' => '" + value + "'"); |
| } |
| } |
| return builder.Build(); |
| } |
| |
| [Test] |
| public virtual void TestRandomMaps2() |
| { |
| Random random = Random; |
| int numIterations = AtLeast(3); |
| for (int iter = 0; iter < numIterations; iter++) |
| { |
| |
| if (VERBOSE) |
| { |
| Console.WriteLine("\nTEST iter=" + iter); |
| } |
| |
| char endLetter = (char)TestUtil.NextInt32(random, 'b', 'z'); |
| IDictionary<string, string> map = new Dictionary<string, string>(); |
| NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder(); |
| int numMappings = AtLeast(5); |
| if (VERBOSE) |
| { |
| Console.WriteLine(" mappings:"); |
| } |
| while (map.Count < numMappings) |
| { |
| string key = TestUtil.RandomSimpleStringRange(random, 'a', endLetter, 7); |
| if (key.Length != 0 && !map.ContainsKey(key)) |
| { |
| string value = TestUtil.RandomSimpleString(random); |
| map[key] = value; |
| builder.Add(key, value); |
| if (VERBOSE) |
| { |
| Console.WriteLine(" " + key + " -> " + value); |
| } |
| } |
| } |
| |
| NormalizeCharMap charMap = builder.Build(); |
| |
| if (VERBOSE) |
| { |
| Console.WriteLine(" test random documents..."); |
| } |
| |
| for (int iter2 = 0; iter2 < 100; iter2++) |
| { |
| string content = TestUtil.RandomSimpleStringRange(random, 'a', endLetter, AtLeast(1000)); |
| |
| if (VERBOSE) |
| { |
| Console.WriteLine(" content=" + content); |
| } |
| |
| // Do stupid dog-slow mapping: |
| |
| // Output string: |
| StringBuilder output = new StringBuilder(); |
| |
| // Maps output offset to input offset: |
| IList<int?> inputOffsets = new List<int?>(); |
| |
| int cumDiff = 0; |
| int charIdx = 0; |
| while (charIdx < content.Length) |
| { |
| |
| int matchLen = -1; |
| string matchRepl = null; |
| |
| foreach (KeyValuePair<string, string> ent in map) |
| { |
| string match = ent.Key; |
| if (charIdx + match.Length <= content.Length) |
| { |
| int limit = charIdx + match.Length; |
| bool matches = true; |
| for (int charIdx2 = charIdx; charIdx2 < limit; charIdx2++) |
| { |
| if (match[charIdx2 - charIdx] != content[charIdx2]) |
| { |
| matches = false; |
| break; |
| } |
| } |
| |
| if (matches) |
| { |
| string repl = ent.Value; |
| if (match.Length > matchLen) |
| { |
| // Greedy: longer match wins |
| matchLen = match.Length; |
| matchRepl = repl; |
| } |
| } |
| } |
| } |
| |
| if (matchLen != -1) |
| { |
| // We found a match here! |
| if (VERBOSE) |
| { |
| Console.WriteLine(" match=" + content.Substring(charIdx, matchLen) + " @ off=" + charIdx + " repl=" + matchRepl); |
| } |
| output.Append(matchRepl); |
| int minLen = Math.Min(matchLen, matchRepl.Length); |
| |
| // Common part, directly maps back to input |
| // offset: |
| for (int outIdx = 0; outIdx < minLen; outIdx++) |
| { |
| inputOffsets.Add(output.Length - matchRepl.Length + outIdx + cumDiff); |
| } |
| |
| cumDiff += matchLen - matchRepl.Length; |
| charIdx += matchLen; |
| |
| if (matchRepl.Length < matchLen) |
| { |
| // Replacement string is shorter than matched |
| // input: nothing to do |
| } |
| else if (matchRepl.Length > matchLen) |
| { |
| // Replacement string is longer than matched |
| // input: for all the "extra" chars we map |
| // back to a single input offset: |
| for (int outIdx = matchLen; outIdx < matchRepl.Length; outIdx++) |
| { |
| inputOffsets.Add(output.Length + cumDiff - 1); |
| } |
| } |
| else |
| { |
| // Same length: no change to offset |
| } |
| |
| Debug.Assert(inputOffsets.Count == output.Length, "inputOffsets.size()=" + inputOffsets.Count + " vs output.length()=" + output.Length); |
| } |
| else |
| { |
| inputOffsets.Add(output.Length + cumDiff); |
| output.Append(content[charIdx]); |
| charIdx++; |
| } |
| } |
| |
| string expected = output.ToString(); |
| if (VERBOSE) |
| { |
| Console.Write(" expected:"); |
| for (int charIdx2 = 0; charIdx2 < expected.Length; charIdx2++) |
| { |
| Console.Write(" " + expected[charIdx2] + "/" + inputOffsets[charIdx2]); |
| } |
| Console.WriteLine(); |
| } |
| |
| MappingCharFilter mapFilter = new MappingCharFilter(charMap, new StringReader(content)); |
| StringBuilder actualBuilder = new StringBuilder(); |
| IList<int?> actualInputOffsets = new List<int?>(); |
| |
| // Now consume the actual mapFilter, somewhat randomly: |
| while (true) |
| { |
| if (random.Next(0, 1) == 1) |
| { |
| int ch = mapFilter.Read(); |
| if (ch == -1) |
| { |
| break; |
| } |
| actualBuilder.Append((char)ch); |
| } |
| else |
| { |
| char[] buffer = new char[TestUtil.NextInt32(random, 1, 100)]; |
| int off = buffer.Length == 1 ? 0 : random.Next(buffer.Length - 1); |
| int count = mapFilter.Read(buffer, off, buffer.Length - off); |
| if (count == -1) |
| { |
| break; |
| } |
| else |
| { |
| actualBuilder.Append(buffer, off, count); |
| } |
| } |
| |
| if (random.Next(10) == 7) |
| { |
| // Map offsets |
| while (actualInputOffsets.Count < actualBuilder.Length) |
| { |
| actualInputOffsets.Add(mapFilter.CorrectOffset(actualInputOffsets.Count)); |
| } |
| } |
| } |
| |
| // Finish mappping offsets |
| while (actualInputOffsets.Count < actualBuilder.Length) |
| { |
| actualInputOffsets.Add(mapFilter.CorrectOffset(actualInputOffsets.Count)); |
| } |
| |
| string actual = actualBuilder.ToString(); |
| |
| // Verify: |
| assertEquals(expected, actual); |
| assertEquals(inputOffsets, actualInputOffsets); |
| } |
| } |
| } |
| } |
| } |