| using J2N; |
| using Lucene.Net.Analysis.Core; |
| using Lucene.Net.Analysis.Miscellaneous; |
| using Lucene.Net.Analysis.Shingle; |
| using Lucene.Net.Analysis.TokenAttributes; |
| using Lucene.Net.Attributes; |
| using Lucene.Net.Util; |
| using NUnit.Framework; |
| using System; |
| using System.IO; |
| |
| namespace Lucene.Net.Analysis.NGram |
| { |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
    /// <summary>
    /// Tests <see cref="EdgeNGramTokenFilter"/> for correctness.
    /// </summary>
| public class EdgeNGramTokenFilterTest : BaseTokenStreamTestCase |
| { |
        // Shared input stream consumed by the simple "abcde" tests; individual
        // tests may replace it (see TestSmallTokenInStream).
        private TokenStream input;

        public override void SetUp()
        {
            base.SetUp();
            // Whitespace-tokenize the literal "abcde" with lowercasing disabled.
            input = new MockTokenizer(new StringReader("abcde"), MockTokenizer.WHITESPACE, false);
        }
| |
| [Test] |
| public virtual void TestInvalidInput() |
| { |
| bool gotException = false; |
| try |
| { |
| #pragma warning disable 612, 618 |
| new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, input, EdgeNGramTokenFilter.Side.FRONT, 0, 0); |
| #pragma warning restore 612, 618 |
| } |
| catch (System.ArgumentException) |
| { |
| gotException = true; |
| } |
| assertTrue(gotException); |
| } |
| |
| [Test] |
| public virtual void TestInvalidInput2() |
| { |
| bool gotException = false; |
| try |
| { |
| #pragma warning disable 612, 618 |
| new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, input, EdgeNGramTokenFilter.Side.FRONT, 2, 1); |
| #pragma warning restore 612, 618 |
| } |
| catch (System.ArgumentException) |
| { |
| gotException = true; |
| } |
| assertTrue(gotException); |
| } |
| |
| [Test] |
| public virtual void TestInvalidInput3() |
| { |
| bool gotException = false; |
| try |
| { |
| #pragma warning disable 612, 618 |
| new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, input, EdgeNGramTokenFilter.Side.FRONT, -1, 2); |
| #pragma warning restore 612, 618 |
| } |
| catch (System.ArgumentException) |
| { |
| gotException = true; |
| } |
| assertTrue(gotException); |
| } |
| |
| [Test] |
| public virtual void TestFrontUnigram() |
| { |
| #pragma warning disable 612, 618 |
| EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, input, EdgeNGramTokenFilter.Side.FRONT, 1, 1); |
| #pragma warning restore 612, 618 |
| AssertTokenStreamContents(tokenizer, new string[] { "a" }, new int[] { 0 }, new int[] { 5 }); |
| } |
| |
| [Test] |
| public virtual void TestBackUnigram() |
| { |
| #pragma warning disable 612, 618 |
| EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(LuceneVersion.LUCENE_43, input, EdgeNGramTokenFilter.Side.BACK, 1, 1); |
| #pragma warning restore 612, 618 |
| AssertTokenStreamContents(tokenizer, new string[] { "e" }, new int[] { 4 }, new int[] { 5 }); |
| } |
| |
| [Test] |
| public virtual void TestOversizedNgrams() |
| { |
| #pragma warning disable 612, 618 |
| EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, input, EdgeNGramTokenFilter.Side.FRONT, 6, 6); |
| #pragma warning restore 612, 618 |
| AssertTokenStreamContents(tokenizer, new string[0], new int[0], new int[0]); |
| } |
| |
| [Test] |
| public virtual void TestFrontRangeOfNgrams() |
| { |
| #pragma warning disable 612, 618 |
| EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, input, EdgeNGramTokenFilter.Side.FRONT, 1, 3); |
| #pragma warning restore 612, 618 |
| AssertTokenStreamContents(tokenizer, new string[] { "a", "ab", "abc" }, new int[] { 0, 0, 0 }, new int[] { 5, 5, 5 }); |
| } |
| |
| [Test] |
| public virtual void TestBackRangeOfNgrams() |
| { |
| #pragma warning disable 612, 618 |
| EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(LuceneVersion.LUCENE_43, input, EdgeNGramTokenFilter.Side.BACK, 1, 3); |
| #pragma warning restore 612, 618 |
| AssertTokenStreamContents(tokenizer, new string[] { "e", "de", "cde" }, new int[] { 4, 3, 2 }, new int[] { 5, 5, 5 }, null, null, null, null, false); |
| } |
| |
| [Test] |
| public virtual void TestFilterPositions() |
| { |
| TokenStream ts = new MockTokenizer(new StringReader("abcde vwxyz"), MockTokenizer.WHITESPACE, false); |
| #pragma warning disable 612, 618 |
| EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, ts, EdgeNGramTokenFilter.Side.FRONT, 1, 3); |
| #pragma warning restore 612, 618 |
| AssertTokenStreamContents(tokenizer, new string[] { "a", "ab", "abc", "v", "vw", "vwx" }, new int[] { 0, 0, 0, 6, 6, 6 }, new int[] { 5, 5, 5, 11, 11, 11 }, null, new int[] { 1, 0, 0, 1, 0, 0 }, null, null, false); |
| } |
| |
| private class PositionFilter : TokenFilter |
| { |
| |
| internal readonly IPositionIncrementAttribute posIncrAtt; |
| internal bool started; |
| |
| internal PositionFilter(TokenStream input) : base(input) |
| { |
| posIncrAtt = AddAttribute<IPositionIncrementAttribute>(); |
| } |
| |
| public override sealed bool IncrementToken() |
| { |
| if (m_input.IncrementToken()) |
| { |
| if (started) |
| { |
| posIncrAtt.PositionIncrement = 0; |
| } |
| else |
| { |
| started = true; |
| } |
| return true; |
| } |
| else |
| { |
| return false; |
| } |
| } |
| |
| public override void Reset() |
| { |
| base.Reset(); |
| started = false; |
| } |
| } |
| |
| [Test] |
| public virtual void TestFirstTokenPositionIncrement() |
| { |
| TokenStream ts = new MockTokenizer(new StringReader("a abc"), MockTokenizer.WHITESPACE, false); |
| ts = new PositionFilter(ts); // All but first token will get 0 position increment |
| #pragma warning disable 612, 618 |
| EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, ts, EdgeNGramTokenFilter.Side.FRONT, 2, 3); |
| #pragma warning restore 612, 618 |
| // The first token "a" will not be output, since it's smaller than the mingram size of 2. |
| // The second token on input to EdgeNGramTokenFilter will have position increment of 0, |
| // which should be increased to 1, since this is the first output token in the stream. |
| AssertTokenStreamContents(filter, new string[] { "ab", "abc" }, new int[] { 2, 2 }, new int[] { 5, 5 }, new int[] { 1, 0 }); |
| } |
| |
| [Test] |
| public virtual void TestSmallTokenInStream() |
| { |
| input = new MockTokenizer(new StringReader("abc de fgh"), MockTokenizer.WHITESPACE, false); |
| #pragma warning disable 612, 618 |
| EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, input, EdgeNGramTokenFilter.Side.FRONT, 3, 3); |
| #pragma warning restore 612, 618 |
| AssertTokenStreamContents(tokenizer, new string[] { "abc", "fgh" }, new int[] { 0, 7 }, new int[] { 3, 10 }); |
| } |
| |
| [Test] |
| public virtual void TestReset() |
| { |
| WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("abcde")); |
| #pragma warning disable 612, 618 |
| EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, EdgeNGramTokenFilter.Side.FRONT, 1, 3); |
| #pragma warning restore 612, 618 |
| AssertTokenStreamContents(filter, new string[] { "a", "ab", "abc" }, new int[] { 0, 0, 0 }, new int[] { 5, 5, 5 }); |
| tokenizer.SetReader(new StringReader("abcde")); |
| AssertTokenStreamContents(filter, new string[] { "a", "ab", "abc" }, new int[] { 0, 0, 0 }, new int[] { 5, 5, 5 }); |
| } |
| |
| // LUCENE-3642 |
| // EdgeNgram blindly adds term length to offset, but this can take things out of bounds |
| // wrt original text if a previous filter increases the length of the word (in this case æ -> ae) |
| // so in this case we behave like WDF, and preserve any modified offsets |
| [Test] |
| public virtual void TestInvalidOffsets() |
| { |
| Analyzer analyzer = new AnalyzerAnonymousInnerClassHelper(this); |
| AssertAnalyzesTo(analyzer, "mosfellsbær", new string[] { "mo", "mos", "mosf", "mosfe", "mosfel", "mosfell", "mosfells", "mosfellsb", "mosfellsba", "mosfellsbae", "mosfellsbaer" }, new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, new int[] { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11 }); |
| } |
| |
| private class AnalyzerAnonymousInnerClassHelper : Analyzer |
| { |
| private readonly EdgeNGramTokenFilterTest outerInstance; |
| |
| public AnalyzerAnonymousInnerClassHelper(EdgeNGramTokenFilterTest outerInstance) |
| { |
| this.outerInstance = outerInstance; |
| } |
| |
| protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader) |
| { |
| Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); |
| TokenFilter filters = new ASCIIFoldingFilter(tokenizer); |
| #pragma warning disable 612, 618 |
| filters = new EdgeNGramTokenFilter(LuceneVersion.LUCENE_43, filters, EdgeNGramTokenFilter.Side.FRONT, 2, 15); |
| #pragma warning restore 612, 618 |
| return new TokenStreamComponents(tokenizer, filters); |
| } |
| } |
| |
| /// <summary> |
| /// blast some random strings through the analyzer </summary> |
| [Test, LongRunningTest] |
| public virtual void TestRandomStrings() |
| { |
| for (int i = 0; i < 10; i++) |
| { |
| int min = TestUtil.NextInt32(Random, 2, 10); |
| int max = TestUtil.NextInt32(Random, min, 20); |
| |
| Analyzer a = new AnalyzerAnonymousInnerClassHelper2(this, min, max); |
| CheckRandomData(Random, a, 100 * RANDOM_MULTIPLIER); |
| } |
| |
| Analyzer b = new AnalyzerAnonymousInnerClassHelper3(this); |
| CheckRandomData(Random, b, 1000 * RANDOM_MULTIPLIER, 20, false, false); |
| } |
| |
| private class AnalyzerAnonymousInnerClassHelper2 : Analyzer |
| { |
| private readonly EdgeNGramTokenFilterTest outerInstance; |
| |
| private int min; |
| private int max; |
| |
| public AnalyzerAnonymousInnerClassHelper2(EdgeNGramTokenFilterTest outerInstance, int min, int max) |
| { |
| this.outerInstance = outerInstance; |
| this.min = min; |
| this.max = max; |
| } |
| |
| protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader) |
| { |
| Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); |
| return new TokenStreamComponents(tokenizer, new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, min, max)); |
| } |
| } |
| |
| private class AnalyzerAnonymousInnerClassHelper3 : Analyzer |
| { |
| private readonly EdgeNGramTokenFilterTest outerInstance; |
| |
| public AnalyzerAnonymousInnerClassHelper3(EdgeNGramTokenFilterTest outerInstance) |
| { |
| this.outerInstance = outerInstance; |
| } |
| |
| protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader) |
| { |
| Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); |
| #pragma warning disable 612, 618 |
| return new TokenStreamComponents(tokenizer, new EdgeNGramTokenFilter(LuceneVersion.LUCENE_43, tokenizer, EdgeNGramTokenFilter.Side.BACK, 2, 4)); |
| #pragma warning restore 612, 618 |
| } |
| } |
| |
| [Test] |
| public virtual void TestEmptyTerm() |
| { |
| Random random = Random; |
| Analyzer a = new AnalyzerAnonymousInnerClassHelper4(this); |
| CheckAnalysisConsistency(random, a, random.nextBoolean(), ""); |
| |
| Analyzer b = new AnalyzerAnonymousInnerClassHelper5(this); |
| CheckAnalysisConsistency(random, b, random.nextBoolean(), ""); |
| } |
| |
| private class AnalyzerAnonymousInnerClassHelper4 : Analyzer |
| { |
| private readonly EdgeNGramTokenFilterTest outerInstance; |
| |
| public AnalyzerAnonymousInnerClassHelper4(EdgeNGramTokenFilterTest outerInstance) |
| { |
| this.outerInstance = outerInstance; |
| } |
| |
| protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader) |
| { |
| Tokenizer tokenizer = new KeywordTokenizer(reader); |
| #pragma warning disable 612, 618 |
| return new TokenStreamComponents(tokenizer, new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, EdgeNGramTokenFilter.Side.FRONT, 2, 15)); |
| #pragma warning restore 612, 618 |
| } |
| } |
| |
| private class AnalyzerAnonymousInnerClassHelper5 : Analyzer |
| { |
| private readonly EdgeNGramTokenFilterTest outerInstance; |
| |
| public AnalyzerAnonymousInnerClassHelper5(EdgeNGramTokenFilterTest outerInstance) |
| { |
| this.outerInstance = outerInstance; |
| } |
| |
| protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader) |
| { |
| Tokenizer tokenizer = new KeywordTokenizer(reader); |
| #pragma warning disable 612, 618 |
| return new TokenStreamComponents(tokenizer, new EdgeNGramTokenFilter(LuceneVersion.LUCENE_43, tokenizer, EdgeNGramTokenFilter.Side.BACK, 2, 15)); |
| #pragma warning restore 612, 618 |
| } |
| } |
| |
| [Test] |
| public virtual void TestGraphs() |
| { |
| TokenStream tk = new LetterTokenizer(TEST_VERSION_CURRENT, new StringReader("abc d efgh ij klmno p q")); |
| tk = new ShingleFilter(tk); |
| tk = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tk, 7, 10); |
| AssertTokenStreamContents(tk, new string[] { "efgh ij", "ij klmn", "ij klmno", "klmno p" }, new int[] { 6, 11, 11, 14 }, new int[] { 13, 19, 19, 21 }, new int[] { 3, 1, 0, 1 }, new int[] { 2, 2, 2, 2 }, 23); |
| } |
| |
| [Test] |
| public virtual void TestSupplementaryCharacters() |
| { |
| string s = TestUtil.RandomUnicodeString(Random, 10); |
| int codePointCount = s.CodePointCount(0, s.Length); |
| int minGram = TestUtil.NextInt32(Random, 1, 3); |
| int maxGram = TestUtil.NextInt32(Random, minGram, 10); |
| TokenStream tk = new KeywordTokenizer(new StringReader(s)); |
| tk = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tk, minGram, maxGram); |
| ICharTermAttribute termAtt = tk.AddAttribute<ICharTermAttribute>(); |
| IOffsetAttribute offsetAtt = tk.AddAttribute<IOffsetAttribute>(); |
| tk.Reset(); |
| for (int i = minGram; i <= Math.Min(codePointCount, maxGram); ++i) |
| { |
| assertTrue(tk.IncrementToken()); |
| assertEquals(0, offsetAtt.StartOffset); |
| assertEquals(s.Length, offsetAtt.EndOffset); |
| int end = Character.OffsetByCodePoints(s, 0, i); |
| assertEquals(s.Substring(0, end), termAtt.ToString()); |
| } |
| assertFalse(tk.IncrementToken()); |
| } |
| } |
| } |