| using System; |
| using Lucene.Net.Attributes; |
| using Lucene.Net.Documents; |
| using Lucene.Net.Index; |
| |
| namespace Lucene.Net.Analysis |
| { |
| using Lucene.Net.Randomized.Generators; |
| using Lucene.Net.Support; |
| using NUnit.Framework; |
| using System.IO; |
| using AtomicReader = Lucene.Net.Index.AtomicReader; |
| using Automaton = Lucene.Net.Util.Automaton.Automaton; |
| using AutomatonTestUtil = Lucene.Net.Util.Automaton.AutomatonTestUtil; |
| using BasicAutomata = Lucene.Net.Util.Automaton.BasicAutomata; |
| using BasicOperations = Lucene.Net.Util.Automaton.BasicOperations; |
| using BytesRef = Lucene.Net.Util.BytesRef; |
| using CharacterRunAutomaton = Lucene.Net.Util.Automaton.CharacterRunAutomaton; |
| using DocsAndPositionsEnum = Lucene.Net.Index.DocsAndPositionsEnum; |
| |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| using Document = Documents.Document; |
| using Field = Field; |
| using Fields = Lucene.Net.Index.Fields; |
| using FieldType = FieldType; |
| using IOUtils = Lucene.Net.Util.IOUtils; |
| using RandomIndexWriter = Lucene.Net.Index.RandomIndexWriter; |
| using RegExp = Lucene.Net.Util.Automaton.RegExp; |
| using Terms = Lucene.Net.Index.Terms; |
| using TermsEnum = Lucene.Net.Index.TermsEnum; |
| using TestUtil = Lucene.Net.Util.TestUtil; |
| |
| [TestFixture] |
| public class TestMockAnalyzer : BaseTokenStreamTestCase |
| { |
| /// <summary> |
| /// Test a configuration that behaves a lot like WhitespaceAnalyzer </summary> |
| [Test] |
| public virtual void TestWhitespace() |
| { |
| Analyzer a = new MockAnalyzer(Random); |
| AssertAnalyzesTo(a, "A bc defg hiJklmn opqrstuv wxy z ", new string[] { "a", "bc", "defg", "hijklmn", "opqrstuv", "wxy", "z" }); |
| AssertAnalyzesTo(a, "aba cadaba shazam", new string[] { "aba", "cadaba", "shazam" }); |
| AssertAnalyzesTo(a, "break on whitespace", new string[] { "break", "on", "whitespace" }); |
| } |
| |
| /// <summary> |
| /// Test a configuration that behaves a lot like SimpleAnalyzer </summary> |
| [Test] |
| public virtual void TestSimple() |
| { |
| Analyzer a = new MockAnalyzer(Random, MockTokenizer.SIMPLE, true); |
| AssertAnalyzesTo(a, "a-bc123 defg+hijklmn567opqrstuv78wxy_z ", new string[] { "a", "bc", "defg", "hijklmn", "opqrstuv", "wxy", "z" }); |
| AssertAnalyzesTo(a, "aba4cadaba-Shazam", new string[] { "aba", "cadaba", "shazam" }); |
| AssertAnalyzesTo(a, "break+on/Letters", new string[] { "break", "on", "letters" }); |
| } |
| |
| /// <summary> |
| /// Test a configuration that behaves a lot like KeywordAnalyzer </summary> |
| [Test] |
| public virtual void TestKeyword() |
| { |
| Analyzer a = new MockAnalyzer(Random, MockTokenizer.KEYWORD, false); |
| AssertAnalyzesTo(a, "a-bc123 defg+hijklmn567opqrstuv78wxy_z ", new string[] { "a-bc123 defg+hijklmn567opqrstuv78wxy_z " }); |
| AssertAnalyzesTo(a, "aba4cadaba-Shazam", new string[] { "aba4cadaba-Shazam" }); |
| AssertAnalyzesTo(a, "break+on/Nothing", new string[] { "break+on/Nothing" }); |
| // currently though emits no tokens for empty string: maybe we can do it, |
| // but we don't want to emit tokens infinitely... |
| AssertAnalyzesTo(a, "", new string[0]); |
| } |
| |
| // Test some regular expressions as tokenization patterns |
| /// <summary> |
| /// Test a configuration where each character is a term </summary> |
| [Test] |
| public virtual void TestSingleChar() |
| { |
| var single = new CharacterRunAutomaton((new RegExp(".")).ToAutomaton()); |
| Analyzer a = new MockAnalyzer(Random, single, false); |
| AssertAnalyzesTo(a, "foobar", new[] { "f", "o", "o", "b", "a", "r" }, new[] { 0, 1, 2, 3, 4, 5 }, new[] { 1, 2, 3, 4, 5, 6 }); |
| CheckRandomData(Random, a, 100); |
| } |
| |
| /// <summary> |
| /// Test a configuration where two characters makes a term </summary> |
| [Test] |
| public virtual void TestTwoChars() |
| { |
| CharacterRunAutomaton single = new CharacterRunAutomaton((new RegExp("..")).ToAutomaton()); |
| Analyzer a = new MockAnalyzer(Random, single, false); |
| AssertAnalyzesTo(a, "foobar", new string[] { "fo", "ob", "ar" }, new int[] { 0, 2, 4 }, new int[] { 2, 4, 6 }); |
| // make sure when last term is a "partial" match that End() is correct |
| AssertTokenStreamContents(a.GetTokenStream("bogus", new StringReader("fooba")), new string[] { "fo", "ob" }, new int[] { 0, 2 }, new int[] { 2, 4 }, new int[] { 1, 1 }, new int?(5)); |
| CheckRandomData(Random, a, 100); |
| } |
| |
| /// <summary> |
| /// Test a configuration where three characters makes a term </summary> |
| [Test] |
| public virtual void TestThreeChars() |
| { |
| CharacterRunAutomaton single = new CharacterRunAutomaton((new RegExp("...")).ToAutomaton()); |
| Analyzer a = new MockAnalyzer(Random, single, false); |
| AssertAnalyzesTo(a, "foobar", new string[] { "foo", "bar" }, new int[] { 0, 3 }, new int[] { 3, 6 }); |
| // make sure when last term is a "partial" match that End() is correct |
| AssertTokenStreamContents(a.GetTokenStream("bogus", new StringReader("fooba")), new string[] { "foo" }, new int[] { 0 }, new int[] { 3 }, new int[] { 1 }, new int?(5)); |
| CheckRandomData(Random, a, 100); |
| } |
| |
| /// <summary> |
| /// Test a configuration where word starts with one uppercase </summary> |
| [Test] |
| public virtual void TestUppercase() |
| { |
| CharacterRunAutomaton single = new CharacterRunAutomaton((new RegExp("[A-Z][a-z]*")).ToAutomaton()); |
| Analyzer a = new MockAnalyzer(Random, single, false); |
| AssertAnalyzesTo(a, "FooBarBAZ", new string[] { "Foo", "Bar", "B", "A", "Z" }, new int[] { 0, 3, 6, 7, 8 }, new int[] { 3, 6, 7, 8, 9 }); |
| AssertAnalyzesTo(a, "aFooBar", new string[] { "Foo", "Bar" }, new int[] { 1, 4 }, new int[] { 4, 7 }); |
| CheckRandomData(Random, a, 100); |
| } |
| |
| /// <summary> |
| /// Test a configuration that behaves a lot like StopAnalyzer </summary> |
| [Test] |
| public virtual void TestStop() |
| { |
| Analyzer a = new MockAnalyzer(Random, MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET); |
| AssertAnalyzesTo(a, "the quick brown a fox", new string[] { "quick", "brown", "fox" }, new int[] { 2, 1, 2 }); |
| } |
| |
| /// <summary> |
| /// Test a configuration that behaves a lot like KeepWordFilter </summary> |
| [Test] |
| public virtual void TestKeep() |
| { |
| CharacterRunAutomaton keepWords = new CharacterRunAutomaton(BasicOperations.Complement(Automaton.Union(new Automaton[] { BasicAutomata.MakeString("foo"), BasicAutomata.MakeString("bar") }))); |
| Analyzer a = new MockAnalyzer(Random, MockTokenizer.SIMPLE, true, keepWords); |
| AssertAnalyzesTo(a, "quick foo brown bar bar fox foo", new string[] { "foo", "bar", "bar", "foo" }, new int[] { 2, 2, 1, 2 }); |
| } |
| |
| /// <summary> |
| /// Test a configuration that behaves a lot like LengthFilter </summary> |
| [Test] |
| public virtual void TestLength() |
| { |
| CharacterRunAutomaton length5 = new CharacterRunAutomaton((new RegExp(".{5,}")).ToAutomaton()); |
| Analyzer a = new MockAnalyzer(Random, MockTokenizer.WHITESPACE, true, length5); |
| AssertAnalyzesTo(a, "ok toolong fine notfine", new string[] { "ok", "fine" }, new int[] { 1, 2 }); |
| } |
| |
| /// <summary> |
| /// Test MockTokenizer encountering a too long token </summary> |
| [Test] |
| public virtual void TestTooLongToken() |
| { |
| Analyzer whitespace = new AnalyzerAnonymousInnerClassHelper(this); |
| AssertTokenStreamContents(whitespace.GetTokenStream("bogus", new StringReader("test 123 toolong ok ")), new string[] { "test", "123", "toolo", "ng", "ok" }, new int[] { 0, 5, 9, 14, 17 }, new int[] { 4, 8, 14, 16, 19 }, new int?(20)); |
| AssertTokenStreamContents(whitespace.GetTokenStream("bogus", new StringReader("test 123 toolo")), new string[] { "test", "123", "toolo" }, new int[] { 0, 5, 9 }, new int[] { 4, 8, 14 }, new int?(14)); |
| } |
| |
| private class AnalyzerAnonymousInnerClassHelper : Analyzer |
| { |
| private readonly TestMockAnalyzer OuterInstance; |
| |
| public AnalyzerAnonymousInnerClassHelper(TestMockAnalyzer outerInstance) |
| { |
| this.OuterInstance = outerInstance; |
| } |
| |
| protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader) |
| { |
| Tokenizer t = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false, 5); |
| return new TokenStreamComponents(t, t); |
| } |
| } |
| |
| [Test] |
| public virtual void TestLUCENE_3042() |
| { |
| string testString = "t"; |
| |
| Analyzer analyzer = new MockAnalyzer(Random); |
| Exception priorException = null; |
| TokenStream stream = analyzer.GetTokenStream("dummy", new StringReader(testString)); |
| try |
| { |
| stream.Reset(); |
| while (stream.IncrementToken()) |
| { |
| // consume |
| } |
| stream.End(); |
| } |
| catch (Exception e) |
| { |
| priorException = e; |
| } |
| finally |
| { |
| IOUtils.DisposeWhileHandlingException(priorException, stream); |
| } |
| |
| AssertAnalyzesTo(analyzer, testString, new string[] { "t" }); |
| } |
| |
| /// <summary> |
| /// blast some random strings through the analyzer </summary> |
| [Test] |
| public virtual void TestRandomStrings() |
| { |
| CheckRandomData(Random, new MockAnalyzer(Random), AtLeast(1000)); |
| } |
| |
| /// <summary> |
| /// blast some random strings through differently configured tokenizers </summary> |
| [Test, LongRunningTest] |
| public virtual void TestRandomRegexps() |
| { |
| int iters = AtLeast(30); |
| for (int i = 0; i < iters; i++) |
| { |
| CharacterRunAutomaton dfa = new CharacterRunAutomaton(AutomatonTestUtil.RandomAutomaton(Random)); |
| bool lowercase = Random.NextBoolean(); |
| int limit = TestUtil.NextInt32(Random, 0, 500); |
| Analyzer a = new AnalyzerAnonymousInnerClassHelper2(this, dfa, lowercase, limit); |
| CheckRandomData(Random, a, 100); |
| a.Dispose(); |
| } |
| } |
| |
| private class AnalyzerAnonymousInnerClassHelper2 : Analyzer |
| { |
| private readonly TestMockAnalyzer OuterInstance; |
| |
| private CharacterRunAutomaton Dfa; |
| private bool Lowercase; |
| private int Limit; |
| |
| public AnalyzerAnonymousInnerClassHelper2(TestMockAnalyzer outerInstance, CharacterRunAutomaton dfa, bool lowercase, int limit) |
| { |
| this.OuterInstance = outerInstance; |
| this.Dfa = dfa; |
| this.Lowercase = lowercase; |
| this.Limit = limit; |
| } |
| |
| protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader) |
| { |
| Tokenizer t = new MockTokenizer(reader, Dfa, Lowercase, Limit); |
| return new TokenStreamComponents(t, t); |
| } |
| } |
| |
        [Test]
        public virtual void TestForwardOffsets()
        {
            // Analyze many random HTML-ish strings and check the pass completes
            // cleanly (the consume loop throws if the stream misbehaves).
            int num = AtLeast(10000);
            for (int i = 0; i < num; i++)
            {
                string s = TestUtil.RandomHtmlishString(Random, 20);
                StringReader reader = new StringReader(s);
                MockCharFilter charfilter = new MockCharFilter(reader, 2);
                MockAnalyzer analyzer = new MockAnalyzer(Random);
                Exception priorException = null;
                // NOTE(review): this feeds charfilter.m_input (the wrapped reader)
                // to the analyzer, which appears to bypass the char filter itself —
                // confirm whether that is intentional or should be `charfilter`.
                TokenStream ts = analyzer.GetTokenStream("bogus", charfilter.m_input);
                try
                {
                    ts.Reset();
                    while (ts.IncrementToken())
                    {
                        ; // consume all tokens; only completion matters here
                    }
                    ts.End();
                }
                catch (Exception e)
                {
                    // remember the failure so disposal can re-throw it
                    priorException = e;
                }
                finally
                {
                    IOUtils.DisposeWhileHandlingException(priorException, ts);
                }
            }
        }
| |
| [Test] |
| public virtual void TestWrapReader() |
| { |
| // LUCENE-5153: test that wrapping an analyzer's reader is allowed |
| Random random = Random; |
| |
| Analyzer @delegate = new MockAnalyzer(random); |
| Analyzer a = new AnalyzerWrapperAnonymousInnerClassHelper(this, @delegate.Strategy, @delegate); |
| |
| CheckOneTerm(a, "abc", "aabc"); |
| } |
| |
| private class AnalyzerWrapperAnonymousInnerClassHelper : AnalyzerWrapper |
| { |
| private readonly TestMockAnalyzer OuterInstance; |
| |
| private Analyzer @delegate; |
| |
| public AnalyzerWrapperAnonymousInnerClassHelper(TestMockAnalyzer outerInstance, ReuseStrategy getReuseStrategy, Analyzer @delegate) |
| : base(getReuseStrategy) |
| { |
| this.OuterInstance = outerInstance; |
| this.@delegate = @delegate; |
| } |
| |
| protected override TextReader WrapReader(string fieldName, TextReader reader) |
| { |
| return new MockCharFilter(reader, 7); |
| } |
| |
| protected override TokenStreamComponents WrapComponents(string fieldName, TokenStreamComponents components) |
| { |
| return components; |
| } |
| |
| protected override Analyzer GetWrappedAnalyzer(string fieldName) |
| { |
| return @delegate; |
| } |
| } |
| |
        [Test]
        public virtual void TestChangeGaps()
        {
            // LUCENE-5324: check that it is possible to change the wrapper's gaps
            int positionGap = Random.Next(1000);
            int offsetGap = Random.Next(1000);
            Analyzer @delegate = new MockAnalyzer(Random);
            // wrapper reports the random gaps above via GetPositionIncrementGap/GetOffsetGap
            Analyzer a = new AnalyzerWrapperAnonymousInnerClassHelper2(this, @delegate.Strategy, positionGap, offsetGap, @delegate);

            RandomIndexWriter writer = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
                this,
#endif
                Random, NewDirectory());
            Document doc = new Document();
            // store term vectors with positions and offsets so the gaps are observable
            FieldType ft = new FieldType();
            ft.IsIndexed = true;
            ft.IndexOptions = IndexOptions.DOCS_ONLY;
            ft.IsTokenized = true;
            ft.StoreTermVectors = true;
            ft.StoreTermVectorPositions = true;
            ft.StoreTermVectorOffsets = true;
            // two instances of the same field => the gaps are inserted between them
            doc.Add(new Field("f", "a", ft));
            doc.Add(new Field("f", "a", ft));
            writer.AddDocument(doc, a);
            AtomicReader reader = GetOnlySegmentReader(writer.GetReader());
            Fields fields = reader.GetTermVectors(0);
            Terms terms = fields.GetTerms("f");
            TermsEnum te = terms.GetIterator(null);
            Assert.AreEqual(new BytesRef("a"), te.Next());
            DocsAndPositionsEnum dpe = te.DocsAndPositions(null, null);
            Assert.AreEqual(0, dpe.NextDoc());
            Assert.AreEqual(2, dpe.Freq);
            // first occurrence of "a": position 0, offset 0
            Assert.AreEqual(0, dpe.NextPosition());
            Assert.AreEqual(0, dpe.StartOffset);
            int endOffset = dpe.EndOffset;
            // second occurrence: position and offset shifted by the configured gaps
            Assert.AreEqual(1 + positionGap, dpe.NextPosition());
            Assert.AreEqual(1 + endOffset + offsetGap, dpe.EndOffset);
            // "a" was the only term
            Assert.AreEqual(null, te.Next());
            reader.Dispose();
            writer.Dispose();
            writer.IndexWriter.Directory.Dispose();
        }
| |
| private class AnalyzerWrapperAnonymousInnerClassHelper2 : AnalyzerWrapper |
| { |
| private readonly TestMockAnalyzer OuterInstance; |
| |
| private int PositionGap; |
| private int OffsetGap; |
| private Analyzer @delegate; |
| |
| public AnalyzerWrapperAnonymousInnerClassHelper2(TestMockAnalyzer outerInstance, ReuseStrategy getReuseStrategy, int positionGap, int offsetGap, Analyzer @delegate) |
| : base(getReuseStrategy) |
| { |
| this.OuterInstance = outerInstance; |
| this.PositionGap = positionGap; |
| this.OffsetGap = offsetGap; |
| this.@delegate = @delegate; |
| } |
| |
| protected override Analyzer GetWrappedAnalyzer(string fieldName) |
| { |
| return @delegate; |
| } |
| |
| public override int GetPositionIncrementGap(string fieldName) |
| { |
| return PositionGap; |
| } |
| |
| public override int GetOffsetGap(string fieldName) |
| { |
| return OffsetGap; |
| } |
| } |
| } |
| } |