using ICU4NET;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Tokenattributes;
using Lucene.Net.Analysis.Util;
using Lucene.Net.Support;
using NUnit.Framework;
using System.IO;
using System.Text;

namespace Lucene.Net.Analysis.Util
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements.  See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License.  You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    /// <summary>
    /// Basic tests for <seealso cref="SegmentingTokenizerBase"/>.
    /// Exercises sentence segmentation, per-word attributes, analyzer reuse,
    /// end-offset handling, and inputs larger than the internal buffer.
    /// </summary>
    [TestFixture]
    public class TestSegmentingTokenizerBase : BaseTokenStreamTestCase
    {
        // Analyzer that emits each whole sentence as a single token.
        private Analyzer sentence = new AnalyzerAnonymousInnerClassHelper();

        private class AnalyzerAnonymousInnerClassHelper : Analyzer
        {
            public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
            {
                return new TokenStreamComponents(new WholeSentenceTokenizer(reader));
            }
        }

        // Analyzer that emits word tokens but bumps the position increment
        // at sentence boundaries.
        private Analyzer sentenceAndWord = new AnalyzerAnonymousInnerClassHelper2();

        private class AnalyzerAnonymousInnerClassHelper2 : Analyzer
        {
            public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
            {
                return new TokenStreamComponents(new SentenceAndWordTokenizer(reader));
            }
        }

        [Test]
        public virtual void TestBasics()
        {
            // Abbreviations such as "U.S." must not split the sentence.
            AssertAnalyzesTo(sentence, "The acronym for United States is U.S. but this doesn't end a sentence", new[] { "The acronym for United States is U.S. but this doesn't end a sentence" });
            // Quoted question mark followed by a capitalized word starts a new sentence.
            AssertAnalyzesTo(sentence, "He said, \"Are you going?\" John shook his head.", new[] { "He said, \"Are you going?\" ", "John shook his head." });
        }

        [Test]
        public virtual void TestCustomAttributes()
        {
            // "John" starts the second sentence, so its posInc is 2 rather than 1.
            AssertAnalyzesTo(sentenceAndWord, "He said, \"Are you going?\" John shook his head.", new[] { "He", "said", "Are", "you", "going", "John", "shook", "his", "head" }, new[] { 0, 3, 10, 14, 18, 26, 31, 37, 41 }, new[] { 2, 7, 13, 17, 23, 30, 36, 40, 45 }, new[] { 1, 1, 1, 1, 1, 2, 1, 1, 1 });
        }

        [Test]
        public virtual void TestReuse()
        {
            // Two analyses through the same analyzer instance: state (posBoost,
            // offsets) must fully reset between documents.
            AssertAnalyzesTo(sentenceAndWord, "He said, \"Are you going?\"", new[] { "He", "said", "Are", "you", "going" }, new[] { 0, 3, 10, 14, 18 }, new[] { 2, 7, 13, 17, 23 }, new[] { 1, 1, 1, 1, 1 });
            AssertAnalyzesTo(sentenceAndWord, "John shook his head.", new[] { "John", "shook", "his", "head" }, new[] { 0, 5, 11, 15 }, new[] { 4, 10, 14, 19 }, new[] { 1, 1, 1, 1 });
        }

        [Test]
        public virtual void TestEnd()
        {
            // BaseTokenStreamTestCase asserts that end() is set to our StringReader's length for us here.
            // we add some junk whitespace to the end just to test it.
            AssertAnalyzesTo(sentenceAndWord, "John shook his head ", new[] { "John", "shook", "his", "head" });
            AssertAnalyzesTo(sentenceAndWord, "John shook his head. ", new[] { "John", "shook", "his", "head" });
        }

        [Test]
        public virtual void TestHugeDoc()
        {
            // 4094 newlines push the real content past the tokenizer's internal
            // buffer boundary before the first token appears.
            var input = new string('\n', 4094) + "testing 1234";
            AssertAnalyzesTo(sentenceAndWord, input, new[] { "testing", "1234" });
        }

        [Test]
        public virtual void TestHugeTerm()
        {
            // A single 10240-char run of 'a' must be chopped into ten
            // 1024-char tokens (the base class's maximum token length).
            var input = new string('a', 10240);
            var expectedToken = new string('a', 1024);
            var expected = new string[10];
            for (int i = 0; i < expected.Length; i++)
            {
                expected[i] = expectedToken;
            }
            AssertAnalyzesTo(sentence, input, expected);
        }

        [Test]
        public virtual void TestRandomStrings()
        {
            // Blast random text through both analyzers to shake out state bugs.
            CheckRandomData(Random(), sentence, 10000 * RANDOM_MULTIPLIER);
            CheckRandomData(Random(), sentenceAndWord, 10000 * RANDOM_MULTIPLIER);
        }

        // some tokenizers for testing

        /// <summary>
        /// Silly tokenizer that just returns whole sentences as tokens.
        /// </summary>
        sealed class WholeSentenceTokenizer : SegmentingTokenizerBase
        {
            internal int sentenceStart, sentenceEnd;
            internal bool hasSentence;   // true while the current sentence is still unemitted

            internal ICharTermAttribute termAtt;
            internal IOffsetAttribute offsetAtt;

            public WholeSentenceTokenizer(TextReader reader)
                : base(reader, BreakIterator.CreateSentenceInstance(Locale.GetUS()))
            {
                termAtt = AddAttribute<ICharTermAttribute>();
                offsetAtt = AddAttribute<IOffsetAttribute>();
            }

            protected internal override void SetNextSentence(int sentenceStart, int sentenceEnd)
            {
                this.sentenceStart = sentenceStart;
                this.sentenceEnd = sentenceEnd;
                hasSentence = true;
            }

            protected internal override bool IncrementWord()
            {
                // One "word" per sentence: emit it on the first call, then
                // report exhaustion until the next SetNextSentence.
                if (!hasSentence)
                {
                    return false;
                }

                hasSentence = false;
                ClearAttributes();
                termAtt.CopyBuffer(buffer, sentenceStart, sentenceEnd - sentenceStart);
                offsetAtt.SetOffset(CorrectOffset(offset + sentenceStart), CorrectOffset(offset + sentenceEnd));
                return true;
            }
        }

        /// <summary>
        /// simple tokenizer, that bumps posinc + 1 for tokens after a
        /// sentence boundary to inhibit phrase queries without slop.
        /// </summary>
        sealed class SentenceAndWordTokenizer : SegmentingTokenizerBase
        {
            internal int sentenceStart, sentenceEnd;
            internal int wordStart, wordEnd;
            internal int posBoost = -1; // initially set to -1 so the first word in the document doesn't get a pos boost

            internal ICharTermAttribute termAtt;
            internal IOffsetAttribute offsetAtt;
            internal IPositionIncrementAttribute posIncAtt;

            public SentenceAndWordTokenizer(TextReader reader)
                : base(reader, BreakIterator.CreateSentenceInstance(Locale.GetUS()))
            {
                termAtt = AddAttribute<ICharTermAttribute>();
                offsetAtt = AddAttribute<IOffsetAttribute>();
                posIncAtt = AddAttribute<IPositionIncrementAttribute>();
            }

            protected internal override void SetNextSentence(int sentenceStart, int sentenceEnd)
            {
                this.wordStart = this.wordEnd = this.sentenceStart = sentenceStart;
                this.sentenceEnd = sentenceEnd;
                posBoost++;   // every sentence after the first boosts its first word
            }

            public override void Reset()
            {
                base.Reset();
                posBoost = -1;
            }

            protected internal override bool IncrementWord()
            {
                // Skip any non-word characters between the previous word and the next one.
                wordStart = wordEnd;
                while (wordStart < sentenceEnd && !char.IsLetterOrDigit(buffer[wordStart]))
                {
                    wordStart++;
                }

                // No letters or digits remain in this sentence.
                if (wordStart == sentenceEnd)
                {
                    return false;
                }

                // Extend wordEnd across the run of letters/digits.
                for (wordEnd = wordStart + 1; wordEnd < sentenceEnd && char.IsLetterOrDigit(buffer[wordEnd]); wordEnd++)
                {
                }

                ClearAttributes();
                termAtt.CopyBuffer(buffer, wordStart, wordEnd - wordStart);
                offsetAtt.SetOffset(CorrectOffset(offset + wordStart), CorrectOffset(offset + wordEnd));
                // ClearAttributes resets posInc to 1; add the sentence boost on top.
                posIncAtt.PositionIncrement = posIncAtt.PositionIncrement + posBoost;
                posBoost = 0;
                return true;
            }
        }
    }

}