using ICU4NET;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Tokenattributes;
using Lucene.Net.Analysis.Util;
using Lucene.Net.Support;
using NUnit.Framework;
using System.IO;
using System.Text;
namespace Lucene.Net.Analysis.Util
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// Basic tests for <seealso cref="SegmentingTokenizerBase"/> </summary>
[TestFixture]
public class TestSegmentingTokenizerBase : BaseTokenStreamTestCase
{
private Analyzer sentence = new AnalyzerAnonymousInnerClassHelper();
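// Mirrors the Java original's anonymous Analyzer subclass: every field is analyzed as whole sentences.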
private class AnalyzerAnonymousInnerClassHelper : Analyzer
{
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
return new TokenStreamComponents(new WholeSentenceTokenizer(reader));
}
}
private Analyzer sentenceAndWord = new AnalyzerAnonymousInnerClassHelper2();
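// Same pattern as above, but tokenizes into words while tracking sentence boundaries via position increments.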
private class AnalyzerAnonymousInnerClassHelper2 : Analyzer
{
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
return new TokenStreamComponents(new SentenceAndWordTokenizer(reader));
}
}
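// The sentence BreakIterator should not split on the abbreviation "U.S.", but should split after the quoted question.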
[Test]
public virtual void TestBasics()
{
AssertAnalyzesTo(sentence, "The acronym for United States is U.S. but this doesn't end a sentence", new[] { "The acronym for United States is U.S. but this doesn't end a sentence" });
AssertAnalyzesTo(sentence, "He said, \"Are you going?\" John shook his head.", new[] { "He said, \"Are you going?\" ", "John shook his head." });
}
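// Verifies terms, offsets, and position increments together; "John" gets posInc 2 because it opens a new sentence.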
[Test]
public virtual void TestCustomAttributes()
{
AssertAnalyzesTo(sentenceAndWord, "He said, \"Are you going?\" John shook his head.", new[] { "He", "said", "Are", "you", "going", "John", "shook", "his", "head" }, new[] { 0, 3, 10, 14, 18, 26, 31, 37, 41 }, new[] { 2, 7, 13, 17, 23, 30, 36, 40, 45 }, new[] { 1, 1, 1, 1, 1, 2, 1, 1, 1 });
}
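// Analyzing two inputs back-to-back exercises Reset(), which must restore posBoost to -1 between documents.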
[Test]
public virtual void TestReuse()
{
AssertAnalyzesTo(sentenceAndWord, "He said, \"Are you going?\"", new[] { "He", "said", "Are", "you", "going" }, new[] { 0, 3, 10, 14, 18 }, new[] { 2, 7, 13, 17, 23 }, new[] { 1, 1, 1, 1, 1 });
AssertAnalyzesTo(sentenceAndWord, "John shook his head.", new[] { "John", "shook", "his", "head" }, new[] { 0, 5, 11, 15 }, new[] { 4, 10, 14, 19 }, new[] { 1, 1, 1, 1 });
}
[Test]
public virtual void TestEnd()
{
// BaseTokenStreamTestCase asserts that end() is set to our StringReader's length for us here.
// we add some junk whitespace to the end just to test it.
AssertAnalyzesTo(sentenceAndWord, "John shook his head ", new[] { "John", "shook", "his", "head" });
AssertAnalyzesTo(sentenceAndWord, "John shook his head. ", new[] { "John", "shook", "his", "head" });
}
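// The 4094 newlines push the text past the base class's internal read buffer
// (1024 chars in the upstream Java implementation), so the tokens only appear
// after at least one buffer refill.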
[Test]
public virtual void TestHugeDoc()
{
var sb = new StringBuilder();
var whitespace = new char[4094];
Arrays.Fill(whitespace, '\n');
sb.Append(whitespace);
sb.Append("testing 1234");
var input = sb.ToString();
AssertAnalyzesTo(sentenceAndWord, input, new[] { "testing", "1234" });
}
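// With no sentence breaks anywhere, each buffer load is treated as its own
// sentence, so 10240 chars come out as ten identical 1024-char tokens.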
[Test]
public virtual void TestHugeTerm()
{
var sb = new StringBuilder();
for (int i = 0; i < 10240; i++)
{
sb.Append('a');
}
var input = sb.ToString();
var token = new char[1024];
Arrays.Fill(token, 'a');
var expectedToken = new string(token);
var expected = new[] { expectedToken, expectedToken, expectedToken, expectedToken, expectedToken, expectedToken, expectedToken, expectedToken, expectedToken, expectedToken };
AssertAnalyzesTo(sentence, input, expected);
}
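// blast some random strings through both analyzers to shake out buffer-boundary bugs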
[Test]
public virtual void TestRandomStrings()
{
CheckRandomData(Random(), sentence, 10000 * RANDOM_MULTIPLIER);
CheckRandomData(Random(), sentenceAndWord, 10000 * RANDOM_MULTIPLIER);
}
// some tokenizers for testing
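// SegmentingTokenizerBase contract, roughly: the base class buffers the input,
// runs its BreakIterator over each buffered chunk, calls SetNextSentence(start, end)
// for every detected sentence, and then drains IncrementWord() until it returns
// false before moving on. The two subclasses below exercise that contract.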
/// <summary>
/// silly tokenizer that just returns whole sentences as tokens </summary>
sealed class WholeSentenceTokenizer : SegmentingTokenizerBase
{
internal int sentenceStart, sentenceEnd;
internal bool hasSentence;
internal ICharTermAttribute termAtt;
internal IOffsetAttribute offsetAtt;
public WholeSentenceTokenizer(TextReader reader)
: base(reader, BreakIterator.CreateSentenceInstance(Locale.GetUS()))
{
termAtt = AddAttribute<ICharTermAttribute>();
offsetAtt = AddAttribute<IOffsetAttribute>();
}
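// The base class invokes this once per BreakIterator sentence boundary; we only record the span for IncrementWord.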
protected internal override void SetNextSentence(int sentenceStart, int sentenceEnd)
{
this.sentenceStart = sentenceStart;
this.sentenceEnd = sentenceEnd;
hasSentence = true;
}
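// Emits the pending sentence as one token; returns false afterwards so the base class advances to the next sentence.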
protected internal override bool IncrementWord()
{
if (hasSentence)
{
hasSentence = false;
ClearAttributes();
termAtt.CopyBuffer(buffer, sentenceStart, sentenceEnd - sentenceStart);
offsetAtt.SetOffset(CorrectOffset(offset + sentenceStart), CorrectOffset(offset + sentenceEnd));
return true;
}
else
{
return false;
}
}
}
/// <summary>
/// simple tokenizer that bumps posinc by 1 for tokens after a
/// sentence boundary, to inhibit phrase queries without slop.
/// </summary>
sealed class SentenceAndWordTokenizer : SegmentingTokenizerBase
{
internal int sentenceStart, sentenceEnd;
internal int wordStart, wordEnd;
internal int posBoost = -1; // initially set to -1 so the first word in the document doesn't get a pos boost
internal ICharTermAttribute termAtt;
internal IOffsetAttribute offsetAtt;
internal IPositionIncrementAttribute posIncAtt;
public SentenceAndWordTokenizer(TextReader reader)
: base(reader, BreakIterator.CreateSentenceInstance(Locale.GetUS()))
{
termAtt = AddAttribute<ICharTermAttribute>();
offsetAtt = AddAttribute<IOffsetAttribute>();
posIncAtt = AddAttribute<IPositionIncrementAttribute>();
}
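// A new sentence rewinds the word cursor to the sentence start and arms a +1 position boost for its first word.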
protected internal override void SetNextSentence(int sentenceStart, int sentenceEnd)
{
this.wordStart = this.wordEnd = this.sentenceStart = sentenceStart;
this.sentenceEnd = sentenceEnd;
posBoost++;
}
public override void Reset()
{
base.Reset();
posBoost = -1;
}
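// Advances past non-alphanumeric chars, then consumes one run of letters/digits as the next word token.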
protected internal override bool IncrementWord()
{
wordStart = wordEnd;
while (wordStart < sentenceEnd)
{
if (char.IsLetterOrDigit(buffer[wordStart]))
{
break;
}
wordStart++;
}
if (wordStart == sentenceEnd)
{
return false;
}
wordEnd = wordStart + 1;
while (wordEnd < sentenceEnd && char.IsLetterOrDigit(buffer[wordEnd]))
{
wordEnd++;
}
ClearAttributes();
termAtt.CopyBuffer(buffer, wordStart, wordEnd - wordStart);
offsetAtt.SetOffset(CorrectOffset(offset + wordStart), CorrectOffset(offset + wordEnd));
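// ClearAttributes() resets the increment to its default of 1; adding posBoost gives the first word after a sentence boundary an increment of 2.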
posIncAtt.PositionIncrement = posIncAtt.PositionIncrement + posBoost;
posBoost = 0;
return true;
}
}
}
}