src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs - lucenenet - Git at Google

 using ICU4NET;
 using ICU4NETExtension;
 using Lucene.Net.Analysis.Tokenattributes;
 using Lucene.Net.Analysis.Util;
 using Lucene.Net.Support;
 using System;
 using System.Collections.Generic;
 using System.IO;
 using System.Linq;
 using System.Text.RegularExpressions;

 namespace Lucene.Net.Analysis.Th
 {
     /*
 	 * Licensed to the Apache Software Foundation (ASF) under one or more
 	 * contributor license agreements.  See the NOTICE file distributed with
 	 * this work for additional information regarding copyright ownership.
 	 * The ASF licenses this file to You under the Apache License, Version 2.0
 	 * (the "License"); you may not use this file except in compliance with
 	 * the License.  You may obtain a copy of the License at
 	 *
 	 *     http://www.apache.org/licenses/LICENSE-2.0
 	 *
 	 * Unless required by applicable law or agreed to in writing, software
 	 * distributed under the License is distributed on an "AS IS" BASIS,
 	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 	 * See the License for the specific language governing permissions and
 	 * limitations under the License.
 	 */

     /// <summary>
     /// Tokenizer that uses a <see cref="BreakIterator"/> to tokenize Thai text.
     /// <para>
     /// Sentences are delimited by the sentence <see cref="BreakIterator"/> passed to
     /// <see cref="SegmentingTokenizerBase"/>; every sentence is then split into words by a
     /// <see cref="ThaiWordBreaker"/> wrapping a word <see cref="BreakIterator"/>.
     /// </para>
     /// <para>WARNING: this tokenizer only works when the underlying word break iterator
     ///    performs dictionary-based segmentation of Thai (see <see cref="DBBI_AVAILABLE"/>).
     ///    NOTE(review): the original documentation described this as a JRE capability and
     ///    recommended ICUTokenizer as the portable alternative - confirm what applies to this port.
     /// </para>
     /// </summary>
     public class ThaiTokenizer : SegmentingTokenizerBase
     {
         /// <summary>
         /// True if the word break iterator reports a boundary at index 4 of the probe
         /// text "ภาษาไทย", i.e. a working dictionary-based break iterator for Thai is present.
         /// If this is false, the constructors throw <see cref="NotSupportedException"/>.
         /// </summary>
         public static readonly bool DBBI_AVAILABLE;
         private static readonly BreakIterator proto = BreakIterator.CreateWordInstance(Locale.GetUS());
         static ThaiTokenizer()
         {
             // check that we have a working dictionary-based break iterator for thai
             proto.SetText("ภาษาไทย");
             DBBI_AVAILABLE = proto.IsBoundary(4);
         }

         private readonly ThaiWordBreaker wordBreaker;
         private readonly CharArrayIterator wrapper = CharArrayIterator.NewWordInstance();

         // Bounds of the current sentence, as handed to SetNextSentence.
         internal int sentenceStart;
         internal int sentenceEnd;

         private readonly ICharTermAttribute termAtt;
         private readonly IOffsetAttribute offsetAtt;

         /// <summary>
         /// Creates a new ThaiTokenizer using the default attribute factory.
         /// </summary>
         /// <param name="reader">The text source to tokenize.</param>
         /// <exception cref="NotSupportedException">If <see cref="DBBI_AVAILABLE"/> is false.</exception>
         public ThaiTokenizer(TextReader reader)
               : this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, reader)
         {
         }

         /// <summary>
         /// Creates a new ThaiTokenizer, supplying the AttributeFactory.
         /// </summary>
         /// <param name="factory">The attribute factory passed to the base tokenizer.</param>
         /// <param name="reader">The text source to tokenize.</param>
         /// <exception cref="NotSupportedException">If <see cref="DBBI_AVAILABLE"/> is false.</exception>
         public ThaiTokenizer(AttributeFactory factory, TextReader reader)
               : base(factory, reader, BreakIterator.CreateSentenceInstance(Locale.GetUS()))
         {
             if (!DBBI_AVAILABLE)
             {
                 throw new System.NotSupportedException("This JRE does not have support for Thai segmentation");
             }
             wordBreaker = new ThaiWordBreaker(BreakIterator.CreateWordInstance(Locale.GetUS()));
             termAtt = AddAttribute<ICharTermAttribute>();
             offsetAtt = AddAttribute<IOffsetAttribute>();
         }

         /// <summary>
         /// Records the bounds of the next sentence and hands its text to the word breaker.
         /// </summary>
         /// <param name="sentenceStart">Start index of the sentence within the buffer.</param>
         /// <param name="sentenceEnd">End index of the sentence within the buffer.</param>
         protected internal override void SetNextSentence(int sentenceStart, int sentenceEnd)
         {
             this.sentenceStart = sentenceStart;
             this.sentenceEnd = sentenceEnd;
             wrapper.SetText(buffer, sentenceStart, sentenceEnd - sentenceStart);
             wordBreaker.SetText(new string(wrapper.Text, wrapper.Start, wrapper.Length));
         }

         /// <summary>
         /// Advances to the next word of the current sentence and populates the term and
         /// offset attributes with it.
         /// </summary>
         /// <returns>true if a word was produced; false once the word breaker is exhausted.</returns>
         protected internal override bool IncrementWord()
         {
             int start = wordBreaker.Current();
             if (start == BreakIterator.DONE)
             {
                 return false; // BreakIterator exhausted
             }

             // find the next set of boundaries, skipping over non-tokens
             int end = wordBreaker.Next();
             while (end != BreakIterator.DONE &&
                 !IsLetterOrDigit(Character.CodePointAt(buffer, sentenceStart + start, sentenceEnd)))
             {
                 start = end;
                 end = wordBreaker.Next();
             }

             if (end == BreakIterator.DONE)
             {
                 return false; // BreakIterator exhausted
             }

             ClearAttributes();
             termAtt.CopyBuffer(buffer, sentenceStart + start, end - start);
             offsetAtt.SetOffset(CorrectOffset(offset + sentenceStart + start), CorrectOffset(offset + sentenceStart + end));
             return true;
         }

         /// <summary>
         /// Code-point aware variant of <see cref="char.IsLetterOrDigit(char)"/>.
         /// Casting a supplementary code point to <see cref="char"/> would truncate it to an
         /// unrelated BMP character, so such code points are classified via their UTF-16 form.
         /// </summary>
         /// <param name="codePoint">The Unicode code point to classify.</param>
         /// <returns>true if the code point is a letter or a digit.</returns>
         private static bool IsLetterOrDigit(int codePoint)
         {
             if (codePoint <= char.MaxValue)
             {
                 // Fits in a single UTF-16 unit (this also covers unpaired surrogates,
                 // which char.IsLetterOrDigit reports as neither letter nor digit).
                 return char.IsLetterOrDigit((char)codePoint);
             }
             return char.IsLetterOrDigit(char.ConvertFromUtf32(codePoint), 0);
         }
     }

     /// <summary>
     /// LUCENENET specific class to patch the behavior of the ICU BreakIterator.
     /// Corrects the breaking of words by finding transitions between Thai and non-Thai
     /// characters.
     ///
     /// This logic assumes that the Java BreakIterator also breaks up Thai numerals from
     /// Arabic numerals (1, 2, 3, etc.). That is, it assumes the first test below passes
     /// and the second test fails in Lucene (not attempted).
     ///
     /// ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET);
     /// AssertAnalyzesTo(analyzer, "๑๒๓456", new string[] { "๑๒๓", "456" });
     /// AssertAnalyzesTo(analyzer, "๑๒๓456", new string[] { "๑๒๓456" });
     /// </summary>
     internal class ThaiWordBreaker
     {
         private readonly BreakIterator wordBreaker;
         private string text;

         // Boundaries found inside the most recent segment of the wrapped iterator that
         // have not been consumed yet; the head of the list is the current position.
         private readonly IList<int> transitions = new List<int>();

         /// <summary>
         /// Wraps the given word break iterator.
         /// </summary>
         /// <param name="wordBreaker">The iterator whose segments are refined.</param>
         /// <exception cref="ArgumentNullException">If <paramref name="wordBreaker"/> is null.</exception>
         public ThaiWordBreaker(BreakIterator wordBreaker)
         {
             if (wordBreaker == null)
             {
                 throw new ArgumentNullException("wordBreaker");
             }
             this.wordBreaker = wordBreaker;
         }

         /// <summary>
         /// Sets the text to be segmented and restarts iteration.
         /// </summary>
         /// <param name="text">The text to break into words.</param>
         public void SetText(string text)
         {
             this.text = text;
             // Drop boundaries left over from a previous text that was not iterated to
             // its end; otherwise they would be reported as positions in the new text.
             transitions.Clear();
             wordBreaker.SetText(text);
         }

         /// <summary>
         /// Returns the current boundary: the head of the pending transitions when there
         /// are any, otherwise the wrapped iterator's current position.
         /// </summary>
         public int Current()
         {
             if (transitions.Count > 0)
             {
                 return transitions[0];
             }
             return wordBreaker.Current();
         }

         /// <summary>
         /// Advances to the next boundary and returns it. Pending transitions are consumed
         /// first; once they run out the wrapped iterator is advanced.
         /// </summary>
         /// <returns>The next boundary, or whatever the wrapped iterator returns when exhausted.</returns>
         public int Next()
         {
             if (transitions.Count > 0)
             {
                 transitions.RemoveAt(0);
             }
             if (transitions.Count > 0)
             {
                 return transitions[0];
             }
             return GetNext();
         }

         /// <summary>
         /// Advances the wrapped iterator by one segment and records every index inside that
         /// segment where the text switches between Thai and non-Thai letters/digits.
         /// </summary>
         /// <returns>
         /// The first transition inside the segment when there is one, otherwise the
         /// boundary returned by the wrapped iterator.
         /// </returns>
         private int GetNext()
         {
             int prev = wordBreaker.Current();
             int current = wordBreaker.Next();

             if (current != BreakIterator.DONE && current - prev > 0)
             {
                 bool prevWasThai = false, prevWasNonThai = false;

                 // Find all of the transitions between Thai and non-Thai characters and digits
                 for (int i = prev; i < current; i++)
                 {
                     char c = text[i];
                     bool isThai = IsThai(c);
                     bool isNonThai = char.IsLetterOrDigit(c) && !isThai;

                     if ((prevWasThai && isNonThai) ||
                         (prevWasNonThai && isThai))
                     {
                         transitions.Add(i);
                     }

                     // record the values for comparison with the next loop
                     prevWasThai = isThai;
                     prevWasNonThai = isNonThai;
                 }

                 if (transitions.Count > 0)
                 {
                     // The end of the segment is the final boundary to report.
                     transitions.Add(current);
                     return transitions[0];
                 }
             }

             return current;
         }

         /// <summary>
         /// Tests whether the character lies in the Unicode Thai block (U+0E00-U+0E7F),
         /// the same set matched by the regular expression class \p{IsThai}.
         /// </summary>
         private static bool IsThai(char c)
         {
             return c >= '\u0E00' && c <= '\u0E7F';
         }
     }
 }
	using ICU4NET;
	using ICU4NETExtension;
	using Lucene.Net.Analysis.Tokenattributes;
	using Lucene.Net.Analysis.Util;
	using Lucene.Net.Support;
	using System;
	using System.Collections.Generic;
	using System.IO;
	using System.Linq;
	using System.Text.RegularExpressions;

	namespace Lucene.Net.Analysis.Th
	{
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	/// <summary>
	/// Tokenizer that uses a <see cref="BreakIterator"/> to tokenize Thai text.
	/// <para>
	/// Sentences are delimited by the sentence <see cref="BreakIterator"/> passed to
	/// <see cref="SegmentingTokenizerBase"/>; every sentence is then split into words by a
	/// <see cref="ThaiWordBreaker"/> wrapping a word <see cref="BreakIterator"/>.
	/// </para>
	/// <para>WARNING: this tokenizer only works when the underlying word break iterator
	/// performs dictionary-based segmentation of Thai (see <see cref="DBBI_AVAILABLE"/>).
	/// NOTE(review): the original documentation described this as a JRE capability and
	/// recommended ICUTokenizer as the portable alternative - confirm what applies to this port.
	/// </para>
	/// </summary>
	public class ThaiTokenizer : SegmentingTokenizerBase
	{
		/// <summary>
		/// True if the word break iterator reports a boundary at index 4 of the probe
		/// text "ภาษาไทย", i.e. a working dictionary-based break iterator for Thai is present.
		/// If this is false, the constructors throw <see cref="NotSupportedException"/>.
		/// </summary>
		public static readonly bool DBBI_AVAILABLE;
		private static readonly BreakIterator proto = BreakIterator.CreateWordInstance(Locale.GetUS());
		static ThaiTokenizer()
		{
			// check that we have a working dictionary-based break iterator for thai
			proto.SetText("ภาษาไทย");
			DBBI_AVAILABLE = proto.IsBoundary(4);
		}

		private readonly ThaiWordBreaker wordBreaker;
		private readonly CharArrayIterator wrapper = CharArrayIterator.NewWordInstance();

		// Bounds of the current sentence, as handed to SetNextSentence.
		internal int sentenceStart;
		internal int sentenceEnd;

		private readonly ICharTermAttribute termAtt;
		private readonly IOffsetAttribute offsetAtt;

		/// <summary>
		/// Creates a new ThaiTokenizer using the default attribute factory.
		/// </summary>
		/// <param name="reader">The text source to tokenize.</param>
		/// <exception cref="NotSupportedException">If <see cref="DBBI_AVAILABLE"/> is false.</exception>
		public ThaiTokenizer(TextReader reader)
			: this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, reader)
		{
		}

		/// <summary>
		/// Creates a new ThaiTokenizer, supplying the AttributeFactory.
		/// </summary>
		/// <param name="factory">The attribute factory passed to the base tokenizer.</param>
		/// <param name="reader">The text source to tokenize.</param>
		/// <exception cref="NotSupportedException">If <see cref="DBBI_AVAILABLE"/> is false.</exception>
		public ThaiTokenizer(AttributeFactory factory, TextReader reader)
			: base(factory, reader, BreakIterator.CreateSentenceInstance(Locale.GetUS()))
		{
			if (!DBBI_AVAILABLE)
			{
				throw new System.NotSupportedException("This JRE does not have support for Thai segmentation");
			}
			wordBreaker = new ThaiWordBreaker(BreakIterator.CreateWordInstance(Locale.GetUS()));
			termAtt = AddAttribute<ICharTermAttribute>();
			offsetAtt = AddAttribute<IOffsetAttribute>();
		}

		/// <summary>
		/// Records the bounds of the next sentence and hands its text to the word breaker.
		/// </summary>
		/// <param name="sentenceStart">Start index of the sentence within the buffer.</param>
		/// <param name="sentenceEnd">End index of the sentence within the buffer.</param>
		protected internal override void SetNextSentence(int sentenceStart, int sentenceEnd)
		{
			this.sentenceStart = sentenceStart;
			this.sentenceEnd = sentenceEnd;
			wrapper.SetText(buffer, sentenceStart, sentenceEnd - sentenceStart);
			wordBreaker.SetText(new string(wrapper.Text, wrapper.Start, wrapper.Length));
		}

		/// <summary>
		/// Advances to the next word of the current sentence and populates the term and
		/// offset attributes with it.
		/// </summary>
		/// <returns>true if a word was produced; false once the word breaker is exhausted.</returns>
		protected internal override bool IncrementWord()
		{
			int start = wordBreaker.Current();
			if (start == BreakIterator.DONE)
			{
				return false; // BreakIterator exhausted
			}

			// find the next set of boundaries, skipping over non-tokens
			int end = wordBreaker.Next();
			while (end != BreakIterator.DONE &&
				!IsLetterOrDigit(Character.CodePointAt(buffer, sentenceStart + start, sentenceEnd)))
			{
				start = end;
				end = wordBreaker.Next();
			}

			if (end == BreakIterator.DONE)
			{
				return false; // BreakIterator exhausted
			}

			ClearAttributes();
			termAtt.CopyBuffer(buffer, sentenceStart + start, end - start);
			offsetAtt.SetOffset(CorrectOffset(offset + sentenceStart + start), CorrectOffset(offset + sentenceStart + end));
			return true;
		}

		/// <summary>
		/// Code-point aware variant of <see cref="char.IsLetterOrDigit(char)"/>.
		/// Casting a supplementary code point to <see cref="char"/> would truncate it to an
		/// unrelated BMP character, so such code points are classified via their UTF-16 form.
		/// </summary>
		/// <param name="codePoint">The Unicode code point to classify.</param>
		/// <returns>true if the code point is a letter or a digit.</returns>
		private static bool IsLetterOrDigit(int codePoint)
		{
			if (codePoint <= char.MaxValue)
			{
				// Fits in a single UTF-16 unit (this also covers unpaired surrogates,
				// which char.IsLetterOrDigit reports as neither letter nor digit).
				return char.IsLetterOrDigit((char)codePoint);
			}
			return char.IsLetterOrDigit(char.ConvertFromUtf32(codePoint), 0);
		}
	}

	/// <summary>
	/// LUCENENET specific class to patch the behavior of the ICU BreakIterator.
	/// Corrects the breaking of words by finding transitions between Thai and non-Thai
	/// characters.
	///
	/// This logic assumes that the Java BreakIterator also breaks up Thai numerals from
	/// Arabic numerals (1, 2, 3, etc.). That is, it assumes the first test below passes
	/// and the second test fails in Lucene (not attempted).
	///
	/// ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET);
	/// AssertAnalyzesTo(analyzer, "๑๒๓456", new string[] { "๑๒๓", "456" });
	/// AssertAnalyzesTo(analyzer, "๑๒๓456", new string[] { "๑๒๓456" });
	/// </summary>
	internal class ThaiWordBreaker
	{
		private readonly BreakIterator wordBreaker;
		private string text;

		// Boundaries found inside the most recent segment of the wrapped iterator that
		// have not been consumed yet; the head of the list is the current position.
		private readonly IList<int> transitions = new List<int>();

		/// <summary>
		/// Wraps the given word break iterator.
		/// </summary>
		/// <param name="wordBreaker">The iterator whose segments are refined.</param>
		/// <exception cref="ArgumentNullException">If <paramref name="wordBreaker"/> is null.</exception>
		public ThaiWordBreaker(BreakIterator wordBreaker)
		{
			if (wordBreaker == null)
			{
				throw new ArgumentNullException("wordBreaker");
			}
			this.wordBreaker = wordBreaker;
		}

		/// <summary>
		/// Sets the text to be segmented and restarts iteration.
		/// </summary>
		/// <param name="text">The text to break into words.</param>
		public void SetText(string text)
		{
			this.text = text;
			// Drop boundaries left over from a previous text that was not iterated to
			// its end; otherwise they would be reported as positions in the new text.
			transitions.Clear();
			wordBreaker.SetText(text);
		}

		/// <summary>
		/// Returns the current boundary: the head of the pending transitions when there
		/// are any, otherwise the wrapped iterator's current position.
		/// </summary>
		public int Current()
		{
			if (transitions.Count > 0)
			{
				return transitions[0];
			}
			return wordBreaker.Current();
		}

		/// <summary>
		/// Advances to the next boundary and returns it. Pending transitions are consumed
		/// first; once they run out the wrapped iterator is advanced.
		/// </summary>
		/// <returns>The next boundary, or whatever the wrapped iterator returns when exhausted.</returns>
		public int Next()
		{
			if (transitions.Count > 0)
			{
				transitions.RemoveAt(0);
			}
			if (transitions.Count > 0)
			{
				return transitions[0];
			}
			return GetNext();
		}

		/// <summary>
		/// Advances the wrapped iterator by one segment and records every index inside that
		/// segment where the text switches between Thai and non-Thai letters/digits.
		/// </summary>
		/// <returns>
		/// The first transition inside the segment when there is one, otherwise the
		/// boundary returned by the wrapped iterator.
		/// </returns>
		private int GetNext()
		{
			int prev = wordBreaker.Current();
			int current = wordBreaker.Next();

			if (current != BreakIterator.DONE && current - prev > 0)
			{
				bool prevWasThai = false, prevWasNonThai = false;

				// Find all of the transitions between Thai and non-Thai characters and digits
				for (int i = prev; i < current; i++)
				{
					char c = text[i];
					bool isThai = IsThai(c);
					bool isNonThai = char.IsLetterOrDigit(c) && !isThai;

					if ((prevWasThai && isNonThai) ||
						(prevWasNonThai && isThai))
					{
						transitions.Add(i);
					}

					// record the values for comparison with the next loop
					prevWasThai = isThai;
					prevWasNonThai = isNonThai;
				}

				if (transitions.Count > 0)
				{
					// The end of the segment is the final boundary to report.
					transitions.Add(current);
					return transitions[0];
				}
			}

			return current;
		}

		/// <summary>
		/// Tests whether the character lies in the Unicode Thai block (U+0E00-U+0E7F),
		/// the same set matched by the regular expression class \p{IsThai}.
		/// </summary>
		private static bool IsThai(char c)
		{
			return c >= '\u0E00' && c <= '\u0E7F';
		}
	}
	}