src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs - lucenenet - Git at Google

 #if FEATURE_BREAKITERATOR
 using ICU4N.Text;
 using J2N;
 using Lucene.Net.Analysis.TokenAttributes;
 using Lucene.Net.Analysis.Util;
 using System;
 using System.Collections.Generic;
 using System.Globalization;
 using System.IO;
 using System.Text.RegularExpressions;

 namespace Lucene.Net.Analysis.Th
 {
     /*
      * Licensed to the Apache Software Foundation (ASF) under one or more
      * contributor license agreements.  See the NOTICE file distributed with
      * this work for additional information regarding copyright ownership.
      * The ASF licenses this file to You under the Apache License, Version 2.0
      * (the "License"); you may not use this file except in compliance with
      * the License.  You may obtain a copy of the License at
      *
      *     http://www.apache.org/licenses/LICENSE-2.0
      *
      * Unless required by applicable law or agreed to in writing, software
      * distributed under the License is distributed on an "AS IS" BASIS,
      * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
      * See the License for the specific language governing permissions and
      * limitations under the License.
      */

     // LUCENENET NOTE: Removing this notice from the doc comment because it is not relevant for our purposes.

     // <para>WARNING: this tokenizer may not be supported by all JREs.
     //    It is known to work with Sun/Oracle and Harmony JREs.
     //    If your application needs to be fully portable, consider using ICUTokenizer instead,
     //    which uses an ICU Thai BreakIterator that will always be available.
     // </para>

     /// <summary>
     /// Tokenizer that uses <see cref="BreakIterator"/> to tokenize Thai text.
     /// <para/>
     /// A sentence <see cref="BreakIterator"/> is handed to the base class, and every sentence
     /// reported through <see cref="SetNextSentence(int, int)"/> is split into words by a
     /// <see cref="ThaiWordBreaker"/> wrapping a Thai word <see cref="BreakIterator"/>.
     /// </summary>
     public class ThaiTokenizer : SegmentingTokenizerBase
     {
         // LUCENENET specific - all BreakIterator access in this class is serialized on this lock as a
         // workaround until BreakIterator is made thread safe (LUCENENET TODO: TO REVERT).
         // NOTE: keep this declared before the prototype fields below; their initializers lock on it
         // and static field initializers run in textual order.
         private static readonly object syncLock = new object();

         // LUCENENET specific - DBBI_AVAILABLE removed because ICU always has a dictionary-based BreakIterator

         /// <summary>
         /// Prototype word iterator (Thai culture); cloned once per tokenizer instance.
         /// </summary>
         private static readonly BreakIterator proto = LoadProto();

         /// <summary>
         /// Prototype sentence iterator (invariant culture); cloned once per tokenizer instance.
         /// </summary>
         private static readonly BreakIterator sentenceProto = LoadSentenceProto();

         private readonly ThaiWordBreaker wordBreaker;
         private readonly CharArrayIterator wrapper = Analysis.Util.CharArrayIterator.NewWordInstance();

         // Bounds (within m_buffer) of the sentence currently being split into words.
         private int sentenceStart;
         private int sentenceEnd;

         private readonly ICharTermAttribute termAtt;
         private readonly IOffsetAttribute offsetAtt;

         /// <summary>
         /// Creates a new <see cref="ThaiTokenizer"/> using the default attribute factory.
         /// </summary>
         /// <param name="reader">The input to tokenize.</param>
         public ThaiTokenizer(TextReader reader)
             : this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, reader)
         {
         }

         /// <summary>
         /// Creates a new <see cref="ThaiTokenizer"/>, supplying the <see cref="Lucene.Net.Util.AttributeSource.AttributeFactory"/>.
         /// </summary>
         /// <param name="factory">The attribute factory passed on to the base class.</param>
         /// <param name="reader">The input to tokenize.</param>
         public ThaiTokenizer(AttributeFactory factory, TextReader reader)
             : base(factory, reader, CreateSentenceClone())
         {
             // LUCENENET specific - DBBI_AVAILABLE removed because ICU always has a dictionary-based BreakIterator

             lock (syncLock)
             {
                 BreakIterator wordIterator = (BreakIterator)proto.Clone();
                 wordBreaker = new ThaiWordBreaker(wordIterator);
             }

             termAtt = AddAttribute<ICharTermAttribute>();
             offsetAtt = AddAttribute<IOffsetAttribute>();
         }

         /// <summary>
         /// Builds the word iterator prototype for the "th" culture under the shared lock.
         /// </summary>
         private static BreakIterator LoadProto()
         {
             lock (syncLock)
             {
                 return BreakIterator.GetWordInstance(new CultureInfo("th"));
             }
         }

         /// <summary>
         /// Builds the sentence iterator prototype for the invariant culture under the shared lock.
         /// </summary>
         private static BreakIterator LoadSentenceProto()
         {
             lock (syncLock)
             {
                 return BreakIterator.GetSentenceInstance(CultureInfo.InvariantCulture);
             }
         }

         /// <summary>
         /// Clones the sentence prototype under the shared lock; used for the base constructor call.
         /// </summary>
         private static BreakIterator CreateSentenceClone()
         {
             lock (syncLock)
             {
                 return (BreakIterator)sentenceProto.Clone();
             }
         }

         /// <summary>
         /// Runs the base <c>Reset()</c> while holding the shared lock.
         /// </summary>
         public override void Reset()
         {
             lock (syncLock)
             {
                 base.Reset();
             }
         }

         /// <summary>
         /// Runs the base <c>CaptureState()</c> while holding the shared lock.
         /// </summary>
         public override State CaptureState()
         {
             lock (syncLock)
             {
                 return base.CaptureState();
             }
         }

         /// <summary>
         /// Records the bounds of the next sentence and hands its text to the word breaker.
         /// </summary>
         /// <param name="sentenceStart">Start index of the sentence in <c>m_buffer</c>.</param>
         /// <param name="sentenceEnd">End index of the sentence in <c>m_buffer</c>.</param>
         protected override void SetNextSentence(int sentenceStart, int sentenceEnd)
         {
             lock (syncLock)
             {
                 this.sentenceStart = sentenceStart;
                 this.sentenceEnd = sentenceEnd;

                 wrapper.SetText(m_buffer, sentenceStart, sentenceEnd - sentenceStart);

                 // The word breaker works on a string copy, so its indexes are relative to the sentence start.
                 string sentence = new string(wrapper.Text, wrapper.Start, wrapper.Length);
                 wordBreaker.SetText(sentence);
             }
         }

         /// <summary>
         /// Advances to the next word of the current sentence and fills the term and offset attributes.
         /// </summary>
         /// <returns><c>true</c> if a word was produced; <c>false</c> once the word breaker is exhausted.</returns>
         protected override bool IncrementWord()
         {
             lock (syncLock)
             {
                 int wordStart = wordBreaker.Current;
                 if (wordStart == BreakIterator.Done)
                 {
                     return false; // BreakIterator exhausted
                 }

                 // Walk forward segment by segment until one begins with a letter or digit;
                 // segments that begin with anything else are not tokens.
                 int wordEnd = wordBreaker.Next();
                 while (wordEnd != BreakIterator.Done)
                 {
                     int firstCodePoint = Character.CodePointAt(m_buffer, sentenceStart + wordStart, sentenceEnd);
                     if (Character.IsLetterOrDigit(firstCodePoint))
                     {
                         break;
                     }

                     wordStart = wordEnd;
                     wordEnd = wordBreaker.Next();
                 }

                 if (wordEnd == BreakIterator.Done)
                 {
                     return false; // BreakIterator exhausted
                 }

                 ClearAttributes();
                 termAtt.CopyBuffer(m_buffer, sentenceStart + wordStart, wordEnd - wordStart);

                 int correctedStart = CorrectOffset(m_offset + sentenceStart + wordStart);
                 int correctedEnd = CorrectOffset(m_offset + sentenceStart + wordEnd);
                 offsetAtt.SetOffset(correctedStart, correctedEnd);
                 return true;
             }
         }
     }

     /// <summary>
     /// LUCENENET specific class to patch the behavior of the ICU BreakIterator to match the behavior of the JDK.
     /// Corrects the breaking of words by finding transitions between Thai and non-Thai
     /// characters.
     /// </summary>
     internal class ThaiWordBreaker
     {
         private readonly BreakIterator wordBreaker;
         private string text;

         // Boundaries found inside the segment most recently returned by the wrapped iterator.
         // While non-empty, the head of the queue is the current boundary.
         private readonly Queue<int> transitions = new Queue<int>();
         private static readonly Regex thaiPattern = new Regex(@"\p{IsThai}+", RegexOptions.Compiled | RegexOptions.CultureInvariant);

         /// <summary>
         /// Wraps the given word <see cref="BreakIterator"/>.
         /// </summary>
         /// <param name="wordBreaker">The iterator to wrap.</param>
         /// <exception cref="ArgumentNullException"><paramref name="wordBreaker"/> is <c>null</c>.</exception>
         public ThaiWordBreaker(BreakIterator wordBreaker)
         {
             this.wordBreaker = wordBreaker ?? throw new ArgumentNullException(nameof(wordBreaker));
         }

         /// <summary>
         /// Sets the text to be broken into words and restarts iteration on it.
         /// </summary>
         /// <param name="text">The text to analyze.</param>
         public void SetText(string text)
         {
             this.text = text;

             // Boundaries still queued for the previous text (possible when the caller stopped
             // iterating early) are indexes into that text; drop them so that Current and Next()
             // never report them against the new text.
             transitions.Clear();

             wordBreaker.SetText(text);
         }

         /// <summary>
         /// Gets the current boundary: the head of the pending transition queue if there is one,
         /// otherwise the current position of the wrapped iterator.
         /// </summary>
         public int Current
         {
             get
             {
                 if (transitions.Count > 0)
                     return transitions.Peek();

                 return wordBreaker.Current;
             }
         }

         /// <summary>
         /// Advances to the next boundary.
         /// </summary>
         /// <returns>
         /// The next pending transition if any remain, otherwise the result of advancing the
         /// wrapped iterator (which may be <see cref="BreakIterator.Done"/>).
         /// </returns>
         public int Next()
         {
             // The head of the queue is the boundary we are leaving.
             if (transitions.Count > 0)
                 transitions.Dequeue();

             if (transitions.Count > 0)
                 return transitions.Peek();

             return GetNext();
         }

         /// <summary>
         /// Advances the wrapped iterator by one segment and scans that segment for places where
         /// a Thai letter is directly followed by a non-Thai letter or vice versa. Each such place
         /// is queued as an extra boundary, followed by the end of the segment.
         /// </summary>
         /// <returns>
         /// The first queued boundary if the segment contained a transition, otherwise the
         /// boundary returned by the wrapped iterator.
         /// </returns>
         private int GetNext()
         {
             bool isThai, isNonThai;
             bool prevWasThai = false, prevWasNonThai = false;
             int prev = wordBreaker.Current;
             int current = wordBreaker.Next();

             if (current != BreakIterator.Done && current - prev > 0)
             {
                 string toMatch;
                 // Find all of the transitions between Thai and non-Thai characters and digits
                 for (int i = prev; i < current; i++)
                 {
                     // Remember where this code point starts: for a surrogate pair, i is advanced
                     // to the low surrogate below, and a boundary must never fall inside the pair.
                     int codePointStart = i;
                     char high = text[i];

                     // Account for surrogate pairs
                     if (char.IsHighSurrogate(high) && i + 1 < current && char.IsLowSurrogate(text[i + 1]))
                     {
                         toMatch = string.Empty + high + text[++i];
                     }
                     else
                     {
                         toMatch = string.Empty + high;
                     }

                     if (char.IsLetter(toMatch, 0)) // Always break letters apart from digits to match the JDK
                     {
                         isThai = thaiPattern.IsMatch(toMatch);
                         isNonThai = !isThai;
                     }
                     else
                     {
                         // Non-letters are neither Thai nor non-Thai, so they never cause a transition.
                         isThai = false;
                         isNonThai = false;
                     }

                     if ((prevWasThai && isNonThai) ||
                         (prevWasNonThai && isThai))
                     {
                         transitions.Enqueue(codePointStart);
                     }

                     // record the values for comparison with the next loop
                     prevWasThai = isThai;
                     prevWasNonThai = isNonThai;
                 }

                 if (transitions.Count > 0)
                 {
                     // The segment end is the last boundary to report for this segment.
                     transitions.Enqueue(current);
                     return transitions.Peek();
                 }
             }

             return current;
         }
     }
 }
 #endif
	#if FEATURE_BREAKITERATOR
	using ICU4N.Text;
	using J2N;
	using Lucene.Net.Analysis.TokenAttributes;
	using Lucene.Net.Analysis.Util;
	using System;
	using System.Collections.Generic;
	using System.Globalization;
	using System.IO;
	using System.Text.RegularExpressions;

	namespace Lucene.Net.Analysis.Th
	{
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	// LUCENENET NOTE: Removing this notice from the doc comment because it is not relevant for our purposes.

	// <para>WARNING: this tokenizer may not be supported by all JREs.
	// It is known to work with Sun/Oracle and Harmony JREs.
	// If your application needs to be fully portable, consider using ICUTokenizer instead,
	// which uses an ICU Thai BreakIterator that will always be available.
	// </para>

	/// <summary>
	/// Tokenizer that uses <see cref="BreakIterator"/> to tokenize Thai text.
	/// <para/>
	/// A sentence <see cref="BreakIterator"/> is handed to the base class, and every sentence
	/// reported through <see cref="SetNextSentence(int, int)"/> is split into words by a
	/// <see cref="ThaiWordBreaker"/> wrapping a Thai word <see cref="BreakIterator"/>.
	/// </summary>
	public class ThaiTokenizer : SegmentingTokenizerBase
	{
	    // LUCENENET specific - all BreakIterator access in this class is serialized on this lock as a
	    // workaround until BreakIterator is made thread safe (LUCENENET TODO: TO REVERT).
	    // NOTE: keep this declared before the prototype fields below; their initializers lock on it
	    // and static field initializers run in textual order.
	    private static readonly object syncLock = new object();

	    // LUCENENET specific - DBBI_AVAILABLE removed because ICU always has a dictionary-based BreakIterator

	    /// <summary>
	    /// Prototype word iterator (Thai culture); cloned once per tokenizer instance.
	    /// </summary>
	    private static readonly BreakIterator proto = LoadProto();

	    /// <summary>
	    /// Prototype sentence iterator (invariant culture); cloned once per tokenizer instance.
	    /// </summary>
	    private static readonly BreakIterator sentenceProto = LoadSentenceProto();

	    private readonly ThaiWordBreaker wordBreaker;
	    private readonly CharArrayIterator wrapper = Analysis.Util.CharArrayIterator.NewWordInstance();

	    // Bounds (within m_buffer) of the sentence currently being split into words.
	    private int sentenceStart;
	    private int sentenceEnd;

	    private readonly ICharTermAttribute termAtt;
	    private readonly IOffsetAttribute offsetAtt;

	    /// <summary>
	    /// Creates a new <see cref="ThaiTokenizer"/> using the default attribute factory.
	    /// </summary>
	    /// <param name="reader">The input to tokenize.</param>
	    public ThaiTokenizer(TextReader reader)
	        : this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, reader)
	    {
	    }

	    /// <summary>
	    /// Creates a new <see cref="ThaiTokenizer"/>, supplying the <see cref="Lucene.Net.Util.AttributeSource.AttributeFactory"/>.
	    /// </summary>
	    /// <param name="factory">The attribute factory passed on to the base class.</param>
	    /// <param name="reader">The input to tokenize.</param>
	    public ThaiTokenizer(AttributeFactory factory, TextReader reader)
	        : base(factory, reader, CreateSentenceClone())
	    {
	        // LUCENENET specific - DBBI_AVAILABLE removed because ICU always has a dictionary-based BreakIterator

	        lock (syncLock)
	        {
	            BreakIterator wordIterator = (BreakIterator)proto.Clone();
	            wordBreaker = new ThaiWordBreaker(wordIterator);
	        }

	        termAtt = AddAttribute<ICharTermAttribute>();
	        offsetAtt = AddAttribute<IOffsetAttribute>();
	    }

	    /// <summary>
	    /// Builds the word iterator prototype for the "th" culture under the shared lock.
	    /// </summary>
	    private static BreakIterator LoadProto()
	    {
	        lock (syncLock)
	        {
	            return BreakIterator.GetWordInstance(new CultureInfo("th"));
	        }
	    }

	    /// <summary>
	    /// Builds the sentence iterator prototype for the invariant culture under the shared lock.
	    /// </summary>
	    private static BreakIterator LoadSentenceProto()
	    {
	        lock (syncLock)
	        {
	            return BreakIterator.GetSentenceInstance(CultureInfo.InvariantCulture);
	        }
	    }

	    /// <summary>
	    /// Clones the sentence prototype under the shared lock; used for the base constructor call.
	    /// </summary>
	    private static BreakIterator CreateSentenceClone()
	    {
	        lock (syncLock)
	        {
	            return (BreakIterator)sentenceProto.Clone();
	        }
	    }

	    /// <summary>
	    /// Runs the base <c>Reset()</c> while holding the shared lock.
	    /// </summary>
	    public override void Reset()
	    {
	        lock (syncLock)
	        {
	            base.Reset();
	        }
	    }

	    /// <summary>
	    /// Runs the base <c>CaptureState()</c> while holding the shared lock.
	    /// </summary>
	    public override State CaptureState()
	    {
	        lock (syncLock)
	        {
	            return base.CaptureState();
	        }
	    }

	    /// <summary>
	    /// Records the bounds of the next sentence and hands its text to the word breaker.
	    /// </summary>
	    /// <param name="sentenceStart">Start index of the sentence in <c>m_buffer</c>.</param>
	    /// <param name="sentenceEnd">End index of the sentence in <c>m_buffer</c>.</param>
	    protected override void SetNextSentence(int sentenceStart, int sentenceEnd)
	    {
	        lock (syncLock)
	        {
	            this.sentenceStart = sentenceStart;
	            this.sentenceEnd = sentenceEnd;

	            wrapper.SetText(m_buffer, sentenceStart, sentenceEnd - sentenceStart);

	            // The word breaker works on a string copy, so its indexes are relative to the sentence start.
	            string sentence = new string(wrapper.Text, wrapper.Start, wrapper.Length);
	            wordBreaker.SetText(sentence);
	        }
	    }

	    /// <summary>
	    /// Advances to the next word of the current sentence and fills the term and offset attributes.
	    /// </summary>
	    /// <returns><c>true</c> if a word was produced; <c>false</c> once the word breaker is exhausted.</returns>
	    protected override bool IncrementWord()
	    {
	        lock (syncLock)
	        {
	            int wordStart = wordBreaker.Current;
	            if (wordStart == BreakIterator.Done)
	            {
	                return false; // BreakIterator exhausted
	            }

	            // Walk forward segment by segment until one begins with a letter or digit;
	            // segments that begin with anything else are not tokens.
	            int wordEnd = wordBreaker.Next();
	            while (wordEnd != BreakIterator.Done)
	            {
	                int firstCodePoint = Character.CodePointAt(m_buffer, sentenceStart + wordStart, sentenceEnd);
	                if (Character.IsLetterOrDigit(firstCodePoint))
	                {
	                    break;
	                }

	                wordStart = wordEnd;
	                wordEnd = wordBreaker.Next();
	            }

	            if (wordEnd == BreakIterator.Done)
	            {
	                return false; // BreakIterator exhausted
	            }

	            ClearAttributes();
	            termAtt.CopyBuffer(m_buffer, sentenceStart + wordStart, wordEnd - wordStart);

	            int correctedStart = CorrectOffset(m_offset + sentenceStart + wordStart);
	            int correctedEnd = CorrectOffset(m_offset + sentenceStart + wordEnd);
	            offsetAtt.SetOffset(correctedStart, correctedEnd);
	            return true;
	        }
	    }
	}

	/// <summary>
	/// LUCENENET specific class to patch the behavior of the ICU BreakIterator to match the behavior of the JDK.
	/// Corrects the breaking of words by finding transitions between Thai and non-Thai
	/// characters.
	/// </summary>
	internal class ThaiWordBreaker
	{
	    private readonly BreakIterator wordBreaker;
	    private string text;

	    // Boundaries found inside the segment most recently returned by the wrapped iterator.
	    // While non-empty, the head of the queue is the current boundary.
	    private readonly Queue<int> transitions = new Queue<int>();
	    private static readonly Regex thaiPattern = new Regex(@"\p{IsThai}+", RegexOptions.Compiled | RegexOptions.CultureInvariant);

	    /// <summary>
	    /// Wraps the given word <see cref="BreakIterator"/>.
	    /// </summary>
	    /// <param name="wordBreaker">The iterator to wrap.</param>
	    /// <exception cref="ArgumentNullException"><paramref name="wordBreaker"/> is <c>null</c>.</exception>
	    public ThaiWordBreaker(BreakIterator wordBreaker)
	    {
	        this.wordBreaker = wordBreaker ?? throw new ArgumentNullException(nameof(wordBreaker));
	    }

	    /// <summary>
	    /// Sets the text to be broken into words and restarts iteration on it.
	    /// </summary>
	    /// <param name="text">The text to analyze.</param>
	    public void SetText(string text)
	    {
	        this.text = text;

	        // Boundaries still queued for the previous text (possible when the caller stopped
	        // iterating early) are indexes into that text; drop them so that Current and Next()
	        // never report them against the new text.
	        transitions.Clear();

	        wordBreaker.SetText(text);
	    }

	    /// <summary>
	    /// Gets the current boundary: the head of the pending transition queue if there is one,
	    /// otherwise the current position of the wrapped iterator.
	    /// </summary>
	    public int Current
	    {
	        get
	        {
	            if (transitions.Count > 0)
	                return transitions.Peek();

	            return wordBreaker.Current;
	        }
	    }

	    /// <summary>
	    /// Advances to the next boundary.
	    /// </summary>
	    /// <returns>
	    /// The next pending transition if any remain, otherwise the result of advancing the
	    /// wrapped iterator (which may be <see cref="BreakIterator.Done"/>).
	    /// </returns>
	    public int Next()
	    {
	        // The head of the queue is the boundary we are leaving.
	        if (transitions.Count > 0)
	            transitions.Dequeue();

	        if (transitions.Count > 0)
	            return transitions.Peek();

	        return GetNext();
	    }

	    /// <summary>
	    /// Advances the wrapped iterator by one segment and scans that segment for places where
	    /// a Thai letter is directly followed by a non-Thai letter or vice versa. Each such place
	    /// is queued as an extra boundary, followed by the end of the segment.
	    /// </summary>
	    /// <returns>
	    /// The first queued boundary if the segment contained a transition, otherwise the
	    /// boundary returned by the wrapped iterator.
	    /// </returns>
	    private int GetNext()
	    {
	        bool isThai, isNonThai;
	        bool prevWasThai = false, prevWasNonThai = false;
	        int prev = wordBreaker.Current;
	        int current = wordBreaker.Next();

	        if (current != BreakIterator.Done && current - prev > 0)
	        {
	            string toMatch;
	            // Find all of the transitions between Thai and non-Thai characters and digits
	            for (int i = prev; i < current; i++)
	            {
	                // Remember where this code point starts: for a surrogate pair, i is advanced
	                // to the low surrogate below, and a boundary must never fall inside the pair.
	                int codePointStart = i;
	                char high = text[i];

	                // Account for surrogate pairs
	                if (char.IsHighSurrogate(high) && i + 1 < current && char.IsLowSurrogate(text[i + 1]))
	                {
	                    toMatch = string.Empty + high + text[++i];
	                }
	                else
	                {
	                    toMatch = string.Empty + high;
	                }

	                if (char.IsLetter(toMatch, 0)) // Always break letters apart from digits to match the JDK
	                {
	                    isThai = thaiPattern.IsMatch(toMatch);
	                    isNonThai = !isThai;
	                }
	                else
	                {
	                    // Non-letters are neither Thai nor non-Thai, so they never cause a transition.
	                    isThai = false;
	                    isNonThai = false;
	                }

	                if ((prevWasThai && isNonThai) ||
	                    (prevWasNonThai && isThai))
	                {
	                    transitions.Enqueue(codePointStart);
	                }

	                // record the values for comparison with the next loop
	                prevWasThai = isThai;
	                prevWasNonThai = isNonThai;
	            }

	            if (transitions.Count > 0)
	            {
	                // The segment end is the last boundary to report for this segment.
	                transitions.Enqueue(current);
	                return transitions.Peek();
	            }
	        }

	        return current;
	    }
	}
	}
	#endif