blob: d8625d9f3b33f25213ac25a6816f62737ccd9c73 [file] [log] [blame]
using ICU4NET;
using ICU4NETExtension;
using Lucene.Net.Analysis.Tokenattributes;
using Lucene.Net.Analysis.Util;
using Lucene.Net.Support;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text.RegularExpressions;
namespace Lucene.Net.Analysis.Th
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// Tokenizer that use <seealso cref="BreakIterator"/> to tokenize Thai text.
/// <para>WARNING: this tokenizer may not be supported by all JREs.
/// It is known to work with Sun/Oracle and Harmony JREs.
/// If your application needs to be fully portable, consider using ICUTokenizer instead,
/// which uses an ICU Thai BreakIterator that will always be available.
/// </para>
/// </summary>
public class ThaiTokenizer : SegmentingTokenizerBase
{
/// <summary>
/// True if the JRE supports a working dictionary-based breakiterator for Thai.
/// If this is false, this tokenizer will not work at all!
/// </summary>
public static readonly bool DBBI_AVAILABLE;
private static readonly BreakIterator proto = BreakIterator.CreateWordInstance(Locale.GetUS());
static ThaiTokenizer()
{
// check that we have a working dictionary-based break iterator for thai
proto.SetText("ภาษาไทย");
DBBI_AVAILABLE = proto.IsBoundary(4);
}
private readonly ThaiWordBreaker wordBreaker;
private readonly CharArrayIterator wrapper = CharArrayIterator.NewWordInstance();
internal int sentenceStart;
internal int sentenceEnd;
private readonly ICharTermAttribute termAtt;
private readonly IOffsetAttribute offsetAtt;
/// <summary>
/// Creates a new ThaiTokenizer </summary>
public ThaiTokenizer(TextReader reader)
: this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, reader)
{
}
/// <summary>
/// Creates a new ThaiTokenizer, supplying the AttributeFactory </summary>
public ThaiTokenizer(AttributeFactory factory, TextReader reader)
: base(factory, reader, BreakIterator.CreateSentenceInstance(Locale.GetUS()))
{
if (!DBBI_AVAILABLE)
{
throw new System.NotSupportedException("This JRE does not have support for Thai segmentation");
}
wordBreaker = new ThaiWordBreaker(BreakIterator.CreateWordInstance(Locale.GetUS()));
termAtt = AddAttribute<ICharTermAttribute>();
offsetAtt = AddAttribute<IOffsetAttribute>();
}
protected internal override void SetNextSentence(int sentenceStart, int sentenceEnd)
{
this.sentenceStart = sentenceStart;
this.sentenceEnd = sentenceEnd;
wrapper.SetText(buffer, sentenceStart, sentenceEnd - sentenceStart);
wordBreaker.SetText(new string(wrapper.Text, wrapper.Start, wrapper.Length));
}
protected internal override bool IncrementWord()
{
int start = wordBreaker.Current();
if (start == BreakIterator.DONE)
{
return false; // BreakIterator exhausted
}
// find the next set of boundaries, skipping over non-tokens
int end = wordBreaker.Next();
while (end != BreakIterator.DONE && !char.IsLetterOrDigit((char)Character.CodePointAt(buffer, sentenceStart + start, sentenceEnd)))
{
start = end;
end = wordBreaker.Next();
}
if (end == BreakIterator.DONE)
{
return false; // BreakIterator exhausted
}
ClearAttributes();
termAtt.CopyBuffer(buffer, sentenceStart + start, end - start);
offsetAtt.SetOffset(CorrectOffset(offset + sentenceStart + start), CorrectOffset(offset + sentenceStart + end));
return true;
}
}
/// <summary>
/// LUCENENET specific class to patch the behavior of the ICU BreakIterator.
/// Corrects the breaking of words by finding transitions between Thai and non-Thai
/// characters.
///
/// This logic assumes that the Java BreakIterator also breaks up Thai numerals from
/// Arabic numerals (1, 2, 3, etc.). That is, it assumes the first test below passes
/// and the second test fails in Lucene (not attempted).
///
/// ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET);
/// AssertAnalyzesTo(analyzer, "๑๒๓456", new string[] { "๑๒๓", "456" });
/// AssertAnalyzesTo(analyzer, "๑๒๓456", new string[] { "๑๒๓456" });
/// </summary>
internal class ThaiWordBreaker
{
private readonly BreakIterator wordBreaker;
private string text;
private readonly IList<int> transitions = new List<int>();
private readonly static Regex thaiPattern = new Regex(@"\p{IsThai}", RegexOptions.Compiled | RegexOptions.CultureInvariant);
public ThaiWordBreaker(BreakIterator wordBreaker)
{
if (wordBreaker == null)
{
throw new ArgumentNullException("wordBreaker");
}
this.wordBreaker = wordBreaker;
}
public void SetText(string text)
{
this.text = text;
wordBreaker.SetText(text);
}
public int Current()
{
if (transitions.Any())
{
return transitions.First();
}
return wordBreaker.Current();
}
public int Next()
{
if (transitions.Any())
{
transitions.RemoveAt(0);
}
if (transitions.Any())
{
return transitions.First();
}
return GetNext();
}
private int GetNext()
{
bool isThai = false, isNonThai = false;
bool prevWasThai = false, prevWasNonThai = false;
int prev = wordBreaker.Current();
int current = wordBreaker.Next();
if (current != BreakIterator.DONE && current - prev > 0)
{
// Find all of the transitions between Thai and non-Thai characters and digits
for (int i = prev; i < current; i++)
{
char c = text[i];
isThai = thaiPattern.IsMatch(c.ToString());
isNonThai = char.IsLetterOrDigit(c) && !isThai;
if ((prevWasThai && isNonThai) ||
(prevWasNonThai && isThai))
{
transitions.Add(i);
}
// record the values for comparison with the next loop
prevWasThai = isThai;
prevWasNonThai = isNonThai;
}
if (transitions.Any())
{
transitions.Add(current);
return transitions.First();
}
}
return current;
}
}
}