blob: 03f5cce057ef9aee8d3f8bdf04d8a3811d42906a [file] [log] [blame]
#if FEATURE_BREAKITERATOR
using ICU4N.Text;
using Lucene.Net.Analysis.Core;
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Analysis.Util;
using Lucene.Net.Util;
using System;
using System.Globalization;
using System.Text.RegularExpressions;
namespace Lucene.Net.Analysis.Th
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// LUCENENET NOTE: Removing this notice from the doc comment because it is not relevant for our purposes.
//<para>WARNING: this filter may not be supported by all JREs.
// It is known to work with Sun/Oracle and Harmony JREs.
// If your application needs to be fully portable, consider using ICUTokenizer instead,
// which uses an ICU Thai BreakIterator that will always be available.
// </para>
/// <summary>
/// <see cref="TokenFilter"/> that uses a <see cref="BreakIterator"/> to break each
/// token that is Thai into separate token(s) for each Thai word.
/// <para>Please note: from matchVersion 3.1 onwards, this filter no longer lowercases non-Thai text.
/// <see cref="ThaiAnalyzer"/> will insert a <see cref="LowerCaseFilter"/> before this filter
/// so the behaviour of the Analyzer does not change. With version 3.1, the filter handles
/// position increments correctly.
/// </para>
/// </summary>
/// <remarks>Deprecated: use <see cref="ThaiTokenizer"/> instead.</remarks>
[Obsolete("Use ThaiTokenizer instead.")]
public sealed class ThaiWordFilter : TokenFilter
{
    // LUCENENET specific - DBBI_AVAILABLE removed because ICU always has a dictionary-based BreakIterator
    private static readonly BreakIterator proto = BreakIterator.GetWordInstance(new CultureInfo("th"));

    // Bounds of the Unicode "Thai" block, i.e. the range matched by the regex class \p{IsThai}.
    private const char ThaiBlockStart = '\u0E00';
    private const char ThaiBlockEnd = '\u0E7F';

    // Each filter instance works on its own clone of the shared prototype iterator.
    private readonly ThaiWordBreaker breaker = new ThaiWordBreaker((BreakIterator)proto.Clone());
    private readonly CharArrayIterator charIterator = CharArrayIterator.NewWordInstance();

    // True for matchVersion >= 3.1: follow-up words get a position increment of 1.
    private readonly bool handlePosIncr;

    private readonly ICharTermAttribute termAtt;
    private readonly IOffsetAttribute offsetAtt;
    private readonly IPositionIncrementAttribute posAtt;

    // Snapshot of the Thai token currently being split; created lazily in IncrementToken().
    private AttributeSource clonedToken = null;
    private ICharTermAttribute clonedTermAtt = null;
    private IOffsetAttribute clonedOffsetAtt = null;

    // True while the snapshot may still contain words that have not been emitted.
    private bool hasMoreTokensInClone = false;
    private bool hasIllegalOffsets = false; // only if the length changed before this filter

    /// <summary>
    /// Creates a new <see cref="ThaiWordFilter"/> with the specified match version.
    /// </summary>
    /// <param name="matchVersion">
    /// Compatibility version. For versions before <see cref="LuceneVersion.LUCENE_31"/> the
    /// input is wrapped in a <see cref="LowerCaseFilter"/> and position increments of
    /// follow-up words are left untouched.
    /// </param>
    /// <param name="input">The token stream to split.</param>
    public ThaiWordFilter(LuceneVersion matchVersion, TokenStream input)
        : base(matchVersion.OnOrAfter(LuceneVersion.LUCENE_31) ? input : new LowerCaseFilter(matchVersion, input))
    {
        // LUCENENET specific - DBBI_AVAILABLE removed because ICU always has a dictionary-based BreakIterator
        handlePosIncr = matchVersion.OnOrAfter(LuceneVersion.LUCENE_31);
        termAtt = AddAttribute<ICharTermAttribute>();
        offsetAtt = AddAttribute<IOffsetAttribute>();
        posAtt = AddAttribute<IPositionIncrementAttribute>();
    }

    /// <summary>
    /// Returns <c>true</c> when <paramref name="c"/> lies in the Unicode Thai block
    /// (U+0E00..U+0E7F). Equivalent to matching the single character against
    /// <c>\p{IsThai}</c>, but without allocating a string or invoking the regex engine.
    /// </summary>
    private static bool IsThai(char c)
    {
        return c >= ThaiBlockStart && c <= ThaiBlockEnd;
    }

    /// <summary>
    /// Advances to the next token. Tokens whose first character is not Thai (and empty
    /// tokens) are passed through unchanged; a Thai token is split into one token per
    /// segment reported by the word breaker.
    /// </summary>
    /// <returns>
    /// <c>true</c> if a token was produced; <c>false</c> if the input is exhausted or the
    /// breaker reported no boundary for a freshly buffered token.
    /// </returns>
    public override bool IncrementToken()
    {
        if (hasMoreTokensInClone)
        {
            // Continue splitting the buffered Thai token: the next word spans [start, end).
            int start = breaker.Current;
            int end = breaker.Next();
            if (end != BreakIterator.Done)
            {
                // Restore every attribute of the original token, then narrow the term to this word.
                clonedToken.CopyTo(this);
                termAtt.CopyBuffer(clonedTermAtt.Buffer, start, end - start);
                if (hasIllegalOffsets)
                {
                    offsetAtt.SetOffset(clonedOffsetAtt.StartOffset, clonedOffsetAtt.EndOffset);
                }
                else
                {
                    offsetAtt.SetOffset(clonedOffsetAtt.StartOffset + start, clonedOffsetAtt.StartOffset + end);
                }
                if (handlePosIncr)
                {
                    posAtt.PositionIncrement = 1;
                }
                return true;
            }
            // Buffered token fully consumed; fall through and pull the next input token.
            hasMoreTokensInClone = false;
        }

        if (!m_input.IncrementToken())
        {
            return false;
        }

        // Only tokens that start with a Thai character are split.
        if (termAtt.Length == 0 || !IsThai(termAtt[0]))
        {
            return true;
        }

        hasMoreTokensInClone = true;

        // if length by start + end offsets doesn't match the term text then assume
        // this is a synonym and don't adjust the offsets.
        hasIllegalOffsets = offsetAtt.EndOffset - offsetAtt.StartOffset != termAtt.Length;

        // we lazy init the cloned token, as in ctor not all attributes may be added
        if (clonedToken == null)
        {
            clonedToken = CloneAttributes();
            clonedTermAtt = clonedToken.GetAttribute<ICharTermAttribute>();
            clonedOffsetAtt = clonedToken.GetAttribute<IOffsetAttribute>();
        }
        else
        {
            this.CopyTo(clonedToken);
        }

        // reinit CharacterIterator
        charIterator.SetText(clonedTermAtt.Buffer, 0, clonedTermAtt.Length);
        breaker.SetText(new string(charIterator.Text, charIterator.Start, charIterator.Length));

        int firstEnd = breaker.Next();
        if (firstEnd != BreakIterator.Done)
        {
            // The first word reuses the live attributes: just truncate the term to its end.
            termAtt.Length = firstEnd;
            if (hasIllegalOffsets)
            {
                offsetAtt.SetOffset(clonedOffsetAtt.StartOffset, clonedOffsetAtt.EndOffset);
            }
            else
            {
                offsetAtt.SetOffset(clonedOffsetAtt.StartOffset, clonedOffsetAtt.StartOffset + firstEnd);
            }
            // position increment keeps as it is for first token
            return true;
        }
        return false;
    }

    /// <summary>
    /// Resets the underlying stream and discards any partially split token and its snapshot.
    /// </summary>
    public override void Reset()
    {
        base.Reset();
        hasMoreTokensInClone = false;
        clonedToken = null;
        clonedTermAtt = null;
        clonedOffsetAtt = null;
    }
}
}
#endif