// blob: cf602f6ba033ae882be7acc447f6b7fe395e2745 [file] [log] [blame]
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Analysis.Util;
namespace Lucene.Net.Analysis.Cjk
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// A <see cref="TokenFilter"/> that normalizes CJK width differences:
/// <list type="bullet">
///     <item><description>Folds fullwidth ASCII variants (0xFF01-0xFF5E) into the equivalent basic latin</description></item>
///     <item><description>Folds halfwidth Katakana variants (0xFF65-0xFF9F) into the equivalent kana</description></item>
/// </list>
/// <para>
/// A halfwidth voice mark (0xFF9E or 0xFF9F) that follows a kana with a
/// composed voiced form is merged into that kana and removed from the term;
/// otherwise it is replaced by the combining mark 0x3099 or 0x309A.
/// </para>
/// <para>
/// NOTE: this filter can be viewed as a (practical) subset of NFKC/NFKD
/// Unicode normalization. See the normalization support in the ICU package
/// for full normalization.
/// </para>
/// </summary>
public sealed class CJKWidthFilter : TokenFilter
{
    // Assigned once in the constructor; the term buffer is rewritten in place.
    private readonly ICharTermAttribute termAtt;

    /// <summary>
    /// halfwidth kana mappings: 0xFF65-0xFF9F, indexed by <c>ch - 0xFF65</c>
    /// <para/>
    /// note: 0xFF9E and 0xFF9F are only mapped to 0x3099 and 0x309A
    /// as a fallback when they cannot properly combine with a preceding
    /// character into a composed form.
    /// </summary>
    private static readonly char[] KANA_NORM = new char[] {
        (char)0x30fb, (char)0x30f2, (char)0x30a1, (char)0x30a3, (char)0x30a5, (char)0x30a7, (char)0x30a9, (char)0x30e3, (char)0x30e5,
        (char)0x30e7, (char)0x30c3, (char)0x30fc, (char)0x30a2, (char)0x30a4, (char)0x30a6, (char)0x30a8, (char)0x30aa, (char)0x30ab,
        (char)0x30ad, (char)0x30af, (char)0x30b1, (char)0x30b3, (char)0x30b5, (char)0x30b7, (char)0x30b9, (char)0x30bb, (char)0x30bd,
        (char)0x30bf, (char)0x30c1, (char)0x30c4, (char)0x30c6, (char)0x30c8, (char)0x30ca, (char)0x30cb, (char)0x30cc, (char)0x30cd,
        (char)0x30ce, (char)0x30cf, (char)0x30d2, (char)0x30d5, (char)0x30d8, (char)0x30db, (char)0x30de, (char)0x30df, (char)0x30e0,
        (char)0x30e1, (char)0x30e2, (char)0x30e4, (char)0x30e6, (char)0x30e8, (char)0x30e9, (char)0x30ea, (char)0x30eb, (char)0x30ec,
        (char)0x30ed, (char)0x30ef, (char)0x30f3, (char)0x3099, (char)0x309A
    };

    /// <summary>
    /// Creates a new <see cref="CJKWidthFilter"/> wrapping <paramref name="input"/>.
    /// </summary>
    /// <param name="input">The token stream whose terms are width-normalized.</param>
    public CJKWidthFilter(TokenStream input)
        : base(input)
    {
        termAtt = AddAttribute<ICharTermAttribute>();
    }

    /// <summary>
    /// Advances the wrapped stream by one token and normalizes the width of
    /// the characters in its term buffer in place.
    /// </summary>
    /// <returns>
    /// <c>true</c> if the wrapped stream produced a token (which has then been
    /// normalized); <c>false</c> if the wrapped stream is exhausted.
    /// </returns>
    public override bool IncrementToken()
    {
        if (!m_input.IncrementToken())
        {
            return false;
        }

        char[] text = termAtt.Buffer;
        int length = termAtt.Length;

        for (int i = 0; i < length; i++)
        {
            char ch = text[i];
            if (ch >= 0xFF01 && ch <= 0xFF5E)
            {
                // Fullwidth ASCII variants: a fixed offset from basic latin.
                text[i] = (char)(ch - 0xFEE0);
            }
            else if (ch >= 0xFF65 && ch <= 0xFF9F)
            {
                // Halfwidth Katakana variants
                bool isVoiceMark = ch == 0xFF9E || ch == 0xFF9F;
                if (isVoiceMark && i > 0 && Combine(text, i, ch))
                {
                    // The mark was folded into text[i - 1]; drop it from the
                    // buffer (Delete's return value is taken as the new length)
                    // and step back so the character that slides into slot i
                    // is examined on the next iteration.
                    length = StemmerUtil.Delete(text, i, length);
                    i--;
                }
                else
                {
                    text[i] = KANA_NORM[ch - 0xFF65];
                }
            }
        }

        termAtt.Length = length;
        return true;
    }

    /// <summary>
    /// kana combining diffs for the voiced mark (0xFF9E): 0x30A6-0x30FD,
    /// indexed by <c>kana - 0x30A6</c>. A non-zero entry is the amount added
    /// to the kana to obtain its composed form; zero means no composed form.
    /// </summary>
    private static readonly sbyte[] KANA_COMBINE_VOICED = new sbyte[] {
        78, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
        0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1,
        0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1
    };

    /// <summary>
    /// kana combining diffs for the half-voiced mark (0xFF9F): 0x30A6-0x30FD,
    /// indexed by <c>kana - 0x30A6</c>. A non-zero entry is the amount added
    /// to the kana to obtain its composed form; zero means no composed form.
    /// </summary>
    private static readonly sbyte[] KANA_COMBINE_HALF_VOICED = new sbyte[] {
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 2,
        0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    };

    /// <summary>
    /// Tries to merge the voice mark <paramref name="ch"/> at
    /// <paramref name="pos"/> into the character immediately before it.
    /// </summary>
    /// <param name="text">Term buffer; <c>text[pos - 1]</c> is overwritten on success.</param>
    /// <param name="pos">Index of the voice mark; must be greater than zero.</param>
    /// <param name="ch">The voice mark: 0xFF9F selects the half-voiced table, anything else the voiced table.</param>
    /// <returns>true if we successfully combined the voice mark</returns>
    private static bool Combine(char[] text, int pos, char ch)
    {
        char prev = text[pos - 1];
        if (prev < 0x30A6 || prev > 0x30FD)
        {
            // Preceding character is outside the range the diff tables cover.
            return false;
        }

        sbyte[] diffs = (ch == 0xFF9F) ? KANA_COMBINE_HALF_VOICED : KANA_COMBINE_VOICED;
        int diff = diffs[prev - 0x30A6];
        if (diff == 0)
        {
            // No composed form for this kana/mark pair; leave the buffer untouched.
            return false;
        }

        text[pos - 1] = (char)(prev + diff);
        return true;
    }
}
}