blob: ac781db9db116e11b15d0cce9bfb7960c2ec7004 [file] [log] [blame]
using Lucene.Net.Analysis.Util;
using Lucene.Net.Diagnostics;
using System.IO;
namespace Lucene.Net.Analysis.Ja
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// Normalizes Japanese horizontal iteration marks (odoriji) to their expanded form.
/// </summary>
/// <remarks>
/// Sequences of iteration marks are supported. In case an illegal sequence of iteration
/// marks is encountered, the implementation emits the illegal source character as-is
/// without considering its script. For example, with input "&#63;&#12445;", we get
/// "&#63;&#63;" even though "&#63;" isn't hiragana.
/// <para/>
/// Note that a full stop punctuation character "&#x3002;" (U+3002) can not be iterated
/// (see below). Iteration marks themselves can be emitted in case they are illegal,
/// i.e. if they go back past the beginning of the character stream.
/// <para/>
/// The implementation buffers input until a full stop punctuation character (U+3002)
/// or EOF is reached in order to not keep a copy of the character stream in memory.
/// Vertical iteration marks, which are even rarer than horizontal iteration marks in
/// contemporary Japanese, are unsupported.
/// </remarks>
public class JapaneseIterationMarkCharFilter : CharFilter
{
/// <summary>Normalize kanji iteration marks by default</summary>
public static readonly bool NORMALIZE_KANJI_DEFAULT = true;
/// <summary>Normalize kana iteration marks by default</summary>
public static readonly bool NORMALIZE_KANA_DEFAULT = true;
private const char KANJI_ITERATION_MARK = '\u3005'; // 々
private const char HIRAGANA_ITERATION_MARK = '\u309d'; // ゝ
private const char HIRAGANA_VOICED_ITERATION_MARK = '\u309e'; // ゞ
private const char KATAKANA_ITERATION_MARK = '\u30fd'; // ヽ
private const char KATAKANA_VOICED_ITERATION_MARK = '\u30fe'; // ヾ
private const char FULL_STOP_PUNCTUATION = '\u3002'; // 。
// Hiragana to dakuten map (lookup using code point - 0x30ab(か)*/
private static char[] h2d = new char[50];
// Katakana to dakuten map (lookup using code point - 0x30ab(カ
private static char[] k2d = new char[50];
private readonly RollingCharBuffer buffer = new RollingCharBuffer();
private int bufferPosition = 0;
private int iterationMarksSpanSize = 0;
private int iterationMarkSpanEndPosition = 0;
private bool normalizeKanji;
private bool normalizeKana;
static JapaneseIterationMarkCharFilter()
{
// Hiragana dakuten map
h2d[0] = '\u304c'; // か => が
h2d[1] = '\u304c'; // が => が
h2d[2] = '\u304e'; // き => ぎ
h2d[3] = '\u304e'; // ぎ => ぎ
h2d[4] = '\u3050'; // く => ぐ
h2d[5] = '\u3050'; // ぐ => ぐ
h2d[6] = '\u3052'; // け => げ
h2d[7] = '\u3052'; // げ => げ
h2d[8] = '\u3054'; // こ => ご
h2d[9] = '\u3054'; // ご => ご
h2d[10] = '\u3056'; // さ => ざ
h2d[11] = '\u3056'; // ざ => ざ
h2d[12] = '\u3058'; // し => じ
h2d[13] = '\u3058'; // じ => じ
h2d[14] = '\u305a'; // す => ず
h2d[15] = '\u305a'; // ず => ず
h2d[16] = '\u305c'; // せ => ぜ
h2d[17] = '\u305c'; // ぜ => ぜ
h2d[18] = '\u305e'; // そ => ぞ
h2d[19] = '\u305e'; // ぞ => ぞ
h2d[20] = '\u3060'; // た => だ
h2d[21] = '\u3060'; // だ => だ
h2d[22] = '\u3062'; // ち => ぢ
h2d[23] = '\u3062'; // ぢ => ぢ
h2d[24] = '\u3063';
h2d[25] = '\u3065'; // つ => づ
h2d[26] = '\u3065'; // づ => づ
h2d[27] = '\u3067'; // て => で
h2d[28] = '\u3067'; // で => で
h2d[29] = '\u3069'; // と => ど
h2d[30] = '\u3069'; // ど => ど
h2d[31] = '\u306a';
h2d[32] = '\u306b';
h2d[33] = '\u306c';
h2d[34] = '\u306d';
h2d[35] = '\u306e';
h2d[36] = '\u3070'; // は => ば
h2d[37] = '\u3070'; // ば => ば
h2d[38] = '\u3071';
h2d[39] = '\u3073'; // ひ => び
h2d[40] = '\u3073'; // び => び
h2d[41] = '\u3074';
h2d[42] = '\u3076'; // ふ => ぶ
h2d[43] = '\u3076'; // ぶ => ぶ
h2d[44] = '\u3077';
h2d[45] = '\u3079'; // へ => べ
h2d[46] = '\u3079'; // べ => べ
h2d[47] = '\u307a';
h2d[48] = '\u307c'; // ほ => ぼ
h2d[49] = '\u307c'; // ぼ => ぼ
// Make katakana dakuten map from hiragana map
char codePointDifference = (char)('\u30ab' - '\u304b'); // カ - か
if (Debugging.AssertsEnabled) Debugging.Assert(h2d.Length == k2d.Length);
for (int i = 0; i < k2d.Length; i++)
{
k2d[i] = (char)(h2d[i] + codePointDifference);
}
}
/// <summary>
/// Constructor. Normalizes both kanji and kana iteration marks by default.
/// </summary>
/// <param name="input">Char stream.</param>
public JapaneseIterationMarkCharFilter(TextReader input)
: this(input, NORMALIZE_KANJI_DEFAULT, NORMALIZE_KANA_DEFAULT)
{
}
/// <summary>
/// Constructor
/// </summary>
/// <param name="input">Char stream.</param>
/// <param name="normalizeKanji">Indicates whether kanji iteration marks should be normalized.</param>
/// <param name="normalizeKana">Indicates whether kana iteration marks should be normalized.</param>
public JapaneseIterationMarkCharFilter(TextReader input, bool normalizeKanji, bool normalizeKana)
: base(input)
{
this.normalizeKanji = normalizeKanji;
this.normalizeKana = normalizeKana;
buffer.Reset(input);
}
/// <summary>
/// Reads a specified maximum number of characters from the current reader and writes the data to a buffer, beginning at the specified index.
/// </summary>
/// <param name="buffer">
/// When this method returns, contains the specified character array with the values between index and (index + count - 1)
/// replaced by the characters read from the current source.</param>
/// <param name="offset">
/// The position in buffer at which to begin writing.
/// </param>
/// <param name="length">
/// The maximum number of characters to read. If the end of the reader is reached before the specified number of characters is
/// read into the buffer, the method returns.
/// </param>
/// <returns>
/// The number of characters that have been read. The number will be less than or equal to count, depending on whether the data is
/// available within the reader. This method returns 0 (zero) if it is called when no more characters are left to read.
/// </returns>
public override int Read(char[] buffer, int offset, int length)
{
int read = 0;
for (int i = offset; i < offset + length; i++)
{
int c = Read();
if (c == -1)
{
break;
}
buffer[i] = (char)c;
read++;
}
return read == 0 ? -1 : read;
}
/// <summary>
/// Reads the next character from the text reader and advances the character position by one character.
/// </summary>
/// <returns>The next character from the text reader, or -1 if no more characters are available.</returns>
public override int Read()
{
int ic = buffer.Get(bufferPosition);
// End of input
if (ic == -1)
{
buffer.FreeBefore(bufferPosition);
return ic;
}
char c = (char)ic;
// Skip surrogate pair characters
if (char.IsHighSurrogate(c) || char.IsLowSurrogate(c))
{
iterationMarkSpanEndPosition = bufferPosition + 1;
}
// Free rolling buffer on full stop
if (c == FULL_STOP_PUNCTUATION)
{
buffer.FreeBefore(bufferPosition);
iterationMarkSpanEndPosition = bufferPosition + 1;
}
// Normalize iteration mark
if (IsIterationMark(c))
{
c = NormalizeIterationMark(c);
}
bufferPosition++;
return c;
}
/// <summary>
/// Normalizes the iteration mark character <paramref name="c"/>
/// </summary>
/// <param name="c">Iteration mark character to normalize.</param>
/// <returns>Normalized iteration mark.</returns>
/// <exception cref="IOException">If there is a low-level I/O error.</exception>
private char NormalizeIterationMark(char c)
{
// Case 1: Inside an iteration mark span
if (bufferPosition < iterationMarkSpanEndPosition)
{
return Normalize(SourceCharacter(bufferPosition, iterationMarksSpanSize), c);
}
// Case 2: New iteration mark spans starts where the previous one ended, which is illegal
if (bufferPosition == iterationMarkSpanEndPosition)
{
// Emit the illegal iteration mark and increase end position to indicate that we can't
// start a new span on the next position either
iterationMarkSpanEndPosition++;
return c;
}
// Case 3: New iteration mark span
iterationMarksSpanSize = NextIterationMarkSpanSize();
iterationMarkSpanEndPosition = bufferPosition + iterationMarksSpanSize;
return Normalize(SourceCharacter(bufferPosition, iterationMarksSpanSize), c);
}
/// <summary>
/// Finds the number of subsequent next iteration marks
/// </summary>
/// <returns>Number of iteration marks starting at the current buffer position.</returns>
/// <exception cref="IOException">If there is a low-level I/O error.</exception>
private int NextIterationMarkSpanSize()
{
int spanSize = 0;
for (int i = bufferPosition; buffer.Get(i) != -1 && IsIterationMark((char)(buffer.Get(i))); i++)
{
spanSize++;
}
// Restrict span size so that we don't go past the previous end position
if (bufferPosition - spanSize < iterationMarkSpanEndPosition)
{
spanSize = bufferPosition - iterationMarkSpanEndPosition;
}
return spanSize;
}
/// <summary>
/// Returns the source character for a given position and iteration mark span size.
/// </summary>
/// <param name="position">Buffer position (should not exceed bufferPosition).</param>
/// <param name="spanSize">Iteration mark span size.</param>
/// <returns>Source character.</returns>
/// <exception cref="IOException">If there is a low-level I/O error.</exception>
private char SourceCharacter(int position, int spanSize)
{
return (char)buffer.Get(position - spanSize);
}
/// <summary>
/// Normalize a character.
/// </summary>
/// <param name="c">Character to normalize.</param>
/// <param name="m">Repetition mark referring to <paramref name="c"/>.</param>
/// <returns>Normalized character - return c on illegal iteration marks.</returns>
private char Normalize(char c, char m)
{
if (IsHiraganaIterationMark(m))
{
return NormalizedHiragana(c, m);
}
if (IsKatakanaIterationMark(m))
{
return NormalizedKatakana(c, m);
}
return c; // If m is not kana and we are to normalize it, we assume it is kanji and simply return it
}
/// <summary>
/// Normalize hiragana character.
/// </summary>
/// <param name="c">Hiragana character.</param>
/// <param name="m">Repetition mark referring to <paramref name="c"/>.</param>
/// <returns>Normalized character - return <paramref name="c"/> on illegal iteration marks.</returns>
private char NormalizedHiragana(char c, char m)
{
switch (m)
{
case HIRAGANA_ITERATION_MARK:
return IsHiraganaDakuten(c) ? (char)(c - 1) : c;
case HIRAGANA_VOICED_ITERATION_MARK:
return LookupHiraganaDakuten(c);
default:
return c;
}
}
/// <summary>
/// Normalize katakana character.
/// </summary>
/// <param name="c">Katakana character.</param>
/// <param name="m">Repetition mark referring to <paramref name="c"/>.</param>
/// <returns>Normalized character - return <paramref name="c"/> on illegal iteration marks.</returns>
private char NormalizedKatakana(char c, char m)
{
switch (m)
{
case KATAKANA_ITERATION_MARK:
return IsKatakanaDakuten(c) ? (char)(c - 1) : c;
case KATAKANA_VOICED_ITERATION_MARK:
return LookupKatakanaDakuten(c);
default:
return c;
}
}
/// <summary>
/// Iteration mark character predicate.
/// </summary>
/// <param name="c">Character to test.</param>
/// <returns><c>true</c> if <paramref name="c"/> is an iteration mark character. Otherwise <c>false</c>.</returns>
private bool IsIterationMark(char c)
{
return IsKanjiIterationMark(c) || IsHiraganaIterationMark(c) || IsKatakanaIterationMark(c);
}
/// <summary>
/// Hiragana iteration mark character predicate.
/// </summary>
/// <param name="c">Character to test.</param>
/// <returns><c>true</c> if <paramref name="c"/> is a hiragana iteration mark character. Otherwise <c>false</c>.</returns>
private bool IsHiraganaIterationMark(char c)
{
if (normalizeKana)
{
return c == HIRAGANA_ITERATION_MARK || c == HIRAGANA_VOICED_ITERATION_MARK;
}
else
{
return false;
}
}
/// <summary>
/// Katakana iteration mark character predicate.
/// </summary>
/// <param name="c">Character to test.</param>
/// <returns><c>true</c> if c is a katakana iteration mark character. Otherwise <c>false</c>.</returns>
private bool IsKatakanaIterationMark(char c)
{
if (normalizeKana)
{
return c == KATAKANA_ITERATION_MARK || c == KATAKANA_VOICED_ITERATION_MARK;
}
else
{
return false;
}
}
/// <summary>
/// Kanji iteration mark character predicate.
/// </summary>
/// <param name="c">Character to test.</param>
/// <returns><c>true</c> if c is a kanji iteration mark character. Otherwise <c>false</c>.</returns>
private bool IsKanjiIterationMark(char c)
{
if (normalizeKanji)
{
return c == KANJI_ITERATION_MARK;
}
else
{
return false;
}
}
/// <summary>
/// Look up hiragana dakuten.
/// </summary>
/// <param name="c">Character to look up.</param>
/// <returns>Hiragana dakuten variant of c or c itself if no dakuten variant exists.</returns>
private char LookupHiraganaDakuten(char c)
{
return Lookup(c, h2d, '\u304b'); // Code point is for か
}
/// <summary>
/// Look up katakana dakuten. Only full-width katakana are supported.
/// </summary>
/// <param name="c">Character to look up.</param>
/// <returns>Katakana dakuten variant of <paramref name="c"/> or <paramref name="c"/> itself if no dakuten variant exists.</returns>
private char LookupKatakanaDakuten(char c)
{
return Lookup(c, k2d, '\u30ab'); // Code point is for カ
}
/// <summary>
/// Hiragana dakuten predicate.
/// </summary>
/// <param name="c">Character to check.</param>
/// <returns><c>true</c> if c is a hiragana dakuten and otherwise <c>false</c>.</returns>
private bool IsHiraganaDakuten(char c)
{
return Inside(c, h2d, '\u304b') && c == LookupHiraganaDakuten(c);
}
/// <summary>
/// Katakana dakuten predicate.
/// </summary>
/// <param name="c">Character to check.</param>
/// <returns><c>true</c> if c is a hiragana dakuten and otherwise <c>false</c>.</returns>
private bool IsKatakanaDakuten(char c)
{
return Inside(c, k2d, '\u30ab') && c == LookupKatakanaDakuten(c);
}
/// <summary>
/// Looks up a character in dakuten map and returns the dakuten variant if it exists.
/// Otherwise return the character being looked up itself.
/// </summary>
/// <param name="c">Character to look up.</param>
/// <param name="map">Dakuten map.</param>
/// <param name="offset">Code point offset from <paramref name="c"/>.</param>
/// <returns>Mapped character or <paramref name="c"/> if no mapping exists.</returns>
private char Lookup(char c, char[] map, char offset)
{
if (!Inside(c, map, offset))
{
return c;
}
else
{
return map[c - offset];
}
}
/// <summary>
/// Predicate indicating if the lookup character is within dakuten map range.
/// </summary>
/// <param name="c">Character to look up.</param>
/// <param name="map">Dakuten map.</param>
/// <param name="offset">Code point offset from <paramref name="c"/>.</param>
/// <returns><c>true</c> if <paramref name="c"/> is mapped by map and otherwise <c>false</c>.</returns>
private bool Inside(char c, char[] map, char offset)
{
return c >= offset && c < offset + map.Length;
}
protected override int Correct(int currentOff)
{
return currentOff; // this filter doesn't change the length of strings
}
}
}