// blob: 7e1d7a11529abf040f547f48b4e784727048fd91 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ja;
import org.apache.lucene.analysis.CharFilter;
import org.apache.lucene.analysis.util.RollingCharBuffer;
import java.io.IOException;
import java.io.Reader;
/**
* Normalizes Japanese horizontal iteration marks (odoriji) to their expanded form.
* <p>
* Sequences of iteration marks are supported. In case an illegal sequence of iteration
* marks is encountered, the implementation emits the illegal source character as-is
* without considering its script. For example, with input "?ゝ", we get
* "??" even though the question mark isn't hiragana.
* </p>
* <p>
* Note that a full stop punctuation character "。" (U+3002) cannot be iterated
* (see below). Iteration marks themselves can be emitted in case they are illegal,
* i.e. if they go back past the beginning of the character stream.
* </p>
* <p>
* The implementation buffers input until a full stop punctuation character (U+3002)
* or EOF is reached in order to not keep a copy of the character stream in memory.
* Vertical iteration marks, which are even rarer than horizontal iteration marks in
* contemporary Japanese, are unsupported.
* </p>
*/
public class JapaneseIterationMarkCharFilter extends CharFilter {

  /** Normalize kanji iteration marks by default */
  public static final boolean NORMALIZE_KANJI_DEFAULT = true;

  /** Normalize kana iteration marks by default */
  public static final boolean NORMALIZE_KANA_DEFAULT = true;

  private static final char KANJI_ITERATION_MARK = '\u3005'; // 々
  private static final char HIRAGANA_ITERATION_MARK = '\u309d'; // ゝ
  private static final char HIRAGANA_VOICED_ITERATION_MARK = '\u309e'; // ゞ
  private static final char KATAKANA_ITERATION_MARK = '\u30fd'; // ヽ
  private static final char KATAKANA_VOICED_ITERATION_MARK = '\u30fe'; // ヾ
  private static final char FULL_STOP_PUNCTUATION = '\u3002'; // 。

  /** First character covered by the hiragana dakuten map: か (U+304B) */
  private static final char HIRAGANA_MAP_OFFSET = '\u304b';

  /** First character covered by the katakana dakuten map: カ (U+30AB) */
  private static final char KATAKANA_MAP_OFFSET = '\u30ab';

  // Hiragana to dakuten map (lookup using code point - 0x304b (か)).
  // Characters without a voiced form map to themselves.
  private static final char[] h2d = new char[50];

  // Katakana to dakuten map (lookup using code point - 0x30ab (カ)).
  // Derived from the hiragana map in the static initializer below.
  private static final char[] k2d = new char[h2d.length];

  private final RollingCharBuffer buffer = new RollingCharBuffer();
  private final boolean normalizeKanji;
  private final boolean normalizeKana;

  // Position in the rolling buffer of the next character to be returned by read()
  private int bufferPosition = 0;
  // Size of the iteration mark span currently being expanded
  private int iterationMarksSpanSize = 0;
  // Position right after the last character that may not serve as a source for a new span
  private int iterationMarkSpanEndPosition = 0;

  static {
    // Hiragana dakuten map
    h2d[0] = '\u304c'; // か => が
    h2d[1] = '\u304c'; // が => が
    h2d[2] = '\u304e'; // き => ぎ
    h2d[3] = '\u304e'; // ぎ => ぎ
    h2d[4] = '\u3050'; // く => ぐ
    h2d[5] = '\u3050'; // ぐ => ぐ
    h2d[6] = '\u3052'; // け => げ
    h2d[7] = '\u3052'; // げ => げ
    h2d[8] = '\u3054'; // こ => ご
    h2d[9] = '\u3054'; // ご => ご
    h2d[10] = '\u3056'; // さ => ざ
    h2d[11] = '\u3056'; // ざ => ざ
    h2d[12] = '\u3058'; // し => じ
    h2d[13] = '\u3058'; // じ => じ
    h2d[14] = '\u305a'; // す => ず
    h2d[15] = '\u305a'; // ず => ず
    h2d[16] = '\u305c'; // せ => ぜ
    h2d[17] = '\u305c'; // ぜ => ぜ
    h2d[18] = '\u305e'; // そ => ぞ
    h2d[19] = '\u305e'; // ぞ => ぞ
    h2d[20] = '\u3060'; // た => だ
    h2d[21] = '\u3060'; // だ => だ
    h2d[22] = '\u3062'; // ち => ぢ
    h2d[23] = '\u3062'; // ぢ => ぢ
    h2d[24] = '\u3063'; // っ (maps to itself)
    h2d[25] = '\u3065'; // つ => づ
    h2d[26] = '\u3065'; // づ => づ
    h2d[27] = '\u3067'; // て => で
    h2d[28] = '\u3067'; // で => で
    h2d[29] = '\u3069'; // と => ど
    h2d[30] = '\u3069'; // ど => ど
    h2d[31] = '\u306a'; // な (maps to itself)
    h2d[32] = '\u306b'; // に (maps to itself)
    h2d[33] = '\u306c'; // ぬ (maps to itself)
    h2d[34] = '\u306d'; // ね (maps to itself)
    h2d[35] = '\u306e'; // の (maps to itself)
    h2d[36] = '\u3070'; // は => ば
    h2d[37] = '\u3070'; // ば => ば
    h2d[38] = '\u3071'; // ぱ (maps to itself)
    h2d[39] = '\u3073'; // ひ => び
    h2d[40] = '\u3073'; // び => び
    h2d[41] = '\u3074'; // ぴ (maps to itself)
    h2d[42] = '\u3076'; // ふ => ぶ
    h2d[43] = '\u3076'; // ぶ => ぶ
    h2d[44] = '\u3077'; // ぷ (maps to itself)
    h2d[45] = '\u3079'; // へ => べ
    h2d[46] = '\u3079'; // べ => べ
    h2d[47] = '\u307a'; // ぺ (maps to itself)
    h2d[48] = '\u307c'; // ほ => ぼ
    h2d[49] = '\u307c'; // ぼ => ぼ

    // Make katakana dakuten map from hiragana map by shifting every entry by (カ - か)
    int codePointDifference = KATAKANA_MAP_OFFSET - HIRAGANA_MAP_OFFSET;
    for (int i = 0; i < k2d.length; i++) {
      k2d[i] = (char) (h2d[i] + codePointDifference);
    }
  }

  /**
   * Constructor. Normalizes both kanji and kana iteration marks by default.
   *
   * @param input char stream
   */
  public JapaneseIterationMarkCharFilter(Reader input) {
    this(input, NORMALIZE_KANJI_DEFAULT, NORMALIZE_KANA_DEFAULT);
  }

  /**
   * Constructor
   *
   * @param input char stream
   * @param normalizeKanji indicates whether kanji iteration marks should be normalized
   * @param normalizeKana indicates whether kana iteration marks should be normalized
   */
  public JapaneseIterationMarkCharFilter(Reader input, boolean normalizeKanji, boolean normalizeKana) {
    super(input);
    this.normalizeKanji = normalizeKanji;
    this.normalizeKana = normalizeKana;
    buffer.reset(input);
  }

  /**
   * Reads up to {@code length} normalized characters into the given array by
   * repeatedly calling {@link #read()}.
   *
   * @param buffer destination array (note: shadows the internal rolling buffer field)
   * @param offset index in the destination at which to start storing characters
   * @param length maximum number of characters to read
   * @return number of characters read, 0 if {@code length} is zero, or -1 if the
   *         end of the input was reached before any character could be read
   * @throws IOException If there is a low-level I/O error.
   */
  @Override
  public int read(char[] buffer, int offset, int length) throws IOException {
    // A zero-length request is not an end-of-stream condition
    if (length == 0) {
      return 0;
    }

    int read = 0;
    for (int i = offset; i < offset + length; i++) {
      int c = read();
      if (c == -1) {
        break;
      }
      buffer[i] = (char) c;
      read++;
    }
    return read == 0 ? -1 : read;
  }

  /**
   * Reads a single character, replacing an iteration mark by the character it repeats.
   *
   * @return the next (possibly normalized) character, or -1 at the end of the input
   * @throws IOException If there is a low-level I/O error.
   */
  @Override
  public int read() throws IOException {
    int ic = buffer.get(bufferPosition);

    // End of input
    if (ic == -1) {
      buffer.freeBefore(bufferPosition);
      return ic;
    }

    char c = (char) ic;

    // Skip surrogate pair characters: an iteration mark span may not reach back over them
    if (Character.isHighSurrogate(c) || Character.isLowSurrogate(c)) {
      iterationMarkSpanEndPosition = bufferPosition + 1;
    }

    // Free rolling buffer on full stop; a span may not reach back over the full stop either
    if (c == FULL_STOP_PUNCTUATION) {
      buffer.freeBefore(bufferPosition);
      iterationMarkSpanEndPosition = bufferPosition + 1;
    }

    // Normalize iteration mark
    if (isIterationMark(c)) {
      c = normalizeIterationMark(c);
    }

    bufferPosition++;
    return c;
  }

  /**
   * Normalizes the iteration mark character c
   *
   * @param c iteration mark character to normalize
   * @return normalized iteration mark
   * @throws IOException If there is a low-level I/O error.
   */
  private char normalizeIterationMark(char c) throws IOException {
    // Case 1: Inside an iteration mark span
    if (bufferPosition < iterationMarkSpanEndPosition) {
      return normalize(sourceCharacter(bufferPosition, iterationMarksSpanSize), c);
    }

    // Case 2: New iteration mark span starts where the previous one ended, which is illegal
    if (bufferPosition == iterationMarkSpanEndPosition) {
      // Emit the illegal iteration mark and increase end position to indicate that we can't
      // start a new span on the next position either
      iterationMarkSpanEndPosition++;
      return c;
    }

    // Case 3: New iteration mark span
    iterationMarksSpanSize = nextIterationMarkSpanSize();
    iterationMarkSpanEndPosition = bufferPosition + iterationMarksSpanSize;
    return normalize(sourceCharacter(bufferPosition, iterationMarksSpanSize), c);
  }

  /**
   * Finds the number of subsequent next iteration marks
   *
   * @return number of iteration marks starting at the current buffer position, restricted
   *         so that the span's source characters do not start before the previous end position
   * @throws IOException If there is a low-level I/O error.
   */
  private int nextIterationMarkSpanSize() throws IOException {
    int spanSize = 0;
    for (int i = bufferPosition; ; i++) {
      int next = buffer.get(i);
      if (next == -1 || !isIterationMark((char) next)) {
        break;
      }
      spanSize++;
    }

    // Restrict span size so that we don't go past the previous end position
    if (bufferPosition - spanSize < iterationMarkSpanEndPosition) {
      spanSize = bufferPosition - iterationMarkSpanEndPosition;
    }
    return spanSize;
  }

  /**
   * Returns the source character for a given position and iteration mark span size
   *
   * @param position buffer position (should not exceed bufferPosition)
   * @param spanSize iteration mark span size
   * @return source character
   * @throws IOException If there is a low-level I/O error.
   */
  private char sourceCharacter(int position, int spanSize) throws IOException {
    return (char) buffer.get(position - spanSize);
  }

  /**
   * Normalize a character
   *
   * @param c character to normalize
   * @param m repetition mark referring to c
   * @return normalized character - return c on illegal iteration marks
   */
  private char normalize(char c, char m) {
    if (isHiraganaIterationMark(m)) {
      return normalizedHiragana(c, m);
    }
    if (isKatakanaIterationMark(m)) {
      return normalizedKatakana(c, m);
    }
    // If m is not kana and we are to normalize it, we assume it is kanji and simply return it
    return c;
  }

  /**
   * Normalize hiragana character. The unvoiced mark repeats the unvoiced form of c
   * (the character right before a voiced character); the voiced mark repeats the
   * voiced form of c if one exists.
   *
   * @param c hiragana character
   * @param m repetition mark referring to c
   * @return normalized character - return c on illegal iteration marks
   */
  private char normalizedHiragana(char c, char m) {
    switch (m) {
      case HIRAGANA_ITERATION_MARK:
        return isHiraganaDakuten(c) ? (char) (c - 1) : c;
      case HIRAGANA_VOICED_ITERATION_MARK:
        return lookupHiraganaDakuten(c);
      default:
        return c;
    }
  }

  /**
   * Normalize katakana character. The unvoiced mark repeats the unvoiced form of c
   * (the character right before a voiced character); the voiced mark repeats the
   * voiced form of c if one exists.
   *
   * @param c katakana character
   * @param m repetition mark referring to c
   * @return normalized character - return c on illegal iteration marks
   */
  private char normalizedKatakana(char c, char m) {
    switch (m) {
      case KATAKANA_ITERATION_MARK:
        return isKatakanaDakuten(c) ? (char) (c - 1) : c;
      case KATAKANA_VOICED_ITERATION_MARK:
        return lookupKatakanaDakuten(c);
      default:
        return c;
    }
  }

  /**
   * Iteration mark character predicate
   *
   * @param c character to test
   * @return true if c is an iteration mark character. Otherwise false.
   */
  private boolean isIterationMark(char c) {
    return isKanjiIterationMark(c) || isHiraganaIterationMark(c) || isKatakanaIterationMark(c);
  }

  /**
   * Hiragana iteration mark character predicate
   *
   * @param c character to test
   * @return true if kana normalization is enabled and c is a hiragana iteration mark
   *         character. Otherwise false.
   */
  private boolean isHiraganaIterationMark(char c) {
    return normalizeKana
        && (c == HIRAGANA_ITERATION_MARK || c == HIRAGANA_VOICED_ITERATION_MARK);
  }

  /**
   * Katakana iteration mark character predicate
   *
   * @param c character to test
   * @return true if kana normalization is enabled and c is a katakana iteration mark
   *         character. Otherwise false.
   */
  private boolean isKatakanaIterationMark(char c) {
    return normalizeKana
        && (c == KATAKANA_ITERATION_MARK || c == KATAKANA_VOICED_ITERATION_MARK);
  }

  /**
   * Kanji iteration mark character predicate
   *
   * @param c character to test
   * @return true if kanji normalization is enabled and c is a kanji iteration mark
   *         character. Otherwise false.
   */
  private boolean isKanjiIterationMark(char c) {
    return normalizeKanji && c == KANJI_ITERATION_MARK;
  }

  /**
   * Look up hiragana dakuten
   *
   * @param c character to look up
   * @return hiragana dakuten variant of c or c itself if no dakuten variant exists
   */
  private char lookupHiraganaDakuten(char c) {
    return lookup(c, h2d, HIRAGANA_MAP_OFFSET);
  }

  /**
   * Look up katakana dakuten. Only full-width katakana are supported.
   *
   * @param c character to look up
   * @return katakana dakuten variant of c or c itself if no dakuten variant exists
   */
  private char lookupKatakanaDakuten(char c) {
    return lookup(c, k2d, KATAKANA_MAP_OFFSET);
  }

  /**
   * Hiragana dakuten predicate
   *
   * @param c character to check
   * @return true if c is a hiragana dakuten and otherwise false
   */
  private boolean isHiraganaDakuten(char c) {
    return isDakuten(c, h2d, HIRAGANA_MAP_OFFSET);
  }

  /**
   * Katakana dakuten predicate
   *
   * @param c character to check
   * @return true if c is a katakana dakuten and otherwise false
   */
  private boolean isKatakanaDakuten(char c) {
    return isDakuten(c, k2d, KATAKANA_MAP_OFFSET);
  }

  /**
   * Predicate indicating if a character is the voiced (dakuten) variant of the
   * character directly preceding it in the map.
   * <p>
   * Checking only that c maps to itself is not sufficient: characters without a
   * voiced form (e.g. な or っ) and handakuten characters (e.g. ぱ) also map to
   * themselves, and subtracting one from them would yield an unrelated character.
   * </p>
   *
   * @param c character to check
   * @param map dakuten map
   * @param offset code point of the first character covered by map
   * @return true if the character right before c is mapped to c by map
   */
  private static boolean isDakuten(char c, char[] map, char offset) {
    return c > offset && inside(c, map, offset) && map[c - 1 - offset] == c;
  }

  /**
   * Looks up a character in dakuten map and returns the dakuten variant if it exists.
   * Otherwise return the character being looked up itself
   *
   * @param c character to look up
   * @param map dakuten map
   * @param offset code point offset from c
   * @return mapped character or c if no mapping exists
   */
  private static char lookup(char c, char[] map, char offset) {
    return inside(c, map, offset) ? map[c - offset] : c;
  }

  /**
   * Predicate indicating if the lookup character is within dakuten map range
   *
   * @param c character to look up
   * @param map dakuten map
   * @param offset code point offset from c
   * @return true if c is mapped by map and otherwise false
   */
  private static boolean inside(char c, char[] map, char offset) {
    return c >= offset && c < offset + map.length;
  }

  @Override
  protected int correct(int currentOff) {
    return currentOff; // this filter doesn't change the length of strings
  }
}