blob: 9f4c1d555df46b73e53710953357c22c790e276b [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ja;
import java.io.IOException;
import java.math.BigDecimal;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
/**
* A {@link TokenFilter} that normalizes Japanese numbers (kansūji) to regular Arabic
* decimal numbers in half-width characters.
* <p>
* Japanese numbers are often written using a combination of kanji and Arabic numbers with
* various kinds of punctuation. For example, 3.2千 means 3200. This filter does this kind
* of normalization and allows a search for 3200 to match 3.2千 in text, but can also be
* used to make range facets based on the normalized numbers and so on.
* <p>
* Notice that this analyzer uses a token composition scheme and relies on punctuation
* tokens being found in the token stream. Please make sure your {@link JapaneseTokenizer}
* has {@code discardPunctuation} set to false. In case punctuation characters, such as the
* full-width full stop (U+FF0E FULLWIDTH FULL STOP), are removed from the token stream, this
* filter would find input tokens 3 and 2千 and give outputs 3 and 2000 instead of 3200, which is
* likely not the intended result. If you want to remove punctuation characters from your
* index that are not part of normalized numbers, add a
* {@link org.apache.lucene.analysis.StopFilter} with the punctuation you wish to
* remove after {@link JapaneseNumberFilter} in your analyzer chain.
* <p>
* Below are some examples of normalizations this filter supports. The input is untokenized
* text and the result is the single term attribute emitted for the input.
* <ul>
* <li>〇〇七 becomes 7</li>
* <li>一〇〇〇 becomes 1000</li>
* <li>三千2百2十三 becomes 3223</li>
* <li>兆六百万五千一 becomes 1000006005001</li>
* <li>3.2千 becomes 3200</li>
* <li>1.2万345.67 becomes 12345.67</li>
* <li>4,647.100 becomes 4647.1</li>
* <li>15,7 becomes 157 (be aware of this weakness)</li>
* </ul>
* <p>
* Tokens preceded by a token with {@link PositionIncrementAttribute} of zero are left
* untouched and emitted as-is.
* <p>
* This filter does not use any part-of-speech information for its normalization and
* the motivation for this is to also support n-grammed token streams in the future.
* <p>
* This filter may in some cases normalize tokens that are not numbers in their context.
* For example, 田中京一 is a name and means Tanaka Kyōichi, but 京一 (Kyōichi) out of
* context can strictly speaking also represent the number 10000000000000001. This filter
* respects the {@link KeywordAttribute}, which can be used to prevent specific
* normalizations from happening.
* <p>
* Also notice that token attributes such as
* {@link org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute},
* {@link org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute},
* {@link org.apache.lucene.analysis.ja.tokenattributes.InflectionAttribute} and
* {@link org.apache.lucene.analysis.ja.tokenattributes.BaseFormAttribute} are left
* unchanged and will inherit the values of the last token used to compose the normalized
* number and can be wrong. Hence, for 10万 (10000), we will have
* {@link org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute}
* set to マン. This is a known issue and is subject to a future improvement.
* <p>
* Japanese formal numbers (daiji), accounting numbers and decimal fractions are currently
* not supported.
*/
public class JapaneseNumberFilter extends TokenFilter {

  private final CharTermAttribute termAttr = addAttribute(CharTermAttribute.class);
  private final OffsetAttribute offsetAttr = addAttribute(OffsetAttribute.class);
  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
  private final PositionIncrementAttribute posIncrAttr = addAttribute(PositionIncrementAttribute.class);
  private final PositionLengthAttribute posLengthAttr = addAttribute(PositionLengthAttribute.class);

  /** Sentinel stored in {@link #numerals} for every character that is not a kanji digit. */
  private static final char NO_NUMERAL = Character.MAX_VALUE;

  /** Lookup table indexed by UTF-16 code unit: digit value 0-9 for kanji digits, else {@link #NO_NUMERAL}. */
  private static final char[] numerals;

  /** Lookup table indexed by UTF-16 code unit: power of ten for kanji magnitude characters, else 0. */
  private static final char[] exponents;

  /** Token read past while composing a number; emitted on the next call to {@link #incrementToken()}. */
  private State state;

  /** Accumulates the text of the tokens that make up the number currently being composed. */
  private StringBuilder numeral;

  /** Number of upcoming input tokens to pass through unmodified. */
  private int fallThroughTokens;

  /** Set once the wrapped stream has returned false from incrementToken(). */
  private boolean exhausted = false;

  static {
    numerals = new char[0x10000];
    java.util.Arrays.fill(numerals, NO_NUMERAL);
    numerals['〇'] = 0; // 〇 U+3007 0
    numerals['一'] = 1; // 一 U+4E00 1
    numerals['二'] = 2; // 二 U+4E8C 2
    numerals['三'] = 3; // 三 U+4E09 3
    numerals['四'] = 4; // 四 U+56DB 4
    numerals['五'] = 5; // 五 U+4E94 5
    numerals['六'] = 6; // 六 U+516D 6
    numerals['七'] = 7; // 七 U+4E03 7
    numerals['八'] = 8; // 八 U+516B 8
    numerals['九'] = 9; // 九 U+4E5D 9

    // A freshly allocated char[] is already zero-filled, and 0 means "not an exponent character"
    exponents = new char[0x10000];
    exponents['十'] = 1;  // 十 U+5341 10
    exponents['百'] = 2;  // 百 U+767E 100
    exponents['千'] = 3;  // 千 U+5343 1,000
    exponents['万'] = 4;  // 万 U+4E07 10,000
    exponents['億'] = 8;  // 億 U+5104 100,000,000
    exponents['兆'] = 12; // 兆 U+5146 1,000,000,000,000
    exponents['京'] = 16; // 京 U+4EAC 10,000,000,000,000,000
    exponents['垓'] = 20; // 垓 U+5793 100,000,000,000,000,000,000
  }

  /**
   * Creates a filter that normalizes Japanese numbers found in the given stream.
   *
   * @param input token stream to wrap
   */
  public JapaneseNumberFilter(TokenStream input) {
    super(input);
  }

  /**
   * Advances to the next output token. Consecutive numeral tokens (and numeral punctuation
   * tokens following a numeral) are merged into a single token whose term is the normalized
   * number and whose offsets span the merged tokens. Keyword tokens, tokens with a position
   * increment of zero, and the tokens counted by {@code fallThroughTokens} are emitted as-is.
   *
   * @return true if a token was produced, false otherwise
   * @throws IOException if the wrapped stream throws
   */
  @Override
  public final boolean incrementToken() throws IOException {

    // Emit previously captured token we read past earlier
    if (state != null) {
      restoreState(state);
      state = null;
      return true;
    }

    if (exhausted) {
      return false;
    }

    if (!input.incrementToken()) {
      exhausted = true;
      return false;
    }

    if (keywordAttr.isKeyword()) {
      return true;
    }

    if (fallThroughTokens > 0) {
      fallThroughTokens--;
      return true;
    }

    if (posIncrAttr.getPositionIncrement() == 0) {
      fallThroughTokens = posLengthAttr.getPositionLength() - 1;
      return true;
    }

    boolean moreTokens = true;
    boolean composedNumberToken = false;
    int startOffset = 0;
    int endOffset = 0;

    // Snapshot of the first token, restored if composition has to be abandoned
    State preCompositionState = captureState();
    String term = termAttr.toString();
    boolean numeralTerm = isNumeral(term);

    while (moreTokens && numeralTerm) {

      if (!composedNumberToken) {
        startOffset = offsetAttr.startOffset();
        composedNumberToken = true;
      }

      endOffset = offsetAttr.endOffset();
      moreTokens = input.incrementToken();
      if (!moreTokens) {
        exhausted = true;
      }

      if (posIncrAttr.getPositionIncrement() == 0) {
        // This token is a stacked/synonym token, capture number of tokens "under" this token,
        // except the first token, which we will emit below after restoring state
        fallThroughTokens = posLengthAttr.getPositionLength() - 1;
        state = captureState();
        restoreState(preCompositionState);
        // Composition is abandoned here; drop any digits gathered so far so that they
        // cannot leak into the next number this filter composes
        numeral = new StringBuilder();
        return moreTokens;
      }

      numeral.append(term);

      if (moreTokens) {
        term = termAttr.toString();
        numeralTerm = isNumeral(term) || isNumeralPunctuation(term);
      }
    }

    if (composedNumberToken) {
      if (moreTokens) {
        // We have read past all numerals and there are still tokens left, so
        // capture the state of this token and emit it on our next incrementToken()
        state = captureState();
      }

      String normalizedNumber = normalizeNumber(numeral.toString());

      termAttr.setEmpty();
      termAttr.append(normalizedNumber);
      offsetAttr.setOffset(startOffset, endOffset);

      numeral = new StringBuilder();
      return true;
    }
    return moreTokens;
  }

  /**
   * Resets this filter and the wrapped stream, clearing all per-stream composition state.
   *
   * @throws IOException if the wrapped stream throws
   */
  @Override
  public void reset() throws IOException {
    super.reset();
    fallThroughTokens = 0;
    numeral = new StringBuilder();
    state = null;
    exhausted = false;
  }

  /**
   * Normalizes a Japanese number
   *
   * @param number number to normalize
   * @return normalized number, or number to normalize on error (no op)
   */
  public String normalizeNumber(String number) {
    try {
      BigDecimal normalizedNumber = parseNumber(new NumberBuffer(number));
      if (normalizedNumber == null) {
        return number;
      }
      return normalizedNumber.stripTrailingZeros().toPlainString();
    } catch (NumberFormatException | ArithmeticException e) {
      // Return the source number in case of error, i.e. malformed input
      return number;
    }
  }

  /**
   * Parses a Japanese number as the sum of consecutive large pairs.
   *
   * @param buffer buffer to parse
   * @return parsed number, or null on error or end of input
   */
  private BigDecimal parseNumber(NumberBuffer buffer) {
    BigDecimal result = parseLargePair(buffer);
    if (result == null) {
      return null;
    }

    BigDecimal sum = BigDecimal.ZERO;
    while (result != null) {
      sum = sum.add(result);
      result = parseLargePair(buffer);
    }
    return sum;
  }

  /**
   * Parses a pair of large numbers, i.e. large kanji factor is 10,000(万)or larger
   *
   * @param buffer buffer to parse
   * @return parsed pair, or null on error or end of input
   */
  private BigDecimal parseLargePair(NumberBuffer buffer) {
    BigDecimal first = parseMediumNumber(buffer);
    BigDecimal second = parseLargeKanjiNumeral(buffer);

    if (first == null && second == null) {
      return null;
    }

    if (second == null) {
      // If there's no second factor, we return the first one
      // This can happen if our number is smaller than 10,000 (万)
      return first;
    }

    if (first == null) {
      // If there's no first factor, just return the second one,
      // which is the same as multiplying by 1, i.e. with 万
      return second;
    }

    return first.multiply(second);
  }

  /**
   * Parses a "medium sized" number, typically less than 10,000(万), but might be larger
   * due to a larger factor from {@code parseBasicNumber}.
   *
   * @param buffer buffer to parse
   * @return parsed number, or null on error or end of input
   */
  private BigDecimal parseMediumNumber(NumberBuffer buffer) {
    BigDecimal result = parseMediumPair(buffer);
    if (result == null) {
      return null;
    }

    BigDecimal sum = BigDecimal.ZERO;
    while (result != null) {
      sum = sum.add(result);
      result = parseMediumPair(buffer);
    }
    return sum;
  }

  /**
   * Parses a pair of "medium sized" numbers, i.e. large kanji factor is at most 1,000(千)
   *
   * @param buffer buffer to parse
   * @return parsed pair, or null on error or end of input
   */
  private BigDecimal parseMediumPair(NumberBuffer buffer) {
    BigDecimal first = parseBasicNumber(buffer);
    BigDecimal second = parseMediumKanjiNumeral(buffer);

    if (first == null && second == null) {
      return null;
    }

    if (second == null) {
      // If there's no second factor, we return the first one
      // This can happen if we just have a plain number such as 五
      return first;
    }

    if (first == null) {
      // If there's no first factor, just return the second one,
      // which is the same as multiplying by 1, i.e. with 千
      return second;
    }

    // Return factors multiplied
    return first.multiply(second);
  }

  /**
   * Parse a basic number, which is a sequence of Arabic numbers or a sequence of 0-9 kanji
   * numerals (〇 to 九). Decimal points are kept and thousand separators are skipped.
   *
   * @param buffer buffer to parse
   * @return parsed number, or null if no digit or decimal point was found at the buffer position
   * @throws NumberFormatException if the collected characters do not form a valid decimal number
   */
  private BigDecimal parseBasicNumber(NumberBuffer buffer) {
    StringBuilder builder = new StringBuilder();

    while (buffer.position() < buffer.length()) {
      char c = buffer.charAt(buffer.position());

      if (isArabicNumeral(c)) {
        // Arabic numerals; half-width 0 to 9 or full-width U+FF10 to U+FF19
        builder.append(arabicNumeralValue(c));
      } else if (isKanjiNumeral(c)) {
        // Kanji numerals; 〇, 一, 二, 三, 四, 五, 六, 七, 八, or 九
        builder.append(kanjiNumeralValue(c));
      } else if (isDecimalPoint(c)) {
        builder.append(".");
      } else if (isThousandSeparator(c)) {
        // Just skip and move to the next character
      } else {
        // We don't have an Arabic nor kanji numeral, nor separation or punctuation, so we'll stop.
        break;
      }

      buffer.advance();
    }

    if (builder.length() == 0) {
      // We didn't build anything, so we don't have a number
      return null;
    }

    return new BigDecimal(builder.toString());
  }

  /**
   * Parse large kanji numerals (ten thousands or larger)
   *
   * @param buffer buffer to parse
   * @return parsed number, or null on error or end of input
   */
  public BigDecimal parseLargeKanjiNumeral(NumberBuffer buffer) {
    int i = buffer.position();
    if (i >= buffer.length()) {
      return null;
    }

    int power = exponents[buffer.charAt(i)];
    if (power > 3) {
      buffer.advance();
      return BigDecimal.TEN.pow(power);
    }
    return null;
  }

  /**
   * Parse medium kanji numerals (tens, hundreds or thousands)
   *
   * @param buffer buffer to parse
   * @return parsed number, or null on error or end of input
   */
  public BigDecimal parseMediumKanjiNumeral(NumberBuffer buffer) {
    int i = buffer.position();
    if (i >= buffer.length()) {
      return null;
    }

    int power = exponents[buffer.charAt(i)];
    if (1 <= power && power <= 3) {
      buffer.advance();
      return BigDecimal.TEN.pow(power);
    }
    return null;
  }

  /**
   * Numeral predicate. Note that an empty string yields true, since no character fails the test.
   *
   * @param input string to test
   * @return true if and only if every character of input is a numeral
   */
  public boolean isNumeral(String input) {
    for (int i = 0; i < input.length(); i++) {
      if (!isNumeral(input.charAt(i))) {
        return false;
      }
    }
    return true;
  }

  /**
   * Numeral predicate
   *
   * @param c character to test
   * @return true if and only if c is an Arabic numeral, a kanji digit or a kanji magnitude character
   */
  public boolean isNumeral(char c) {
    return isArabicNumeral(c) || isKanjiNumeral(c) || exponents[c] > 0;
  }

  /**
   * Numeral punctuation predicate. Note that an empty string yields true, since no character
   * fails the test.
   *
   * @param input string to test
   * @return true if and only if every character of input is a numeral punctuation character
   */
  public boolean isNumeralPunctuation(String input) {
    for (int i = 0; i < input.length(); i++) {
      if (!isNumeralPunctuation(input.charAt(i))) {
        return false;
      }
    }
    return true;
  }

  /**
   * Numeral punctuation predicate
   *
   * @param c character to test
   * @return true if and only if c is a numeral punctuation character
   */
  public boolean isNumeralPunctuation(char c) {
    return isDecimalPoint(c) || isThousandSeparator(c);
  }

  /**
   * Arabic numeral predicate. Both half-width and full-width characters are supported
   *
   * @param c character to test
   * @return true if and only if c is an Arabic numeral
   */
  public boolean isArabicNumeral(char c) {
    return isHalfWidthArabicNumeral(c) || isFullWidthArabicNumeral(c);
  }

  /**
   * Arabic half-width numeral predicate
   *
   * @param c character to test
   * @return true if and only if c is a half-width Arabic numeral
   */
  private boolean isHalfWidthArabicNumeral(char c) {
    // U+0030 DIGIT ZERO - U+0039 DIGIT NINE
    return '0' <= c && c <= '9';
  }

  /**
   * Arabic full-width numeral predicate
   *
   * @param c character to test
   * @return true if and only if c is a full-width Arabic numeral
   */
  private boolean isFullWidthArabicNumeral(char c) {
    // U+FF10 FULLWIDTH DIGIT ZERO - U+FF19 FULLWIDTH DIGIT NINE, written as escapes so the
    // range cannot be silently folded to ASCII by width normalization of the source file
    return '\uFF10' <= c && c <= '\uFF19';
  }

  /**
   * Returns the numeric value for the specified character Arabic numeral.
   * Behavior is undefined if a non-Arabic numeral is provided
   *
   * @param c arabic numeral character
   * @return numeral value
   */
  private int arabicNumeralValue(char c) {
    // Half-width digits are offset from U+0030, full-width digits from U+FF10
    int offset = isHalfWidthArabicNumeral(c) ? '0' : '\uFF10';
    return c - offset;
  }

  /**
   * Kanji numeral predicate that tests if the provided character is one of 〇, 一, 二, 三, 四, 五, 六, 七, 八, or 九.
   * Larger number kanji gives a false value.
   *
   * @param c character to test
   * @return true if and only if the character is one of 〇, 一, 二, 三, 四, 五, 六, 七, 八, or 九 (0 to 9)
   */
  private boolean isKanjiNumeral(char c) {
    return numerals[c] != NO_NUMERAL;
  }

  /**
   * Returns the value for the provided kanji numeral. Only numeric values for the characters where
   * {@link #isKanjiNumeral(char)} returns true are supported - behavior is undefined for other characters.
   *
   * @param c kanji numeral character
   * @return numeral value
   * @see #isKanjiNumeral(char)
   */
  private int kanjiNumeralValue(char c) {
    return numerals[c];
  }

  /**
   * Decimal point predicate
   *
   * @param c character to test
   * @return true if and only if c is a decimal point
   */
  private boolean isDecimalPoint(char c) {
    return c == '.'        // U+002E FULL STOP
        || c == '\uFF0E';  // U+FF0E FULLWIDTH FULL STOP
  }

  /**
   * Thousand separator predicate
   *
   * @param c character to test
   * @return true if and only if c is a thousand separator
   */
  private boolean isThousandSeparator(char c) {
    return c == ','        // U+002C COMMA
        || c == '\uFF0C';  // U+FF0C FULLWIDTH COMMA
  }

  /**
   * Buffer that holds a Japanese number string and a position index used as a parsed-to marker
   */
  public static class NumberBuffer {

    /** Index of the next character to be parsed. */
    private int position;

    /** The number text being parsed. */
    private final String string;

    /**
     * Creates a buffer over the given text with the position set to zero.
     *
     * @param string number text to parse
     */
    public NumberBuffer(String string) {
      this.string = string;
      this.position = 0;
    }

    /**
     * Returns the character at the given index of the underlying text.
     *
     * @param index index into the underlying text
     * @return character at index
     */
    public char charAt(int index) {
      return string.charAt(index);
    }

    /**
     * Returns the length of the underlying text.
     *
     * @return length of the underlying text
     */
    public int length() {
      return string.length();
    }

    /** Moves the position one character forward. No bounds check is performed. */
    public void advance() {
      position++;
    }

    /**
     * Returns the current position.
     *
     * @return current position
     */
    public int position() {
      return position;
    }
  }
}