| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.analysis.ja; |
| |
| |
| import java.io.IOException; |
| import java.math.BigDecimal; |
| |
| import org.apache.lucene.analysis.TokenFilter; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; |
| import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
| import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; |
| import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; |
| |
| /** |
| * A {@link TokenFilter} that normalizes Japanese numbers (kansūji) to regular Arabic |
| * decimal numbers in half-width characters. |
| * <p> |
| * Japanese numbers are often written using a combination of kanji and Arabic numbers with |
| * various kinds of punctuation. For example, 3.2千 means 3200. This filter does this kind |
| * of normalization and allows a search for 3200 to match 3.2千 in text, but can also be |
| * used to make range facets based on the normalized numbers and so on. |
| * <p> |
| * Notice that this analyzer uses a token composition scheme and relies on punctuation |
| * tokens being found in the token stream. Please make sure your {@link JapaneseTokenizer} |
| * has {@code discardPunctuation} set to false. In case punctuation characters, such as ． |
| * (U+FF0E FULLWIDTH FULL STOP), are removed from the token stream, this filter would find |
| * input tokens 3 and 2千 and give outputs 3 and 2000 instead of 3200, which is |
| * likely not the intended result. If you want to remove punctuation characters from your |
| * index that are not part of normalized numbers, add a |
| * {@link org.apache.lucene.analysis.StopFilter} with the punctuation you wish to |
| * remove after {@link JapaneseNumberFilter} in your analyzer chain. |
| * <p> |
| * Below are some examples of normalizations this filter supports. The input is untokenized |
| * text and the result is the single term attribute emitted for the input. |
| * <ul> |
| * <li>〇〇七 becomes 7</li> |
| * <li>一〇〇〇 becomes 1000</li> |
| * <li>三千2百2十三 becomes 3223</li> |
| * <li>兆六百万五千一 becomes 1000006005001</li> |
| * <li>3.2千 becomes 3200</li> |
| * <li>1.2万345.67 becomes 12345.67</li> |
| * <li>4,647.100 becomes 4647.1</li> |
| * <li>15,7 becomes 157 (be aware of this weakness)</li> |
| * </ul> |
| * <p> |
| * Tokens preceded by a token with {@link PositionIncrementAttribute} of zero are left |
| * untouched and emitted as-is. |
| * <p> |
| * This filter does not use any part-of-speech information for its normalization and |
| * the motivation for this is to also support n-grammed token streams in the future. |
| * <p> |
| * This filter may in some cases normalize tokens that are not numbers in their context. |
| * For example, 田中京一 is a name and means Tanaka Kyōichi, but 京一 (Kyōichi) out of |
| * context can strictly speaking also represent the number 10000000000000001. This filter |
| * respects the {@link KeywordAttribute}, which can be used to prevent specific |
| * normalizations from happening. |
| * <p> |
| * Also notice that token attributes such as |
| * {@link org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute}, |
| * {@link org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute}, |
| * {@link org.apache.lucene.analysis.ja.tokenattributes.InflectionAttribute} and |
| * {@link org.apache.lucene.analysis.ja.tokenattributes.BaseFormAttribute} are left |
| * unchanged and will inherit the values of the last token used to compose the normalized |
| * number and can be wrong. Hence, for 10万 (10000), we will have |
| * {@link org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute} |
| * set to マン. This is a known issue and is subject to a future improvement. |
| * <p> |
| * Japanese formal numbers (daiji), accounting numbers and decimal fractions are currently |
| * not supported. |
| */ |
| public class JapaneseNumberFilter extends TokenFilter { |
| |
| private final CharTermAttribute termAttr = addAttribute(CharTermAttribute.class); |
| private final OffsetAttribute offsetAttr = addAttribute(OffsetAttribute.class); |
| private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class); |
| private final PositionIncrementAttribute posIncrAttr = addAttribute(PositionIncrementAttribute.class); |
| private final PositionLengthAttribute posLengthAttr = addAttribute(PositionLengthAttribute.class); |
| |
| private static char NO_NUMERAL = Character.MAX_VALUE; |
| |
| private static char[] numerals; |
| |
| private static char[] exponents; |
| |
| private State state; |
| |
| private StringBuilder numeral; |
| |
| private int fallThroughTokens; |
| |
| private boolean exhausted = false; |
| |
static {
  // Start with every code unit marked as "not a digit kanji", then register the ten digits.
  numerals = new char[0x10000];
  for (int codeUnit = numerals.length - 1; codeUnit >= 0; codeUnit--) {
    numerals[codeUnit] = NO_NUMERAL;
  }
  numerals['〇'] = 0; // U+3007
  numerals['一'] = 1; // U+4E00
  numerals['二'] = 2; // U+4E8C
  numerals['三'] = 3; // U+4E09
  numerals['四'] = 4; // U+56DB
  numerals['五'] = 5; // U+4E94
  numerals['六'] = 6; // U+516D
  numerals['七'] = 7; // U+4E03
  numerals['八'] = 8; // U+516B
  numerals['九'] = 9; // U+4E5D

  // A freshly allocated char[] is already all zeros, and zero means "no exponent",
  // so only the power-of-ten kanji need an explicit entry.
  exponents = new char[0x10000];
  exponents['十'] = 1;  // U+5341 10
  exponents['百'] = 2;  // U+767E 100
  exponents['千'] = 3;  // U+5343 1,000
  exponents['万'] = 4;  // U+4E07 10,000
  exponents['億'] = 8;  // U+5104 100,000,000
  exponents['兆'] = 12; // U+5146 1,000,000,000,000
  exponents['京'] = 16; // U+4EAC 10,000,000,000,000,000
  exponents['垓'] = 20; // U+5793 100,000,000,000,000,000,000
}
| |
/**
 * Creates a number-normalizing filter on top of the given token stream.
 *
 * @param input token stream to wrap
 */
public JapaneseNumberFilter(TokenStream input) {
  super(input);
}
| |
| @Override |
| public final boolean incrementToken() throws IOException { |
| |
| // Emit previously captured token we read past earlier |
| if (state != null) { |
| restoreState(state); |
| state = null; |
| return true; |
| } |
| |
| if (exhausted) { |
| return false; |
| } |
| |
| if (!input.incrementToken()) { |
| exhausted = true; |
| return false; |
| } |
| |
| if (keywordAttr.isKeyword()) { |
| return true; |
| } |
| |
| if (fallThroughTokens > 0) { |
| fallThroughTokens--; |
| return true; |
| } |
| |
| if (posIncrAttr.getPositionIncrement() == 0) { |
| fallThroughTokens = posLengthAttr.getPositionLength() - 1; |
| return true; |
| } |
| |
| boolean moreTokens = true; |
| boolean composedNumberToken = false; |
| int startOffset = 0; |
| int endOffset = 0; |
| State preCompositionState = captureState(); |
| String term = termAttr.toString(); |
| boolean numeralTerm = isNumeral(term); |
| |
| while (moreTokens && numeralTerm) { |
| |
| if (!composedNumberToken) { |
| startOffset = offsetAttr.startOffset(); |
| composedNumberToken = true; |
| } |
| |
| endOffset = offsetAttr.endOffset(); |
| moreTokens = input.incrementToken(); |
| if (moreTokens == false) { |
| exhausted = true; |
| } |
| |
| if (posIncrAttr.getPositionIncrement() == 0) { |
| // This token is a stacked/synonym token, capture number of tokens "under" this token, |
| // except the first token, which we will emit below after restoring state |
| fallThroughTokens = posLengthAttr.getPositionLength() - 1; |
| state = captureState(); |
| restoreState(preCompositionState); |
| return moreTokens; |
| } |
| |
| numeral.append(term); |
| |
| if (moreTokens) { |
| term = termAttr.toString(); |
| numeralTerm = isNumeral(term) || isNumeralPunctuation(term); |
| } |
| } |
| |
| if (composedNumberToken) { |
| if (moreTokens) { |
| // We have read past all numerals and there are still tokens left, so |
| // capture the state of this token and emit it on our next incrementToken() |
| state = captureState(); |
| } |
| |
| String normalizedNumber = normalizeNumber(numeral.toString()); |
| |
| termAttr.setEmpty(); |
| termAttr.append(normalizedNumber); |
| offsetAttr.setOffset(startOffset, endOffset); |
| |
| numeral = new StringBuilder(); |
| return true; |
| } |
| return moreTokens; |
| } |
| |
| @Override |
| public void reset() throws IOException { |
| super.reset(); |
| fallThroughTokens = 0; |
| numeral = new StringBuilder(); |
| state = null; |
| exhausted = false; |
| } |
| |
| /** |
| * Normalizes a Japanese number |
| * |
| * @param number number or normalize |
| * @return normalized number, or number to normalize on error (no op) |
| */ |
| public String normalizeNumber(String number) { |
| try { |
| BigDecimal normalizedNumber = parseNumber(new NumberBuffer(number)); |
| if (normalizedNumber == null) { |
| return number; |
| } |
| return normalizedNumber.stripTrailingZeros().toPlainString(); |
| } catch (NumberFormatException | ArithmeticException e) { |
| // Return the source number in case of error, i.e. malformed input |
| return number; |
| } |
| } |
| |
| /** |
| * Parses a Japanese number |
| * |
| * @param buffer buffer to parse |
| * @return parsed number, or null on error or end of input |
| */ |
| private BigDecimal parseNumber(NumberBuffer buffer) { |
| BigDecimal sum = BigDecimal.ZERO; |
| BigDecimal result = parseLargePair(buffer); |
| |
| if (result == null) { |
| return null; |
| } |
| |
| while (result != null) { |
| sum = sum.add(result); |
| result = parseLargePair(buffer); |
| } |
| |
| return sum; |
| } |
| |
| /** |
| * Parses a pair of large numbers, i.e. large kanji factor is 10,000(万)or larger |
| * |
| * @param buffer buffer to parse |
| * @return parsed pair, or null on error or end of input |
| */ |
| private BigDecimal parseLargePair(NumberBuffer buffer) { |
| BigDecimal first = parseMediumNumber(buffer); |
| BigDecimal second = parseLargeKanjiNumeral(buffer); |
| |
| if (first == null && second == null) { |
| return null; |
| } |
| |
| if (second == null) { |
| // If there's no second factor, we return the first one |
| // This can happen if we our number is smaller than 10,000 (万) |
| return first; |
| } |
| |
| if (first == null) { |
| // If there's no first factor, just return the second one, |
| // which is the same as multiplying by 1, i.e. with 万 |
| return second; |
| } |
| |
| return first.multiply(second); |
| } |
| |
| /** |
| * Parses a "medium sized" number, typically less than 10,000(万), but might be larger |
| * due to a larger factor from {link parseBasicNumber}. |
| * |
| * @param buffer buffer to parse |
| * @return parsed number, or null on error or end of input |
| */ |
| private BigDecimal parseMediumNumber(NumberBuffer buffer) { |
| BigDecimal sum = BigDecimal.ZERO; |
| BigDecimal result = parseMediumPair(buffer); |
| |
| if (result == null) { |
| return null; |
| } |
| |
| while (result != null) { |
| sum = sum.add(result); |
| result = parseMediumPair(buffer); |
| } |
| |
| return sum; |
| } |
| |
| /** |
| * Parses a pair of "medium sized" numbers, i.e. large kanji factor is at most 1,000(千) |
| * |
| * @param buffer buffer to parse |
| * @return parsed pair, or null on error or end of input |
| */ |
| private BigDecimal parseMediumPair(NumberBuffer buffer) { |
| |
| BigDecimal first = parseBasicNumber(buffer); |
| BigDecimal second = parseMediumKanjiNumeral(buffer); |
| |
| if (first == null && second == null) { |
| return null; |
| } |
| |
| if (second == null) { |
| // If there's no second factor, we return the first one |
| // This can happen if we just have a plain number such as 五 |
| return first; |
| } |
| |
| if (first == null) { |
| // If there's no first factor, just return the second one, |
| // which is the same as multiplying by 1, i.e. with 千 |
| return second; |
| } |
| |
| // Return factors multiplied |
| return first.multiply(second); |
| } |
| |
| /** |
| * Parse a basic number, which is a sequence of Arabic numbers or a sequence or 0-9 kanji numerals (〇 to 九). |
| * |
| * @param buffer buffer to parse |
| * @return parsed number, or null on error or end of input |
| */ |
| private BigDecimal parseBasicNumber(NumberBuffer buffer) { |
| StringBuilder builder = new StringBuilder(); |
| int i = buffer.position(); |
| |
| while (i < buffer.length()) { |
| char c = buffer.charAt(i); |
| |
| if (isArabicNumeral(c)) { |
| // Arabic numerals; 0 to 9 or 0 to 9 (full-width) |
| builder.append(arabicNumeralValue(c)); |
| } else if (isKanjiNumeral(c)) { |
| // Kanji numerals; 〇, 一, 二, 三, 四, 五, 六, 七, 八, or 九 |
| builder.append(kanjiNumeralValue(c)); |
| } else if (isDecimalPoint(c)) { |
| builder.append("."); |
| } else if (isThousandSeparator(c)) { |
| // Just skip and move to the next character |
| } else { |
| // We don't have an Arabic nor kanji numeral, nor separation or punctuation, so we'll stop. |
| break; |
| } |
| |
| i++; |
| buffer.advance(); |
| } |
| |
| if (builder.length() == 0) { |
| // We didn't build anything, so we don't have a number |
| return null; |
| } |
| |
| return new BigDecimal(builder.toString()); |
| } |
| |
| /** |
| * Parse large kanji numerals (ten thousands or larger) |
| * |
| * @param buffer buffer to parse |
| * @return parsed number, or null on error or end of input |
| */ |
| public BigDecimal parseLargeKanjiNumeral(NumberBuffer buffer) { |
| int i = buffer.position(); |
| |
| if (i >= buffer.length()) { |
| return null; |
| } |
| |
| char c = buffer.charAt(i); |
| int power = exponents[c]; |
| |
| if (power > 3) { |
| buffer.advance(); |
| return BigDecimal.TEN.pow(power); |
| } |
| |
| return null; |
| } |
| |
| /** |
| * Parse medium kanji numerals (tens, hundreds or thousands) |
| * |
| * @param buffer buffer to parse |
| * @return parsed number or null on error |
| */ |
| public BigDecimal parseMediumKanjiNumeral(NumberBuffer buffer) { |
| int i = buffer.position(); |
| |
| if (i >= buffer.length()) { |
| return null; |
| } |
| |
| char c = buffer.charAt(i); |
| int power = exponents[c]; |
| |
| if (1 <= power && power <= 3) { |
| buffer.advance(); |
| return BigDecimal.TEN.pow(power); |
| } |
| |
| return null; |
| } |
| |
| /** |
| * Numeral predicate |
| * |
| * @param input string to test |
| * @return true if and only if input is a numeral |
| */ |
| public boolean isNumeral(String input) { |
| for (int i = 0; i < input.length(); i++) { |
| if (!isNumeral(input.charAt(i))) { |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| /** |
| * Numeral predicate |
| * |
| * @param c character to test |
| * @return true if and only if c is a numeral |
| */ |
| public boolean isNumeral(char c) { |
| return isArabicNumeral(c) || isKanjiNumeral(c) || exponents[c] > 0; |
| } |
| |
| /** |
| * Numeral punctuation predicate |
| * |
| * @param input string to test |
| * @return true if and only if c is a numeral punctuation string |
| */ |
| public boolean isNumeralPunctuation(String input) { |
| for (int i = 0; i < input.length(); i++) { |
| if (!isNumeralPunctuation(input.charAt(i))) { |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| /** |
| * Numeral punctuation predicate |
| * |
| * @param c character to test |
| * @return true if and only if c is a numeral punctuation character |
| */ |
| public boolean isNumeralPunctuation(char c) { |
| return isDecimalPoint(c) || isThousandSeparator(c); |
| } |
| |
| /** |
| * Arabic numeral predicate. Both half-width and full-width characters are supported |
| * |
| * @param c character to test |
| * @return true if and only if c is an Arabic numeral |
| */ |
| public boolean isArabicNumeral(char c) { |
| return isHalfWidthArabicNumeral(c) || isFullWidthArabicNumeral(c); |
| } |
| |
| /** |
| * Arabic half-width numeral predicate |
| * |
| * @param c character to test |
| * @return true if and only if c is a half-width Arabic numeral |
| */ |
| private boolean isHalfWidthArabicNumeral(char c) { |
| // 0 U+0030 - 9 U+0039 |
| return '0' <= c && c <= '9'; |
| } |
| |
| /** |
| * Arabic full-width numeral predicate |
| * |
| * @param c character to test |
| * @return true if and only if c is a full-width Arabic numeral |
| */ |
| private boolean isFullWidthArabicNumeral(char c) { |
| // 0 U+FF10 - 9 U+FF19 |
| return '0' <= c && c <= '9'; |
| } |
| |
| /** |
| * Returns the numeric value for the specified character Arabic numeral. |
| * Behavior is undefined if a non-Arabic numeral is provided |
| * |
| * @param c arabic numeral character |
| * @return numeral value |
| */ |
| private int arabicNumeralValue(char c) { |
| int offset; |
| if (isHalfWidthArabicNumeral(c)) { |
| offset = '0'; |
| } else { |
| offset = '0'; |
| } |
| return c - offset; |
| } |
| |
| /** |
| * Kanji numeral predicate that tests if the provided character is one of 〇, 一, 二, 三, 四, 五, 六, 七, 八, or 九. |
| * Larger number kanji gives a false value. |
| * |
| * @param c character to test |
| * @return true if and only is character is one of 〇, 一, 二, 三, 四, 五, 六, 七, 八, or 九 (0 to 9) |
| */ |
| private boolean isKanjiNumeral(char c) { |
| return numerals[c] != NO_NUMERAL; |
| } |
| |
| /** |
| * Returns the value for the provided kanji numeral. Only numeric values for the characters where |
| * {link isKanjiNumeral} return true are supported - behavior is undefined for other characters. |
| * |
| * @param c kanji numeral character |
| * @return numeral value |
| * @see #isKanjiNumeral(char) |
| */ |
| private int kanjiNumeralValue(char c) { |
| return numerals[c]; |
| } |
| |
| /** |
| * Decimal point predicate |
| * |
| * @param c character to test |
| * @return true if and only if c is a decimal point |
| */ |
| private boolean isDecimalPoint(char c) { |
| return c == '.' // U+002E FULL STOP |
| || c == '.'; // U+FF0E FULLWIDTH FULL STOP |
| } |
| |
| /** |
| * Thousand separator predicate |
| * |
| * @param c character to test |
| * @return true if and only if c is a thousand separator predicate |
| */ |
| private boolean isThousandSeparator(char c) { |
| return c == ',' // U+002C COMMA |
| || c == ','; // U+FF0C FULLWIDTH COMMA |
| } |
| |
| /** |
| * Buffer that holds a Japanese number string and a position index used as a parsed-to marker |
| */ |
| public static class NumberBuffer { |
| |
| private int position; |
| |
| private String string; |
| |
| public NumberBuffer(String string) { |
| this.string = string; |
| this.position = 0; |
| } |
| |
| public char charAt(int index) { |
| return string.charAt(index); |
| } |
| |
| public int length() { |
| return string.length(); |
| } |
| |
| public void advance() { |
| position++; |
| } |
| |
| public int position() { |
| return position; |
| } |
| } |
| } |