/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ja;
import java.io.IOException;
import java.io.Reader;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.junit.Ignore;
import org.junit.Test;
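/**
 * Tests for {@link JapaneseNumberFilter}, which normalizes Japanese kanji and
 * mixed kanji/Arabic/full-width numerals in a token stream to regular Arabic numbers.
 */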
public class TestJapaneseNumberFilter extends BaseTokenStreamTestCase {
private Analyzer analyzer;
@Override
public void setUp() throws Exception {
super.setUp();
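// Analyzer under test: JapaneseTokenizer in SEARCH mode (no user dictionary;
// the two false flags keep punctuation and compound tokens) feeding JapaneseNumberFilter.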
analyzer =
new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer =
new JapaneseTokenizer(
newAttributeFactory(), null, false, false, JapaneseTokenizer.Mode.SEARCH);
return new TokenStreamComponents(tokenizer, new JapaneseNumberFilter(tokenizer));
}
};
}
@Override
public void tearDown() throws Exception {
analyzer.close();
super.tearDown();
}
@Test
public void testBasics() throws IOException {
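// 十万二千五百 (100,000 + 2,000 + 500) becomes the single token "102500";
// the int arrays are the expected start and end offsets.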
assertAnalyzesTo(
analyzer,
"本日十万二千五百円のワインを買った",
new String[] {"本日", "102500", "円", "の", "ワイン", "を", "買っ", "た"},
new int[] {0, 2, 8, 9, 10, 13, 14, 16},
new int[] {2, 8, 9, 10, 13, 14, 16, 17});
assertAnalyzesTo(
analyzer,
"昨日のお寿司は10万円でした。",
new String[] {"昨日", "の", "お", "寿司", "は", "100000", "円", "でし", "た", "。"},
new int[] {0, 2, 3, 4, 6, 7, 10, 11, 13, 14},
new int[] {2, 3, 4, 6, 7, 10, 11, 13, 14, 15});
assertAnalyzesTo(
analyzer,
"アティリカの資本金は600万円です",
new String[] {"アティリカ", "の", "資本", "金", "は", "6000000", "円", "です"},
new int[] {0, 5, 6, 8, 9, 10, 14, 15},
new int[] {5, 6, 8, 9, 10, 14, 15, 17});
}
@Test
public void testVariants() throws IOException {
// Test variants of three
assertAnalyzesTo(analyzer, "3", new String[] {"3"});
assertAnalyzesTo(analyzer, "3", new String[] {"3"});
assertAnalyzesTo(analyzer, "三", new String[] {"3"});
// Test variations of three with leading zeros
assertAnalyzesTo(analyzer, "03", new String[] {"3"});
assertAnalyzesTo(analyzer, "０３", new String[] {"3"});
assertAnalyzesTo(analyzer, "〇三", new String[] {"3"});
assertAnalyzesTo(analyzer, "003", new String[] {"3"});
assertAnalyzesTo(analyzer, "００３", new String[] {"3"});
assertAnalyzesTo(analyzer, "〇〇三", new String[] {"3"});
// Test thousand variants
assertAnalyzesTo(analyzer, "千", new String[] {"1000"});
assertAnalyzesTo(analyzer, "1千", new String[] {"1000"});
assertAnalyzesTo(analyzer, "1千", new String[] {"1000"});
assertAnalyzesTo(analyzer, "一千", new String[] {"1000"});
assertAnalyzesTo(analyzer, "一〇〇〇", new String[] {"1000"});
assertAnalyzesTo(analyzer, "10百", new String[] {"1000"}); // Strange, but supported
}
@Test
public void testLargeVariants() throws IOException {
// Test large numbers
assertAnalyzesTo(analyzer, "三五七八九", new String[] {"35789"});
assertAnalyzesTo(analyzer, "六百二万五千一", new String[] {"6025001"});
assertAnalyzesTo(analyzer, "兆六百万五千一", new String[] {"1000006005001"});
assertAnalyzesTo(analyzer, "十兆六百万五千一", new String[] {"10000006005001"});
assertAnalyzesTo(analyzer, "一京一", new String[] {"10000000000000001"});
assertAnalyzesTo(analyzer, "十京十", new String[] {"100000000000000010"});
assertAnalyzesTo(analyzer, "垓京兆億万千百十一", new String[] {"100010001000100011111"});
}
@Test
public void testNegative() throws IOException {
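// The minus sign is not part of the number; it is emitted as a separate token.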
assertAnalyzesTo(analyzer, "-100万", new String[] {"-", "1000000"});
}
@Test
public void testMixed() throws IOException {
// Test mixed numbers
assertAnalyzesTo(analyzer, "三千2百2十三", new String[] {"3223"});
assertAnalyzesTo(analyzer, "32二三", new String[] {"3223"});
}
@Test
public void testNininsankyaku() throws IOException {
// Unstacked tokens
assertAnalyzesTo(analyzer, "二", new String[] {"2"});
assertAnalyzesTo(analyzer, "二人", new String[] {"2", "人"});
assertAnalyzesTo(analyzer, "二人三", new String[] {"2", "人", "3"});
// Stacked tokens - emit tokens as they are
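// (SEARCH mode emits the compound 二人三脚 plus its parts at overlapping positions;
// the filter does not merge numerals inside such stacks.)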
assertAnalyzesTo(analyzer, "二人三脚", new String[] {"二", "二人三脚", "人", "三", "脚"});
}
@Test
public void testFujiyaichinisanu() throws IOException {
// Stacked tokens with a numeral partial
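// (不二家 "Fujiya" decompounds, so the 二 inside the stack is left untouched, while
// the unstacked trailing 一二三 is normalized to 123.)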
assertAnalyzesTo(analyzer, "不二家一二三", new String[] {"不", "不二家", "二", "家", "123"});
}
@Test
public void testFunny() throws IOException {
// Test some oddities for inconsistent input
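// Repeated units are summed, not multiplied: 十十 = 10 + 10 = 20, and so on.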
assertAnalyzesTo(analyzer, "十十", new String[] {"20"}); // 100?
assertAnalyzesTo(analyzer, "百百百", new String[] {"300"}); // 10,000?
assertAnalyzesTo(analyzer, "千千千千", new String[] {"4000"}); // 1,000,000,000,000?
}
@Test
public void testKanjiArabic() throws IOException {
// Test kanji numerals used as Arabic digits (with leading zeros)
assertAnalyzesTo(analyzer, "〇一二三四五六七八九九八七六五四三二一〇", new String[] {"1234567899876543210"});
// I'm Bond, James "normalized" Bond...
assertAnalyzesTo(analyzer, "〇〇七", new String[] {"7"});
}
@Test
public void testDoubleZero() throws IOException {
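// Expected: token "0" with start offset 0, end offset 2, and position increment 1.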
assertAnalyzesTo(
analyzer, "〇〇", new String[] {"0"}, new int[] {0}, new int[] {2}, new int[] {1});
}
@Test
public void testName() throws IOException {
// Test a name that normalizes to a number
assertAnalyzesTo(
analyzer,
"田中京一",
new String[] {"田中", "10000000000000001"}, // 京一 is normalized to a number
new int[] {0, 2},
new int[] {2, 4},
new int[] {1, 1});
// An analyzer that marks 京一 as a keyword
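// (JapaneseNumberFilter skips tokens flagged as keywords by SetKeywordMarkerFilter.)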
Analyzer keywordMarkingAnalyzer =
new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
CharArraySet set = new CharArraySet(1, false);
set.add("京一");
Tokenizer tokenizer =
new JapaneseTokenizer(
newAttributeFactory(), null, false, JapaneseTokenizer.Mode.SEARCH);
return new TokenStreamComponents(
tokenizer, new JapaneseNumberFilter(new SetKeywordMarkerFilter(tokenizer, set)));
}
};
assertAnalyzesTo(
keywordMarkingAnalyzer,
"田中京一",
new String[] {"田中", "京一"}, // 京一 is not normalized
new int[] {0, 2},
new int[] {2, 4},
new int[] {1, 1});
keywordMarkingAnalyzer.close();
}
@Test
public void testDecimal() throws IOException {
// Test Arabic decimals combined with a kanji power of ten: 1.2万 = 12,000, plus 345.67
assertAnalyzesTo(analyzer, "1.2万345.67", new String[] {"12345.67"});
}
@Test
public void testDecimalPunctuation() throws IOException {
// Test an Arabic decimal with a kanji multiplier and a counter word: 3.2千 = 3,200, then 円 (yen)
assertAnalyzesTo(analyzer, "3.2千円", new String[] {"3200", "円"});
}
@Test
public void testThousandSeparator() throws IOException {
assertAnalyzesTo(analyzer, "4,647", new String[] {"4647"});
}
@Test
public void testDecimalThousandSeparator() throws IOException {
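// The thousands separator is removed and trailing zeros in the fraction are stripped.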
assertAnalyzesTo(analyzer, "4,647.0010", new String[] {"4647.001"});
}
@Test
public void testCommaDecimalSeparator() throws IOException {
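// The comma is always treated as a thousands separator, so 15,7 becomes 157 (not 15.7).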
assertAnalyzesTo(analyzer, "15,7", new String[] {"157"});
}
@Test
public void testTrailingZeroStripping() throws IOException {
assertAnalyzesTo(analyzer, "1000.1000", new String[] {"1000.1"});
assertAnalyzesTo(analyzer, "1000.0000", new String[] {"1000"});
}
@Test
public void testEmpty() throws IOException {
assertAnalyzesTo(analyzer, "", new String[] {});
}
@Test
public void testRandomHugeStrings() throws Exception {
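// Blast random strings through the analyzer to check for exceptions and consistency.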
checkRandomData(random(), analyzer, RANDOM_MULTIPLIER, 4096);
}
@Test
@Nightly
public void testRandomHugeStringsAtNight() throws Exception {
checkRandomData(random(), analyzer, 3 * RANDOM_MULTIPLIER, 8192);
}
@Test
public void testRandomSmallStrings() throws Exception {
checkRandomData(random(), analyzer, 100 * RANDOM_MULTIPLIER, 128);
}
@Test
public void testFunnyIssue() throws Exception {
BaseTokenStreamTestCase.checkAnalysisConsistency(
random(), analyzer, true, "〇〇\u302f\u3029\u3039\u3023\u3033\u302bB", true);
}
@Ignore(
"This test is used during development to analyze normalizations in large amounts of text")
@Test
public void testLargeData() throws IOException {
Path input = Paths.get("/tmp/test.txt");
Path tokenizedOutput = Paths.get("/tmp/test.tok.txt");
Path normalizedOutput = Paths.get("/tmp/test.norm.txt");
Analyzer plainAnalyzer =
new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer =
new JapaneseTokenizer(
newAttributeFactory(), null, false, JapaneseTokenizer.Mode.SEARCH);
return new TokenStreamComponents(tokenizer);
}
};
analyze(
plainAnalyzer,
Files.newBufferedReader(input, StandardCharsets.UTF_8),
Files.newBufferedWriter(tokenizedOutput, StandardCharsets.UTF_8));
analyze(
analyzer,
Files.newBufferedReader(input, StandardCharsets.UTF_8),
Files.newBufferedWriter(normalizedOutput, StandardCharsets.UTF_8));
plainAnalyzer.close();
}
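// Helper for testLargeData: analyzes text from the reader, writing one token per line.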
public void analyze(Analyzer analyzer, Reader reader, Writer writer) throws IOException {
TokenStream stream = analyzer.tokenStream("dummy", reader);
stream.reset();
CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
while (stream.incrementToken()) {
writer.write(termAttr.toString());
writer.write("\n");
}
// Honor the TokenStream contract before closing the underlying streams.
stream.end();
stream.close();
reader.close();
writer.close();
}
}