/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.ko;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ko.dict.UserDictionary;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.junit.Ignore;
import org.junit.Test;

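/**
 * Tests for {@link KoreanNumberFilter}, which normalizes Korean number expressions to regular
 * Arabic numbers, e.g. 십만이천오백 becomes 102500.
 */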
public class TestKoreanNumberFilter extends BaseTokenStreamTestCase {
  private Analyzer analyzer;

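  /**
   * Loads the user dictionary from the userdict.txt test resource; the file lives next to {@link
   * TestKoreanTokenizer}, hence the lookup through that class.
   */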
  public static UserDictionary readDict() {
    InputStream is = TestKoreanTokenizer.class.getResourceAsStream("userdict.txt");
    if (is == null) {
      throw new RuntimeException("Cannot find userdict.txt in test classpath!");
    }
    // try-with-resources closes the reader, and with it the underlying stream
    try (Reader reader = new InputStreamReader(is, StandardCharsets.UTF_8)) {
      return UserDictionary.open(reader);
    } catch (IOException ioe) {
      throw new RuntimeException(ioe);
    }
  }

  @Override
  public void setUp() throws Exception {
    super.setUp();
    UserDictionary userDictionary = readDict();
    Set<POS.Tag> stopTags = new HashSet<>();
    stopTags.add(POS.Tag.SP);
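    // Chain under test: KoreanTokenizer -> part-of-speech stop filter (drops SP, i.e. whitespace
    // tokens) -> KoreanNumberFilter.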
    analyzer =
        new Analyzer() {
          @Override
          protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer =
                new KoreanTokenizer(
                    newAttributeFactory(),
                    userDictionary,
                    KoreanTokenizer.DEFAULT_DECOMPOUND,
                    false,
                    false);
            TokenStream stream = new KoreanPartOfSpeechStopFilter(tokenizer, stopTags);
            return new TokenStreamComponents(tokenizer, new KoreanNumberFilter(stream));
          }
        };
  }

  @Override
  public void tearDown() throws Exception {
    analyzer.close();
    super.tearDown();
  }

  @Test
  public void testBasics() throws IOException {
    assertAnalyzesTo(
        analyzer,
        "오늘 십만이천오백원의 와인 구입",
        new String[] {"오늘", "102500", "원", "의", "와인", "구입"},
        new int[] {0, 3, 9, 10, 12, 15},
        new int[] {2, 9, 10, 11, 14, 17});

    // Known-wrong analysis: "초밥" should stay one token ("초밥"), not "초" + "밥"
    assertAnalyzesTo(
        analyzer,
        "어제 초밥 가격은 10만 원",
        new String[] {"어제", "초", "밥", "가격", "은", "100000", "원"},
        new int[] {0, 3, 4, 6, 8, 10, 14},
        new int[] {2, 4, 5, 8, 9, 13, 15});

    assertAnalyzesTo(
        analyzer,
        "자본금 600만 원",
        new String[] {"자본", "금", "6000000", "원"},
        new int[] {0, 2, 4, 9},
        new int[] {2, 3, 8, 10});
  }

  @Test
  public void testVariants() throws IOException {
    // Test variants of three: half-width, full-width, and Hangul
    assertAnalyzesTo(analyzer, "3", new String[] {"3"});
    assertAnalyzesTo(analyzer, "３", new String[] {"3"});
    assertAnalyzesTo(analyzer, "삼", new String[] {"3"});

    // Test variants of three with leading zeros
    assertAnalyzesTo(analyzer, "03", new String[] {"3"});
    assertAnalyzesTo(analyzer, "０３", new String[] {"3"});
    assertAnalyzesTo(analyzer, "영삼", new String[] {"3"});
    assertAnalyzesTo(analyzer, "003", new String[] {"3"});
    assertAnalyzesTo(analyzer, "００３", new String[] {"3"});
    assertAnalyzesTo(analyzer, "영영삼", new String[] {"3"});

    // Test thousand variants
    assertAnalyzesTo(analyzer, "천", new String[] {"1000"});
    assertAnalyzesTo(analyzer, "1천", new String[] {"1000"});
    assertAnalyzesTo(analyzer, "１천", new String[] {"1000"});
    assertAnalyzesTo(analyzer, "일천", new String[] {"1000"});
    assertAnalyzesTo(analyzer, "일영영영", new String[] {"1000"});
    assertAnalyzesTo(analyzer, "10백", new String[] {"1000"}); // Strange, but supported
  }

  @Test
  public void testLargeVariants() throws IOException {
    // Test large numbers
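    // Korean myriad units scale by 10^4: 만 = 10^4, 억 = 10^8, 조 = 10^12, 경 = 10^16, 해 = 10^20.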
    assertAnalyzesTo(analyzer, "삼오칠팔구", new String[] {"35789"});
    assertAnalyzesTo(analyzer, "육백이만오천일", new String[] {"6025001"});
    assertAnalyzesTo(analyzer, "조육백만오천일", new String[] {"1000006005001"});
    assertAnalyzesTo(analyzer, "십조육백만오천일", new String[] {"10000006005001"});
    assertAnalyzesTo(analyzer, "일경일", new String[] {"10000000000000001"});
    assertAnalyzesTo(analyzer, "십경십", new String[] {"100000000000000010"});
    assertAnalyzesTo(analyzer, "해경조억만천백십일", new String[] {"100010001000100011111"});
  }

  @Test
  public void testNegative() throws IOException {
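    // The leading minus sign is not folded into the number; it survives as its own token.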
    assertAnalyzesTo(analyzer, "-백만", new String[] {"-", "1000000"});
  }

  @Test
  public void testMixed() throws IOException {
    // Test mixed numbers
    assertAnalyzesTo(analyzer, "삼천2백2십삼", new String[] {"3223"});
    assertAnalyzesTo(analyzer, "32이삼", new String[] {"3223"});
  }

  @Test
  public void testFunny() throws IOException {
    // Test some oddities for inconsistent input
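    // Repeated units are summed (십십 -> 10 + 10 = 20) rather than multiplied; the trailing
    // comments show the multiplicative readings one might otherwise expect.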
    assertAnalyzesTo(analyzer, "십십", new String[] {"20"}); // 100?
    assertAnalyzesTo(analyzer, "백백백", new String[] {"300"}); // 10,000?
    assertAnalyzesTo(analyzer, "천천천천", new String[] {"4000"}); // 1,000,000,000,000?
  }

  @Test
  public void testHangulArabic() throws IOException {
    // Test Hangul numerals used as Arabic digits (with a leading zero)
    assertAnalyzesTo(analyzer, "영일이삼사오육칠팔구구팔칠육오사삼이일영", new String[] {"1234567899876543210"});

    // I'm Bond, James "normalized" Bond...
    assertAnalyzesTo(analyzer, "영영칠", new String[] {"7"});
  }

  @Test
  public void testDoubleZero() throws IOException {
    assertAnalyzesTo(
        analyzer, "영영", new String[] {"0"}, new int[] {0}, new int[] {2}, new int[] {1});
  }

  @Test
  public void testName() throws IOException {
    // Test a name whose tail normalizes to a number
    assertAnalyzesTo(
        analyzer,
        "전중경일",
        new String[] {"전중", "10000000000000001"}, // 경일 is normalized to a number
        new int[] {0, 2},
        new int[] {2, 4},
        new int[] {1, 1});

    // An analyzer that marks 경일 as a keyword, so the number filter leaves it alone
    Analyzer keywordMarkingAnalyzer =
        new Analyzer() {

          @Override
          protected TokenStreamComponents createComponents(String fieldName) {
            CharArraySet set = new CharArraySet(1, false);
            set.add("경일");
            UserDictionary userDictionary = readDict();
            Set<POS.Tag> stopTags = new HashSet<>();
            stopTags.add(POS.Tag.SP);
            Tokenizer tokenizer =
                new KoreanTokenizer(
                    newAttributeFactory(),
                    userDictionary,
                    KoreanTokenizer.DEFAULT_DECOMPOUND,
                    false,
                    false);
            TokenStream stream = new KoreanPartOfSpeechStopFilter(tokenizer, stopTags);
            return new TokenStreamComponents(
                tokenizer, new KoreanNumberFilter(new SetKeywordMarkerFilter(stream, set)));
          }
        };

    assertAnalyzesTo(
        keywordMarkingAnalyzer,
        "전중경일",
        new String[] {"전중", "경일"}, // 경일 is not normalized
        new int[] {0, 2},
        new int[] {2, 4},
        new int[] {1, 1});
    keywordMarkingAnalyzer.close();
  }

  @Test
  public void testDecimal() throws IOException {
    // Test a decimal Arabic prefix on a Hangul unit, e.g. 1.2만 = 12,000, plus a decimal remainder
    assertAnalyzesTo(analyzer, "1.2만345.67", new String[] {"12345.67"});
  }

  @Test
  public void testDecimalPunctuation() throws IOException {
    // Test Arabic numbers with punctuation, e.g. 3.2천 원 = 3.2 thousand won
    assertAnalyzesTo(analyzer, "3.2천 원", new String[] {"3200", "원"});
  }

  @Test
  public void testThousandSeparator() throws IOException {
    assertAnalyzesTo(analyzer, "4,647", new String[] {"4647"});
  }

  @Test
  public void testDecimalThousandSeparator() throws IOException {
    assertAnalyzesTo(analyzer, "4,647.0010", new String[] {"4647.001"});
  }

  @Test
  public void testCommaDecimalSeparator() throws IOException {
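    // A comma is consumed as a digit-group separator even when it looks like a decimal comma,
    // so "15,7" normalizes to 157 rather than 15.7.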
    assertAnalyzesTo(analyzer, "15,7", new String[] {"157"});
  }

  @Test
  public void testTrailingZeroStripping() throws IOException {
    assertAnalyzesTo(analyzer, "1000.1000", new String[] {"1000.1"});
    assertAnalyzesTo(analyzer, "1000.0000", new String[] {"1000"});
  }

  @Test
  public void testEmpty() throws IOException {
    assertAnalyzesTo(analyzer, "", new String[] {});
  }

  @Test
  public void testRandomHugeStrings() throws Exception {
    checkRandomData(random(), analyzer, RANDOM_MULTIPLIER, 4096);
  }

  @Test
  @Nightly
  public void testRandomHugeStringsAtNight() throws Exception {
    checkRandomData(random(), analyzer, 5 * RANDOM_MULTIPLIER, 8192);
  }

  @Test
  public void testRandomSmallStrings() throws Exception {
    checkRandomData(random(), analyzer, 100 * RANDOM_MULTIPLIER, 128);
  }

  @Test
  public void testFunnyIssue() throws Exception {
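    // Regression input: 영영 followed by assorted CJK numeral and tone-mark code points and a
    // Latin letter, checked for analysis consistency.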
    BaseTokenStreamTestCase.checkAnalysisConsistency(
        random(), analyzer, true, "영영\u302f\u3029\u3039\u3023\u3033\u302bB", true);
  }

  @Ignore(
      "This test is used during development when analyzing normalization of large amounts of text")
  @Test
  public void testLargeData() throws IOException {
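    // Hardcoded scratch paths; point these at a local corpus before running the test manually.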
    Path input = Paths.get("/tmp/test.txt");
    Path tokenizedOutput = Paths.get("/tmp/test.tok.txt");
    Path normalizedOutput = Paths.get("/tmp/test.norm.txt");

    Analyzer plainAnalyzer =
        new Analyzer() {
          @Override
          protected TokenStreamComponents createComponents(String fieldName) {
            UserDictionary userDictionary = readDict();
            Set<POS.Tag> stopTags = new HashSet<>();
            stopTags.add(POS.Tag.SP);
            Tokenizer tokenizer =
                new KoreanTokenizer(
                    newAttributeFactory(),
                    userDictionary,
                    KoreanTokenizer.DEFAULT_DECOMPOUND,
                    false,
                    false);
            return new TokenStreamComponents(
                tokenizer, new KoreanPartOfSpeechStopFilter(tokenizer, stopTags));
          }
        };

    analyze(
        plainAnalyzer,
        Files.newBufferedReader(input, StandardCharsets.UTF_8),
        Files.newBufferedWriter(tokenizedOutput, StandardCharsets.UTF_8));

    analyze(
        analyzer,
        Files.newBufferedReader(input, StandardCharsets.UTF_8),
        Files.newBufferedWriter(normalizedOutput, StandardCharsets.UTF_8));
    plainAnalyzer.close();
  }

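  /**
   * Analyzes everything in {@code reader} with {@code analyzer} and writes one token per line to
   * {@code writer}, closing both when done.
   */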
  public void analyze(Analyzer analyzer, Reader reader, Writer writer) throws IOException {
    TokenStream stream = analyzer.tokenStream("dummy", reader);
    CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
    stream.reset();

    while (stream.incrementToken()) {
      writer.write(termAttr.toString());
      writer.write("\n");
    }

    // Complete the TokenStream contract: end() then close() before releasing the I/O resources
    stream.end();
    stream.close();
    reader.close();
    writer.close();
  }
}