| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.analysis.ko; |
| |
| import java.io.IOException; |
| import java.util.Arrays; |
| import java.util.HashSet; |
| import java.util.Random; |
| import java.util.Set; |
| |
| import org.apache.lucene.analysis.Analyzer; |
| import org.apache.lucene.analysis.BaseTokenStreamTestCase; |
| |
| /** |
| * Test Korean morphological analyzer |
| */ |
| public class TestKoreanAnalyzer extends BaseTokenStreamTestCase { |
| public void testSentence() throws IOException { |
| Analyzer a = new KoreanAnalyzer(); |
| assertAnalyzesTo(a, "한국은 대단한 나라입니다.", |
| new String[]{"한국", "대단", "나라", "이"}, |
| new int[]{ 0, 4, 8, 10 }, |
| new int[]{ 2, 6, 10, 13 }, |
| new int[]{ 1, 2, 3, 1 } |
| ); |
| a.close(); |
| } |
| |
| public void testStopTags() throws IOException { |
| Set<POS.Tag> stopTags = new HashSet<>(Arrays.asList(POS.Tag.NNP, POS.Tag.NNG)); |
| Analyzer a = new KoreanAnalyzer(null, KoreanTokenizer.DecompoundMode.DISCARD, stopTags, false); |
| assertAnalyzesTo(a, "한국은 대단한 나라입니다.", |
| new String[]{"은", "대단", "하", "ᆫ", "이", "ᄇ니다"}, |
| new int[]{ 2, 4, 6, 6, 10, 10 }, |
| new int[]{ 3, 6, 7, 7, 13, 13 }, |
| new int[]{ 2, 1, 1, 1, 2, 1 } |
| ); |
| a.close(); |
| } |
| |
| public void testUnknownWord() throws IOException { |
| Analyzer a = new KoreanAnalyzer(null, KoreanTokenizer.DecompoundMode.DISCARD, |
| KoreanPartOfSpeechStopFilter.DEFAULT_STOP_TAGS, true); |
| |
| assertAnalyzesTo(a,"2018 평창 동계올림픽대회", |
| new String[]{"2", "0", "1", "8", "평창", "동계", "올림픽", "대회"}, |
| new int[]{0, 1, 2, 3, 5, 8, 10, 13}, |
| new int[]{1, 2, 3, 4, 7, 10, 13, 15}, |
| new int[]{1, 1, 1, 1, 1, 1, 1, 1}); |
| a.close(); |
| |
| a = new KoreanAnalyzer(null, KoreanTokenizer.DecompoundMode.DISCARD, |
| KoreanPartOfSpeechStopFilter.DEFAULT_STOP_TAGS, false); |
| |
| assertAnalyzesTo(a,"2018 평창 동계올림픽대회", |
| new String[]{"2018", "평창", "동계", "올림픽", "대회"}, |
| new int[]{0, 5, 8, 10, 13}, |
| new int[]{4, 7, 10, 13, 15}, |
| new int[]{1, 1, 1, 1, 1}); |
| a.close(); |
| } |
| |
| /** |
| * blast random strings against the analyzer |
| */ |
| public void testRandom() throws IOException { |
| Random random = random(); |
| final Analyzer a = new KoreanAnalyzer(); |
| checkRandomData(random, a, atLeast(200)); |
| a.close(); |
| } |
| |
| /** |
| * blast some random large strings through the analyzer |
| */ |
| public void testRandomHugeStrings() throws Exception { |
| Random random = random(); |
| final Analyzer a = new KoreanAnalyzer(); |
| checkRandomData(random, a, RANDOM_MULTIPLIER, 4096); |
| a.close(); |
| } |
| |
| @Nightly |
| public void testRandomHugeStringsAtNight() throws Exception { |
| Random random = random(); |
| final Analyzer a = new KoreanAnalyzer(); |
| checkRandomData(random, a, 3 * RANDOM_MULTIPLIER, 8192); |
| a.close(); |
| } |
| |
| // Copied from TestKoreanTokenizer, to make sure passing |
| // user dict to analyzer works: |
| public void testUserDict() throws IOException { |
| final Analyzer analyzer = new KoreanAnalyzer(TestKoreanTokenizer.readDict(), |
| KoreanTokenizer.DEFAULT_DECOMPOUND, KoreanPartOfSpeechStopFilter.DEFAULT_STOP_TAGS, false); |
| assertAnalyzesTo(analyzer, "c++ 프로그래밍 언어", |
| new String[]{"c++", "프로그래밍", "언어"}, |
| new int[]{0, 4, 10}, |
| new int[]{3, 9, 12}, |
| new int[]{1, 1, 1} |
| ); |
| } |
| } |