// blob: e0c35a03d638e93957873d2fceae34349b23b275
package org.apache.lucene.analysis.kuromoji;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.util.List;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;
/**
 * Tests for {@link Segmenter}: checks the surface forms, readings, pronunciations,
 * base forms, inflection data and part-of-speech tags of the tokens it returns for
 * a handful of fixed Japanese sentences, and runs it over the {@code bocchan.utf-8}
 * test resource.
 */
public class SegmenterTest extends LuceneTestCase {

  /** Segmenter shared by every test; created once per class and released afterwards. */
  private static Segmenter segmenter;

  /** Creates the shared segmenter before any test runs. */
  @BeforeClass
  public static void setUpBeforeClass() throws Exception {
    segmenter = new Segmenter();
  }

  /** Drops the reference to the shared segmenter once all tests have run. */
  @AfterClass
  public static void afterClass() throws Exception {
    segmenter = null;
  }

  /** Checks that a sample text is split into the expected sequence of surface forms. */
  @Test
  public void testSegmentation() {
    // Skip tests for Michelle Kwan -- UniDic segments Kwan as ク ワン
    // String input = "ミシェル・クワンが優勝しました。スペースステーションに行きます。うたがわしい。";
    // String[] surfaceForms = {
    //   "ミシェル", "・", "クワン", "が", "優勝", "し", "まし", "た", "。",
    //   "スペース", "ステーション", "に", "行き", "ます", "。",
    //   "うたがわしい", "。"
    // };
    String input = "スペースステーションに行きます。うたがわしい。";
    String[] surfaceForms = {
      "スペース", "ステーション", "に", "行き", "ます", "。",
      "うたがわしい", "。"
    };
    List<Token> tokens = segmenter.tokenize(input);
    // assertEquals (rather than assertTrue on ==) reports both counts on failure.
    assertEquals(surfaceForms.length, tokens.size());
    for (int i = 0; i < surfaceForms.length; i++) {
      assertEquals(surfaceForms[i], tokens.get(i).getSurfaceFormString());
    }
  }

  /** Checks the reading of every token of a short sentence. */
  @Test
  public void testReadings() {
    List<Token> tokens = segmenter.tokenize("寿司が食べたいです。");
    assertEquals(6, tokens.size());
    assertEquals("スシ", tokens.get(0).getReading());
    assertEquals("ガ", tokens.get(1).getReading());
    assertEquals("タベ", tokens.get(2).getReading());
    assertEquals("タイ", tokens.get(3).getReading());
    assertEquals("デス", tokens.get(4).getReading());
    assertEquals("。", tokens.get(5).getReading());
  }

  /** Checks the reading of every token of a second, longer sentence. */
  @Test
  public void testReadings2() {
    List<Token> tokens = segmenter.tokenize("多くの学生が試験に落ちた。");
    assertEquals(9, tokens.size());
    assertEquals("オオク", tokens.get(0).getReading());
    assertEquals("ノ", tokens.get(1).getReading());
    assertEquals("ガクセイ", tokens.get(2).getReading());
    assertEquals("ガ", tokens.get(3).getReading());
    assertEquals("シケン", tokens.get(4).getReading());
    assertEquals("ニ", tokens.get(5).getReading());
    assertEquals("オチ", tokens.get(6).getReading());
    assertEquals("タ", tokens.get(7).getReading());
    assertEquals("。", tokens.get(8).getReading());
  }

  /** Checks the pronunciation of every token of a short sentence. */
  @Test
  public void testPronunciations() {
    List<Token> tokens = segmenter.tokenize("寿司が食べたいです。");
    assertEquals(6, tokens.size());
    assertEquals("スシ", tokens.get(0).getPronunciation());
    assertEquals("ガ", tokens.get(1).getPronunciation());
    assertEquals("タベ", tokens.get(2).getPronunciation());
    assertEquals("タイ", tokens.get(3).getPronunciation());
    assertEquals("デス", tokens.get(4).getPronunciation());
    assertEquals("。", tokens.get(5).getPronunciation());
  }

  /**
   * Checks the pronunciation of every token of the sentence used by
   * {@link #testReadings2()}; the first token's pronunciation is not the same as
   * its reading.
   */
  @Test
  public void testPronunciations2() {
    List<Token> tokens = segmenter.tokenize("多くの学生が試験に落ちた。");
    assertEquals(9, tokens.size());
    // pronunciation differs from reading here
    assertEquals("オーク", tokens.get(0).getPronunciation());
    assertEquals("ノ", tokens.get(1).getPronunciation());
    assertEquals("ガクセイ", tokens.get(2).getPronunciation());
    assertEquals("ガ", tokens.get(3).getPronunciation());
    assertEquals("シケン", tokens.get(4).getPronunciation());
    assertEquals("ニ", tokens.get(5).getPronunciation());
    assertEquals("オチ", tokens.get(6).getPronunciation());
    assertEquals("タ", tokens.get(7).getPronunciation());
    assertEquals("。", tokens.get(8).getPronunciation());
  }

  /**
   * Checks base forms: only the token at index 6 is expected to carry one; every
   * other token is expected to return {@code null}.
   */
  @Test
  public void testBasicForms() {
    List<Token> tokens = segmenter.tokenize("それはまだ実験段階にあります。");
    assertEquals(9, tokens.size());
    assertNull(tokens.get(0).getBaseForm());
    assertNull(tokens.get(1).getBaseForm());
    assertNull(tokens.get(2).getBaseForm());
    assertNull(tokens.get(3).getBaseForm());
    assertNull(tokens.get(4).getBaseForm());
    assertNull(tokens.get(5).getBaseForm());
    // Expected value first, so a failure message labels the two values correctly.
    assertEquals("ある", tokens.get(6).getBaseForm());
    assertNull(tokens.get(7).getBaseForm());
    assertNull(tokens.get(8).getBaseForm());
  }

  /**
   * Checks inflection types: the tokens at indexes 6 and 7 are expected to carry
   * one; every other token is expected to return {@code null}.
   */
  @Test
  public void testInflectionTypes() {
    List<Token> tokens = segmenter.tokenize("それはまだ実験段階にあります。");
    assertEquals(9, tokens.size());
    assertNull(tokens.get(0).getInflectionType());
    assertNull(tokens.get(1).getInflectionType());
    assertNull(tokens.get(2).getInflectionType());
    assertNull(tokens.get(3).getInflectionType());
    assertNull(tokens.get(4).getInflectionType());
    assertNull(tokens.get(5).getInflectionType());
    assertEquals("五段・ラ行", tokens.get(6).getInflectionType());
    assertEquals("特殊・マス", tokens.get(7).getInflectionType());
    assertNull(tokens.get(8).getInflectionType());
  }

  /**
   * Checks inflection forms: the tokens at indexes 6 and 7 are expected to carry
   * one; every other token is expected to return {@code null}.
   */
  @Test
  public void testInflectionForms() {
    List<Token> tokens = segmenter.tokenize("それはまだ実験段階にあります。");
    assertEquals(9, tokens.size());
    assertNull(tokens.get(0).getInflectionForm());
    assertNull(tokens.get(1).getInflectionForm());
    assertNull(tokens.get(2).getInflectionForm());
    assertNull(tokens.get(3).getInflectionForm());
    assertNull(tokens.get(4).getInflectionForm());
    assertNull(tokens.get(5).getInflectionForm());
    assertEquals("連用形", tokens.get(6).getInflectionForm());
    assertEquals("基本形", tokens.get(7).getInflectionForm());
    assertNull(tokens.get(8).getInflectionForm());
  }

  /** Checks the part-of-speech tag of every token of a sample sentence. */
  @Test
  public void testPartOfSpeech() {
    List<Token> tokens = segmenter.tokenize("それはまだ実験段階にあります。");
    assertEquals(9, tokens.size());
    assertEquals("名詞-代名詞-一般", tokens.get(0).getPartOfSpeech());
    assertEquals("助詞-係助詞", tokens.get(1).getPartOfSpeech());
    assertEquals("副詞-助詞類接続", tokens.get(2).getPartOfSpeech());
    assertEquals("名詞-サ変接続", tokens.get(3).getPartOfSpeech());
    assertEquals("名詞-一般", tokens.get(4).getPartOfSpeech());
    assertEquals("助詞-格助詞-一般", tokens.get(5).getPartOfSpeech());
    assertEquals("動詞-自立", tokens.get(6).getPartOfSpeech());
    assertEquals("助動詞", tokens.get(7).getPartOfSpeech());
    assertEquals("記号-句点", tokens.get(8).getPartOfSpeech());
  }

  // TODO: the next 2 tests no longer use the first/last word ids; maybe look up the words and fix?
  // Is there a way to actually look up the first and last word from the dictionary?
  // NOTE(review): the next three methods carry no @Test annotation; presumably the
  // LuceneTestCase runner picks them up through the test* naming convention -- confirm.

  /** Checks that "やぼったい" comes back as a single token with an unchanged surface form. */
  public void testYabottai() {
    List<Token> tokens = segmenter.tokenize("やぼったい");
    assertEquals(1, tokens.size());
    assertEquals("やぼったい", tokens.get(0).getSurfaceFormString());
  }

  /** Checks that "突き通しゃ" comes back as a single token with an unchanged surface form. */
  public void testTsukitosha() {
    List<Token> tokens = segmenter.tokenize("突き通しゃ");
    assertEquals(1, tokens.size());
    assertEquals("突き通しゃ", tokens.get(0).getSurfaceFormString());
  }

  /** Runs the Bocchan tokenization pass once. */
  public void testBocchan() throws Exception {
    doTestBocchan(1);
  }

  /** Runs the Bocchan tokenization pass 100 times; marked {@code @Nightly}. */
  @Test @Nightly
  public void testBocchanBig() throws Exception {
    doTestBocchan(100);
  }

  /**
   * Tokenizes the first line of the {@code bocchan.utf-8} test resource, first as
   * one string and then split at "、" and "。", repeating each pass
   * {@code numIterations} times. No assertions are made on the tokens; elapsed
   * times are printed when {@code VERBOSE} is set.
   *
   * @param numIterations how many times each of the two passes is repeated
   * @throws Exception if the resource cannot be read
   */
  private void doTestBocchan(int numIterations) throws Exception {
    InputStream stream = this.getClass().getResourceAsStream("bocchan.utf-8");
    assertNotNull("test resource bocchan.utf-8 not found", stream);

    // Decode explicitly as UTF-8: the platform default charset may differ and
    // would garble the Japanese text.
    LineNumberReader reader = new LineNumberReader(new InputStreamReader(stream, "UTF-8"));
    String line;
    try {
      line = reader.readLine();
    } finally {
      // Close even when readLine() fails so the stream is not leaked.
      reader.close();
    }
    assertNotNull("test resource bocchan.utf-8 is empty", line);

    if (VERBOSE) {
      System.out.println("Test for Bocchan without pre-splitting sentences");
    }
    long totalStart = System.currentTimeMillis();
    for (int i = 0; i < numIterations; i++) {
      segmenter.tokenize(line);
    }
    if (VERBOSE) {
      System.out.println("Total time : " + (System.currentTimeMillis() - totalStart));
      System.out.println("Test for Bocchan with pre-splitting sentences");
    }

    String[] sentences = line.split("、|。");
    totalStart = System.currentTimeMillis();
    for (int i = 0; i < numIterations; i++) {
      for (String sentence : sentences) {
        segmenter.tokenize(sentence);
      }
    }
    if (VERBOSE) {
      System.out.println("Total time : " + (System.currentTimeMillis() - totalStart));
    }
  }
}