blob: a3b608ed2709106a698f3351bed204606aa812d9 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.icu.segmentation;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
/**
 * Tests that {@link ICUTokenizer} splits Myanmar text into syllables.
 *
 * <p>Every test feeds a short string through the analyzer built in {@link #setUp()} and checks
 * the exact sequence of emitted terms.
 */
public class TestMyanmarSyllable extends BaseTokenStreamTestCase {

  /** Analyzer under test; created in {@link #setUp()} and closed in {@link #tearDown()}. */
  Analyzer a;

  /** Builds a fresh analyzer wrapping an {@link ICUTokenizer} before each test. */
  @Override
  public void setUp() throws Exception {
    super.setUp();
    a =
        new Analyzer() {
          @Override
          protected TokenStreamComponents createComponents(String fieldName) {
            // Both flags are off (cjkAsWords, myanmarAsWords per the Lucene API), so Myanmar
            // text is not passed through dictionary-based word segmentation.
            DefaultICUTokenizerConfig config = new DefaultICUTokenizerConfig(false, false);
            Tokenizer tokenizer = new ICUTokenizer(newAttributeFactory(), config);
            return new TokenStreamComponents(tokenizer);
          }
        };
  }

  /** Closes the analyzer, then lets the base class clean up. */
  @Override
  public void tearDown() throws Exception {
    a.close();
    super.tearDown();
  }

  /**
   * Asserts that analyzing {@code text} yields exactly the given terms, in order.
   *
   * @param text input to analyze
   * @param syllables expected terms
   * @throws Exception if analysis fails
   */
  private void assertSyllables(String text, String... syllables) throws Exception {
    assertAnalyzesTo(a, text, syllables);
  }

  /** as opposed to dictionary break of သက်ဝင်|လှုပ်ရှား|စေ|ပြီး */
  public void testBasics() throws Exception {
    assertSyllables("သက်ဝင်လှုပ်ရှားစေပြီး", "သက်", "ဝင်", "လှုပ်", "ရှား", "စေ", "ပြီး");
  }

  // Simple tests from "A Rule-based Syllable Segmentation of Myanmar Text":
  //   http://www.aclweb.org/anthology/I08-3010
  // (see also the presentation:
  //   http://gii2.nagaokaut.ac.jp/gii/media/share/20080901-ZMM%20Presentation.pdf)
  // The words are fake; only the syllable categories are being tested.
  // Note that the current algorithm is not sophisticated enough to handle some of the special
  // cases.

  /** consonant */
  public void testC() throws Exception {
    assertSyllables("ကက", "က", "က");
  }

  /** consonant + sign */
  public void testCF() throws Exception {
    assertSyllables("ကံကံ", "ကံ", "ကံ");
  }

  /** consonant + consonant + asat */
  public void testCCA() throws Exception {
    assertSyllables("ကင်ကင်", "ကင်", "ကင်");
  }

  /** consonant + consonant + asat + sign */
  public void testCCAF() throws Exception {
    assertSyllables("ကင်းကင်း", "ကင်း", "ကင်း");
  }

  /** consonant + vowel */
  public void testCV() throws Exception {
    assertSyllables("ကာကာ", "ကာ", "ကာ");
  }

  /** consonant + vowel + sign */
  public void testCVF() throws Exception {
    assertSyllables("ကားကား", "ကား", "ကား");
  }

  /** consonant + vowel + vowel + asat */
  public void testCVVA() throws Exception {
    assertSyllables("ကော်ကော်", "ကော်", "ကော်");
  }

  /** consonant + vowel + vowel + consonant + asat */
  public void testCVVCA() throws Exception {
    assertSyllables("ကောင်ကောင်", "ကောင်", "ကောင်");
  }

  /** consonant + vowel + vowel + consonant + asat + sign */
  public void testCVVCAF() throws Exception {
    assertSyllables("ကောင်းကောင်း", "ကောင်း", "ကောင်း");
  }

  /** consonant + medial */
  public void testCM() throws Exception {
    assertSyllables("ကျကျ", "ကျ", "ကျ");
  }

  /** consonant + medial + sign */
  public void testCMF() throws Exception {
    assertSyllables("ကျံကျံ", "ကျံ", "ကျံ");
  }

  /** consonant + medial + consonant + asat */
  public void testCMCA() throws Exception {
    assertSyllables("ကျင်ကျင်", "ကျင်", "ကျင်");
  }

  /** consonant + medial + consonant + asat + sign */
  public void testCMCAF() throws Exception {
    assertSyllables("ကျင်းကျင်း", "ကျင်း", "ကျင်း");
  }

  /** consonant + medial + vowel */
  public void testCMV() throws Exception {
    assertSyllables("ကျာကျာ", "ကျာ", "ကျာ");
  }

  /** consonant + medial + vowel + sign */
  public void testCMVF() throws Exception {
    assertSyllables("ကျားကျား", "ကျား", "ကျား");
  }

  /** consonant + medial + vowel + vowel + asat */
  public void testCMVVA() throws Exception {
    assertSyllables("ကျော်ကျော်", "ကျော်", "ကျော်");
  }

  /** consonant + medial + vowel + vowel + consonant + asat */
  public void testCMVVCA() throws Exception {
    assertSyllables("ကြောင်ကြောင်", "ကြောင်", "ကြောင်");
  }

  /** consonant + medial + vowel + vowel + consonant + asat + sign */
  public void testCMVVCAF() throws Exception {
    assertSyllables("ကြောင်းကြောင်း", "ကြောင်း", "ကြောင်း");
  }

  /** independent vowel */
  public void testI() throws Exception {
    assertSyllables("ဪဪ", "ဪ", "ဪ");
  }

  /** independent vowel */
  public void testE() throws Exception {
    assertSyllables("ဣဣ", "ဣ", "ဣ");
  }
}