blob: 33bfa017681361b0d581cce20147c580e10f2011 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.icu.segmentation;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
/**
 * Tests that {@link ICUTokenizer} splits Myanmar text into syllables.
 *
 * <p>Every test runs the same analyzer: an {@code ICUTokenizer} built with {@code new
 * DefaultICUTokenizerConfig(false, false)} (per the Lucene API the flags are {@code cjkAsWords}
 * and {@code myanmarAsWords}), so Myanmar text is broken per syllable rather than per dictionary
 * word.
 */
public class TestMyanmarSyllable extends BaseTokenStreamTestCase {
  /** Analyzer under test; created in {@link #setUp()} and closed in {@link #tearDown()}. */
  Analyzer a;

  /** Builds a fresh analyzer wrapping a syllable-mode {@code ICUTokenizer} before each test. */
  @Override
  public void setUp() throws Exception {
    super.setUp();
    a =
        new Analyzer() {
          @Override
          protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer =
                new ICUTokenizer(
                    newAttributeFactory(), new DefaultICUTokenizerConfig(false, false));
            return new TokenStreamComponents(tokenizer);
          }
        };
  }

  /**
   * Closes the analyzer and then runs the superclass teardown.
   *
   * <p>The superclass teardown is placed in a {@code finally} block so it still runs when closing
   * the analyzer throws. The null check covers the case where {@link #setUp()} failed before the
   * analyzer was assigned, so the original failure is not joined by a NullPointerException.
   */
  @Override
  public void tearDown() throws Exception {
    try {
      if (a != null) {
        a.close();
      }
    } finally {
      super.tearDown();
    }
  }

  /** as opposed to dictionary break of သက်ဝင်|လှုပ်ရှား|စေ|ပြီး */
  public void testBasics() throws Exception {
    assertAnalyzesTo(
        a, "သက်ဝင်လှုပ်ရှားစေပြီး", new String[] {"သက်", "ဝင်", "လှုပ်", "ရှား", "စေ", "ပြီး"});
  }

  // Simple tests from "A Rule-based Syllable Segmentation of Myanmar Text":
  //   http://www.aclweb.org/anthology/I08-3010
  // (see also the presentation:
  //   http://gii2.nagaokaut.ac.jp/gii/media/share/20080901-ZMM%20Presentation.pdf)
  //
  // The words are fake; we just test the categories. Each input is the same syllable written
  // twice, and the expected output is that syllable as two separate tokens.
  //
  // Note that currently our algorithm is not sophisticated enough to handle some of the special
  // cases!

  /** consonant */
  public void testC() throws Exception {
    assertAnalyzesTo(a, "ကက", new String[] {"က", "က"});
  }

  /** consonant + sign */
  public void testCF() throws Exception {
    assertAnalyzesTo(a, "ကံကံ", new String[] {"ကံ", "ကံ"});
  }

  /** consonant + consonant + asat */
  public void testCCA() throws Exception {
    assertAnalyzesTo(a, "ကင်ကင်", new String[] {"ကင်", "ကင်"});
  }

  /** consonant + consonant + asat + sign */
  public void testCCAF() throws Exception {
    assertAnalyzesTo(a, "ကင်းကင်း", new String[] {"ကင်း", "ကင်း"});
  }

  /** consonant + vowel */
  public void testCV() throws Exception {
    assertAnalyzesTo(a, "ကာကာ", new String[] {"ကာ", "ကာ"});
  }

  /** consonant + vowel + sign */
  public void testCVF() throws Exception {
    assertAnalyzesTo(a, "ကားကား", new String[] {"ကား", "ကား"});
  }

  /** consonant + vowel + vowel + asat */
  public void testCVVA() throws Exception {
    assertAnalyzesTo(a, "ကော်ကော်", new String[] {"ကော်", "ကော်"});
  }

  /** consonant + vowel + vowel + consonant + asat */
  public void testCVVCA() throws Exception {
    assertAnalyzesTo(a, "ကောင်ကောင်", new String[] {"ကောင်", "ကောင်"});
  }

  /** consonant + vowel + vowel + consonant + asat + sign */
  public void testCVVCAF() throws Exception {
    assertAnalyzesTo(a, "ကောင်းကောင်း", new String[] {"ကောင်း", "ကောင်း"});
  }

  /** consonant + medial */
  public void testCM() throws Exception {
    assertAnalyzesTo(a, "ကျကျ", new String[] {"ကျ", "ကျ"});
  }

  /** consonant + medial + sign */
  public void testCMF() throws Exception {
    assertAnalyzesTo(a, "ကျံကျံ", new String[] {"ကျံ", "ကျံ"});
  }

  /** consonant + medial + consonant + asat */
  public void testCMCA() throws Exception {
    assertAnalyzesTo(a, "ကျင်ကျင်", new String[] {"ကျင်", "ကျင်"});
  }

  /** consonant + medial + consonant + asat + sign */
  public void testCMCAF() throws Exception {
    assertAnalyzesTo(a, "ကျင်းကျင်း", new String[] {"ကျင်း", "ကျင်း"});
  }

  /** consonant + medial + vowel */
  public void testCMV() throws Exception {
    assertAnalyzesTo(a, "ကျာကျာ", new String[] {"ကျာ", "ကျာ"});
  }

  /** consonant + medial + vowel + sign */
  public void testCMVF() throws Exception {
    assertAnalyzesTo(a, "ကျားကျား", new String[] {"ကျား", "ကျား"});
  }

  /** consonant + medial + vowel + vowel + asat */
  public void testCMVVA() throws Exception {
    assertAnalyzesTo(a, "ကျော်ကျော်", new String[] {"ကျော်", "ကျော်"});
  }

  /** consonant + medial + vowel + vowel + consonant + asat */
  public void testCMVVCA() throws Exception {
    assertAnalyzesTo(a, "ကြောင်ကြောင်", new String[] {"ကြောင်", "ကြောင်"});
  }

  /** consonant + medial + vowel + vowel + consonant + asat + sign */
  public void testCMVVCAF() throws Exception {
    assertAnalyzesTo(a, "ကြောင်းကြောင်း", new String[] {"ကြောင်း", "ကြောင်း"});
  }

  /** independent vowel */
  public void testI() throws Exception {
    assertAnalyzesTo(a, "ဪဪ", new String[] {"ဪ", "ဪ"});
  }

  /** independent vowel */
  public void testE() throws Exception {
    assertAnalyzesTo(a, "ဣဣ", new String[] {"ဣ", "ဣ"});
  }
}