| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.analysis.icu.segmentation; |
| |
| import org.apache.lucene.analysis.Analyzer; |
| import org.apache.lucene.analysis.BaseTokenStreamTestCase; |
| import org.apache.lucene.analysis.Tokenizer; |
| |
| /** Test tokenizing Myanmar text into syllables */ |
| public class TestMyanmarSyllable extends BaseTokenStreamTestCase { |
| |
| Analyzer a; |
| |
| @Override |
| public void setUp() throws Exception { |
| super.setUp(); |
| a = |
| new Analyzer() { |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName) { |
| Tokenizer tokenizer = |
| new ICUTokenizer( |
| newAttributeFactory(), new DefaultICUTokenizerConfig(false, false)); |
| return new TokenStreamComponents(tokenizer); |
| } |
| }; |
| } |
| |
| @Override |
| public void tearDown() throws Exception { |
| a.close(); |
| super.tearDown(); |
| } |
| |
| /** as opposed to dictionary break of သက်ဝင်|လှုပ်ရှား|စေ|ပြီး */ |
| public void testBasics() throws Exception { |
| assertAnalyzesTo( |
| a, "သက်ဝင်လှုပ်ရှားစေပြီး", new String[] {"သက်", "ဝင်", "လှုပ်", "ရှား", "စေ", "ပြီး"}); |
| } |
| |
| // simple tests from "A Rule-based Syllable Segmentation of Myanmar Text" |
| // * http://www.aclweb.org/anthology/I08-3010 |
| // (see also the presentation: |
| // http://gii2.nagaokaut.ac.jp/gii/media/share/20080901-ZMM%20Presentation.pdf) |
| // The words are fake, we just test the categories. |
| // note that currently our algorithm is not sophisticated enough to handle some of the special |
| // cases! |
| |
| /** constant */ |
| public void testC() throws Exception { |
| assertAnalyzesTo(a, "ကက", new String[] {"က", "က"}); |
| } |
| |
| /** consonant + sign */ |
| public void testCF() throws Exception { |
| assertAnalyzesTo(a, "ကံကံ", new String[] {"ကံ", "ကံ"}); |
| } |
| |
| /** consonant + consonant + asat */ |
| public void testCCA() throws Exception { |
| assertAnalyzesTo(a, "ကင်ကင်", new String[] {"ကင်", "ကင်"}); |
| } |
| |
| /** consonant + consonant + asat + sign */ |
| public void testCCAF() throws Exception { |
| assertAnalyzesTo(a, "ကင်းကင်း", new String[] {"ကင်း", "ကင်း"}); |
| } |
| |
| /** consonant + vowel */ |
| public void testCV() throws Exception { |
| assertAnalyzesTo(a, "ကာကာ", new String[] {"ကာ", "ကာ"}); |
| } |
| |
| /** consonant + vowel + sign */ |
| public void testCVF() throws Exception { |
| assertAnalyzesTo(a, "ကားကား", new String[] {"ကား", "ကား"}); |
| } |
| |
| /** consonant + vowel + vowel + asat */ |
| public void testCVVA() throws Exception { |
| assertAnalyzesTo(a, "ကော်ကော်", new String[] {"ကော်", "ကော်"}); |
| } |
| |
| /** consonant + vowel + vowel + consonant + asat */ |
| public void testCVVCA() throws Exception { |
| assertAnalyzesTo(a, "ကောင်ကောင်", new String[] {"ကောင်", "ကောင်"}); |
| } |
| |
| /** consonant + vowel + vowel + consonant + asat + sign */ |
| public void testCVVCAF() throws Exception { |
| assertAnalyzesTo(a, "ကောင်းကောင်း", new String[] {"ကောင်း", "ကောင်း"}); |
| } |
| |
| /** consonant + medial */ |
| public void testCM() throws Exception { |
| assertAnalyzesTo(a, "ကျကျ", new String[] {"ကျ", "ကျ"}); |
| } |
| |
| /** consonant + medial + sign */ |
| public void testCMF() throws Exception { |
| assertAnalyzesTo(a, "ကျံကျံ", new String[] {"ကျံ", "ကျံ"}); |
| } |
| |
| /** consonant + medial + consonant + asat */ |
| public void testCMCA() throws Exception { |
| assertAnalyzesTo(a, "ကျင်ကျင်", new String[] {"ကျင်", "ကျင်"}); |
| } |
| |
| /** consonant + medial + consonant + asat + sign */ |
| public void testCMCAF() throws Exception { |
| assertAnalyzesTo(a, "ကျင်းကျင်း", new String[] {"ကျင်း", "ကျင်း"}); |
| } |
| |
| /** consonant + medial + vowel */ |
| public void testCMV() throws Exception { |
| assertAnalyzesTo(a, "ကျာကျာ", new String[] {"ကျာ", "ကျာ"}); |
| } |
| |
| /** consonant + medial + vowel + sign */ |
| public void testCMVF() throws Exception { |
| assertAnalyzesTo(a, "ကျားကျား", new String[] {"ကျား", "ကျား"}); |
| } |
| |
| /** consonant + medial + vowel + vowel + asat */ |
| public void testCMVVA() throws Exception { |
| assertAnalyzesTo(a, "ကျော်ကျော်", new String[] {"ကျော်", "ကျော်"}); |
| } |
| |
| /** consonant + medial + vowel + vowel + consonant + asat */ |
| public void testCMVVCA() throws Exception { |
| assertAnalyzesTo(a, "ကြောင်ကြောင်", new String[] {"ကြောင်", "ကြောင်"}); |
| } |
| |
| /** consonant + medial + vowel + vowel + consonant + asat + sign */ |
| public void testCMVVCAF() throws Exception { |
| assertAnalyzesTo(a, "ကြောင်းကြောင်း", new String[] {"ကြောင်း", "ကြောင်း"}); |
| } |
| |
| /** independent vowel */ |
| public void testI() throws Exception { |
| assertAnalyzesTo(a, "ဪဪ", new String[] {"ဪ", "ဪ"}); |
| } |
| |
| /** independent vowel */ |
| public void testE() throws Exception { |
| assertAnalyzesTo(a, "ဣဣ", new String[] {"ဣ", "ဣ"}); |
| } |
| } |