| diff --git a/lucene/analysis/icu/src/data/uax29/MyanmarSyllable.rbbi b/lucene/analysis/icu/src/data/uax29/MyanmarSyllable.rbbi |
| new file mode 100644 |
| index 0000000..1840803 |
| --- /dev/null |
| +++ b/lucene/analysis/icu/src/data/uax29/MyanmarSyllable.rbbi |
| @@ -0,0 +1,50 @@ |
| +# |
| +# Licensed to the Apache Software Foundation (ASF) under one or more |
| +# contributor license agreements. See the NOTICE file distributed with |
| +# this work for additional information regarding copyright ownership. |
| +# The ASF licenses this file to You under the Apache License, Version 2.0 |
| +# (the "License"); you may not use this file except in compliance with |
| +# the License. You may obtain a copy of the License at |
| +# |
| +# http://www.apache.org/licenses/LICENSE-2.0 |
| +# |
| +# Unless required by applicable law or agreed to in writing, software |
| +# distributed under the License is distributed on an "AS IS" BASIS, |
| +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| +# See the License for the specific language governing permissions and |
| +# limitations under the License. |
| +# |
| +# |
| +# Parses Myanmar text, with syllable as token. |
| +# |
| + |
| +$Cons = [[:Other_Letter:]&[:Myanmar:]]; |
| +$Virama = [\u1039]; |
| +$Asat = [\u103A]; |
| + |
| +$WordJoin = [:Line_Break=Word_Joiner:]; |
| + |
| +# |
| +# default numerical definitions |
| +# |
| +$Extend = [\p{Word_Break = Extend}]; |
| +$Format = [\p{Word_Break = Format}]; |
| +$MidNumLet = [\p{Word_Break = MidNumLet}]; |
| +$MidNum = [\p{Word_Break = MidNum}]; |
| +$Numeric = [\p{Word_Break = Numeric}]; |
| +$ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; |
| +$MidNumLetEx = $MidNumLet ($Extend | $Format)*; |
| +$MidNumEx = $MidNum ($Extend | $Format)*; |
| +$NumericEx = $Numeric ($Extend | $Format)*; |
| +$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*; |
| + |
| +$ConsEx = $Cons ($Extend | $Format)*; |
| +$AsatEx = $Cons $Asat ($Virama $ConsEx)? ($Extend | $Format)*; |
| +$MyanmarSyllableEx = $ConsEx ($Virama $ConsEx)? ($AsatEx)*; |
| +$MyanmarJoinedSyllableEx = $MyanmarSyllableEx ($WordJoin $MyanmarSyllableEx)*; |
| + |
| +!!forward; |
| +$MyanmarJoinedSyllableEx {200}; |
| + |
| +# default numeric rules |
| +$NumericEx $ExtendNumLetEx? (($MidNumEx | $MidNumLetEx)? $NumericEx $ExtendNumLetEx?)* {100}; |
| diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java |
| index b33663d..3cd62c8 100644 |
| --- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java |
| +++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java |
| @@ -63,9 +63,12 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig { |
| // the same as ROOT, except no dictionary segmentation for cjk |
| private static final BreakIterator defaultBreakIterator = |
| readBreakIterator("Default.brk"); |
| + private static final BreakIterator myanmarSyllableIterator = |
| + readBreakIterator("MyanmarSyllable.brk"); |
| |
| // TODO: deprecate this boolean? you only care if you are doing super-expert stuff... |
| private final boolean cjkAsWords; |
| + private final boolean myanmarAsWords; |
| |
| /** |
| * Creates a new config. This object is lightweight, but the first |
| @@ -74,9 +77,12 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig { |
| * otherwise text will be segmented according to UAX#29 defaults. |
| * If this is true, all Han+Hiragana+Katakana words will be tagged as |
| * IDEOGRAPHIC. |
| + * @param myanmarAsWords true if Myanmar text should undergo dictionary-based segmentation, |
| + * otherwise it will be tokenized as syllables. |
| */ |
| - public DefaultICUTokenizerConfig(boolean cjkAsWords) { |
| + public DefaultICUTokenizerConfig(boolean cjkAsWords, boolean myanmarAsWords) { |
| this.cjkAsWords = cjkAsWords; |
| + this.myanmarAsWords = myanmarAsWords; |
| } |
| |
| @Override |
| @@ -88,6 +94,12 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig { |
| public BreakIterator getBreakIterator(int script) { |
| switch(script) { |
| case UScript.JAPANESE: return (BreakIterator)cjkBreakIterator.clone(); |
| + case UScript.MYANMAR: |
| + if (myanmarAsWords) { |
| + return (BreakIterator)defaultBreakIterator.clone(); |
| + } else { |
| + return (BreakIterator)myanmarSyllableIterator.clone(); |
| + } |
| default: return (BreakIterator)defaultBreakIterator.clone(); |
| } |
| } |
| diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java |
| index 64c6785..0941551 100644 |
| --- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java |
| +++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java |
| @@ -68,7 +68,7 @@ public final class ICUTokenizer extends Tokenizer { |
| * @see DefaultICUTokenizerConfig |
| */ |
| public ICUTokenizer() { |
| - this(new DefaultICUTokenizerConfig(true)); |
| + this(new DefaultICUTokenizerConfig(true, true)); |
| } |
| |
| /** |
| diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java |
| index deb5d4f..974e719 100644 |
| --- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java |
| +++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java |
| @@ -79,6 +79,7 @@ public class ICUTokenizerFactory extends TokenizerFactory implements ResourceLoa |
| private final Map<Integer,String> tailored; |
| private ICUTokenizerConfig config; |
| private final boolean cjkAsWords; |
| + private final boolean myanmarAsWords; |
| |
| /** Creates a new ICUTokenizerFactory */ |
| public ICUTokenizerFactory(Map<String,String> args) { |
| @@ -95,6 +96,7 @@ public class ICUTokenizerFactory extends TokenizerFactory implements ResourceLoa |
| } |
| } |
| cjkAsWords = getBoolean(args, "cjkAsWords", true); |
| + myanmarAsWords = getBoolean(args, "myanmarAsWords", true); |
| if (!args.isEmpty()) { |
| throw new IllegalArgumentException("Unknown parameters: " + args); |
| } |
| @@ -104,7 +106,7 @@ public class ICUTokenizerFactory extends TokenizerFactory implements ResourceLoa |
| public void inform(ResourceLoader loader) throws IOException { |
| assert tailored != null : "init must be called first!"; |
| if (tailored.isEmpty()) { |
| - config = new DefaultICUTokenizerConfig(cjkAsWords); |
| + config = new DefaultICUTokenizerConfig(cjkAsWords, myanmarAsWords); |
| } else { |
| final BreakIterator breakers[] = new BreakIterator[UScript.CODE_LIMIT]; |
| for (Map.Entry<Integer,String> entry : tailored.entrySet()) { |
| @@ -112,7 +114,7 @@ public class ICUTokenizerFactory extends TokenizerFactory implements ResourceLoa |
| String resourcePath = entry.getValue(); |
| breakers[code] = parseRules(resourcePath, loader); |
| } |
| - config = new DefaultICUTokenizerConfig(cjkAsWords) { |
| + config = new DefaultICUTokenizerConfig(cjkAsWords, myanmarAsWords) { |
| |
| @Override |
| public BreakIterator getBreakIterator(int script) { |
| diff --git a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/MyanmarSyllable.brk b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/MyanmarSyllable.brk |
| new file mode 100644 |
| index 0000000..41b977b |
| Binary files /dev/null and b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/MyanmarSyllable.brk differ |
| diff --git a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java |
| index 6398b2c..027baa3 100644 |
| --- a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java |
| +++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java |
| @@ -42,7 +42,7 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase { |
| sb.append(whitespace); |
| sb.append("testing 1234"); |
| String input = sb.toString(); |
| - ICUTokenizer tokenizer = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false)); |
| + ICUTokenizer tokenizer = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false, true)); |
| tokenizer.setReader(new StringReader(input)); |
| assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" }); |
| } |
| @@ -53,7 +53,7 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase { |
| sb.append('a'); |
| } |
| String input = sb.toString(); |
| - ICUTokenizer tokenizer = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false)); |
| + ICUTokenizer tokenizer = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false, true)); |
| tokenizer.setReader(new StringReader(input)); |
| char token[] = new char[4096]; |
| Arrays.fill(token, 'a'); |
| @@ -75,7 +75,7 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase { |
| a = new Analyzer() { |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName) { |
| - Tokenizer tokenizer = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false)); |
| + Tokenizer tokenizer = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false, true)); |
| TokenFilter filter = new ICUNormalizer2Filter(tokenizer); |
| return new TokenStreamComponents(tokenizer, filter); |
| } |
| diff --git a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerCJK.java b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerCJK.java |
| index a29686c..96f44d6 100644 |
| --- a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerCJK.java |
| +++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerCJK.java |
| @@ -34,7 +34,7 @@ public class TestICUTokenizerCJK extends BaseTokenStreamTestCase { |
| a = new Analyzer() { |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName) { |
| - return new TokenStreamComponents(new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(true))); |
| + return new TokenStreamComponents(new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(true, true))); |
| } |
| }; |
| } |
| diff --git a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestMyanmarSyllable.java b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestMyanmarSyllable.java |
| new file mode 100644 |
| index 0000000..a3b608e |
| --- /dev/null |
| +++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestMyanmarSyllable.java |
| @@ -0,0 +1,156 @@ |
| +/* |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| +package org.apache.lucene.analysis.icu.segmentation; |
| + |
| +import org.apache.lucene.analysis.Analyzer; |
| +import org.apache.lucene.analysis.BaseTokenStreamTestCase; |
| +import org.apache.lucene.analysis.Tokenizer; |
| + |
| +/** Test tokenizing Myanmar text into syllables */ |
| +public class TestMyanmarSyllable extends BaseTokenStreamTestCase { |
| + |
| + Analyzer a; |
| + |
| + @Override |
| + public void setUp() throws Exception { |
| + super.setUp(); |
| + a = new Analyzer() { |
| + @Override |
| + protected TokenStreamComponents createComponents(String fieldName) { |
| + Tokenizer tokenizer = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false, false)); |
| + return new TokenStreamComponents(tokenizer); |
| + } |
| + }; |
| + } |
| + |
| + @Override |
| + public void tearDown() throws Exception { |
| + a.close(); |
| + super.tearDown(); |
| + } |
| + |
| + /** as opposed to dictionary break of သက်ဝင်|လှုပ်ရှား|စေ|ပြီး */ |
| + public void testBasics() throws Exception { |
| + assertAnalyzesTo(a, "သက်ဝင်လှုပ်ရှားစေပြီး", new String[] { "သက်", "ဝင်", "လှုပ်", "ရှား", "စေ", "ပြီး" }); |
| + } |
| + |
| + // simple tests from "A Rule-based Syllable Segmentation of Myanmar Text" |
| + // * http://www.aclweb.org/anthology/I08-3010 |
| + // (see also the presentation: http://gii2.nagaokaut.ac.jp/gii/media/share/20080901-ZMM%20Presentation.pdf) |
| + // The words are fake, we just test the categories. |
| + // note that currently our algorithm is not sophisticated enough to handle some of the special cases! |
| + |
| + /** constant */ |
| + public void testC() throws Exception { |
| + assertAnalyzesTo(a, "ကက", new String[] { "က", "က" }); |
| + } |
| + |
| + /** consonant + sign */ |
| + public void testCF() throws Exception { |
| + assertAnalyzesTo(a, "ကံကံ", new String[] { "ကံ", "ကံ" }); |
| + } |
| + |
| + /** consonant + consonant + asat */ |
| + public void testCCA() throws Exception { |
| + assertAnalyzesTo(a, "ကင်ကင်", new String[] { "ကင်", "ကင်" }); |
| + } |
| + |
| + /** consonant + consonant + asat + sign */ |
| + public void testCCAF() throws Exception { |
| + assertAnalyzesTo(a, "ကင်းကင်း", new String[] { "ကင်း", "ကင်း" }); |
| + } |
| + |
| + /** consonant + vowel */ |
| + public void testCV() throws Exception { |
| + assertAnalyzesTo(a, "ကာကာ", new String[] { "ကာ", "ကာ" }); |
| + } |
| + |
| + /** consonant + vowel + sign */ |
| + public void testCVF() throws Exception { |
| + assertAnalyzesTo(a, "ကားကား", new String[] { "ကား", "ကား" }); |
| + } |
| + |
| + /** consonant + vowel + vowel + asat */ |
| + public void testCVVA() throws Exception { |
| + assertAnalyzesTo(a, "ကော်ကော်", new String[] { "ကော်", "ကော်" }); |
| + } |
| + |
| + /** consonant + vowel + vowel + consonant + asat */ |
| + public void testCVVCA() throws Exception { |
| + assertAnalyzesTo(a, "ကောင်ကောင်", new String[] { "ကောင်", "ကောင်" }); |
| + } |
| + |
| + /** consonant + vowel + vowel + consonant + asat + sign */ |
| + public void testCVVCAF() throws Exception { |
| + assertAnalyzesTo(a, "ကောင်းကောင်း", new String[] { "ကောင်း", "ကောင်း" }); |
| + } |
| + |
| + /** consonant + medial */ |
| + public void testCM() throws Exception { |
| + assertAnalyzesTo(a, "ကျကျ", new String[] { "ကျ", "ကျ" }); |
| + } |
| + |
| + /** consonant + medial + sign */ |
| + public void testCMF() throws Exception { |
| + assertAnalyzesTo(a, "ကျံကျံ", new String[] { "ကျံ", "ကျံ" }); |
| + } |
| + |
| + /** consonant + medial + consonant + asat */ |
| + public void testCMCA() throws Exception { |
| + assertAnalyzesTo(a, "ကျင်ကျင်", new String[] { "ကျင်", "ကျင်" }); |
| + } |
| + |
| + /** consonant + medial + consonant + asat + sign */ |
| + public void testCMCAF() throws Exception { |
| + assertAnalyzesTo(a, "ကျင်းကျင်း", new String[] { "ကျင်း", "ကျင်း" }); |
| + } |
| + |
| + /** consonant + medial + vowel */ |
| + public void testCMV() throws Exception { |
| + assertAnalyzesTo(a, "ကျာကျာ", new String[] { "ကျာ", "ကျာ" }); |
| + } |
| + |
| + /** consonant + medial + vowel + sign */ |
| + public void testCMVF() throws Exception { |
| + assertAnalyzesTo(a, "ကျားကျား", new String[] { "ကျား", "ကျား" }); |
| + } |
| + |
| + /** consonant + medial + vowel + vowel + asat */ |
| + public void testCMVVA() throws Exception { |
| + assertAnalyzesTo(a, "ကျော်ကျော်", new String[] { "ကျော်", "ကျော်" }); |
| + } |
| + |
| + /** consonant + medial + vowel + vowel + consonant + asat */ |
| + public void testCMVVCA() throws Exception { |
| + assertAnalyzesTo(a, "ကြောင်ကြောင်", new String[] { "ကြောင်", "ကြောင်"}); |
| + } |
| + |
| + /** consonant + medial + vowel + vowel + consonant + asat + sign */ |
| + public void testCMVVCAF() throws Exception { |
| + assertAnalyzesTo(a, "ကြောင်းကြောင်း", new String[] { "ကြောင်း", "ကြောင်း"}); |
| + } |
| + |
| + /** independent vowel */ |
| + public void testI() throws Exception { |
| + assertAnalyzesTo(a, "ဪဪ", new String[] { "ဪ", "ဪ" }); |
| + } |
| + |
| + /** independent vowel */ |
| + public void testE() throws Exception { |
| + assertAnalyzesTo(a, "ဣဣ", new String[] { "ဣ", "ဣ" }); |
| + } |
| +} |
| diff --git a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestWithCJKBigramFilter.java b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestWithCJKBigramFilter.java |
| index 17ea967..411b85e 100644 |
| --- a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestWithCJKBigramFilter.java |
| +++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestWithCJKBigramFilter.java |
| @@ -46,7 +46,7 @@ public class TestWithCJKBigramFilter extends BaseTokenStreamTestCase { |
| analyzer = new Analyzer() { |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName) { |
| - Tokenizer source = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false)); |
| + Tokenizer source = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false, true)); |
| TokenStream result = new CJKBigramFilter(source); |
| return new TokenStreamComponents(source, new StopFilter(result, CharArraySet.EMPTY_SET)); |
| } |
| @@ -60,7 +60,7 @@ public class TestWithCJKBigramFilter extends BaseTokenStreamTestCase { |
| analyzer2 = new Analyzer() { |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName) { |
| - Tokenizer source = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false)); |
| + Tokenizer source = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false, true)); |
| // we put this before the CJKBigramFilter, because the normalization might combine |
| // some halfwidth katakana forms, which will affect the bigramming. |
| TokenStream result = new ICUNormalizer2Filter(source); |