docs/attachments/LUCENE-7393/LUCENE-7393.patch - lucene-jira-archive - Git at Google

 diff --git a/lucene/analysis/icu/src/data/uax29/MyanmarSyllable.rbbi b/lucene/analysis/icu/src/data/uax29/MyanmarSyllable.rbbi
 new file mode 100644
 index 0000000..1840803
 --- /dev/null
 +++ b/lucene/analysis/icu/src/data/uax29/MyanmarSyllable.rbbi
 @@ -0,0 +1,50 @@
 +#
 +# Licensed to the Apache Software Foundation (ASF) under one or more
 +# contributor license agreements.  See the NOTICE file distributed with
 +# this work for additional information regarding copyright ownership.
 +# The ASF licenses this file to You under the Apache License, Version 2.0
 +# (the "License"); you may not use this file except in compliance with
 +# the License.  You may obtain a copy of the License at
 +#
 +#     http://www.apache.org/licenses/LICENSE-2.0
 +#
 +# Unless required by applicable law or agreed to in writing, software
 +# distributed under the License is distributed on an "AS IS" BASIS,
 +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 +# See the License for the specific language governing permissions and
 +# limitations under the License.
 +#
 +#
 +# Parses Myanmar text, with syllable as token.
 +#
 +
 +$Cons = [[:Other_Letter:]&[:Myanmar:]];
 +$Virama = [\u1039];
 +$Asat = [\u103A];
 +
 +$WordJoin = [:Line_Break=Word_Joiner:];
 +
 +#
 +# default numerical definitions
 +#
 +$Extend       = [\p{Word_Break = Extend}];
 +$Format       = [\p{Word_Break = Format}];
 +$MidNumLet    = [\p{Word_Break = MidNumLet}];
 +$MidNum       = [\p{Word_Break = MidNum}];
 +$Numeric      = [\p{Word_Break = Numeric}];
 +$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
 +$MidNumLetEx    = $MidNumLet    ($Extend |  $Format)*;
 +$MidNumEx       = $MidNum       ($Extend |  $Format)*;
 +$NumericEx      = $Numeric      ($Extend |  $Format)*;
 +$ExtendNumLetEx = $ExtendNumLet ($Extend |  $Format)*;
 +
 +$ConsEx = $Cons ($Extend | $Format)*;
 +$AsatEx = $Cons $Asat ($Virama $ConsEx)? ($Extend | $Format)*;
 +$MyanmarSyllableEx = $ConsEx ($Virama $ConsEx)? ($AsatEx)*;
 +$MyanmarJoinedSyllableEx = $MyanmarSyllableEx ($WordJoin $MyanmarSyllableEx)*;
 +
 +!!forward;
 +$MyanmarJoinedSyllableEx {200};
 +
 +# default numeric rules
 +$NumericEx $ExtendNumLetEx? (($MidNumEx | $MidNumLetEx)? $NumericEx $ExtendNumLetEx?)*  {100};
 diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java
 index b33663d..3cd62c8 100644
 --- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java
 +++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java
 @@ -63,9 +63,12 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
    // the same as ROOT, except no dictionary segmentation for cjk
    private static final BreakIterator defaultBreakIterator =
      readBreakIterator("Default.brk");
 +  private static final BreakIterator myanmarSyllableIterator =
 +    readBreakIterator("MyanmarSyllable.brk");

    // TODO: deprecate this boolean? you only care if you are doing super-expert stuff...
    private final boolean cjkAsWords;
 +  private final boolean myanmarAsWords;

    /**
     * Creates a new config. This object is lightweight, but the first
 @@ -74,9 +77,12 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
     *                   otherwise text will be segmented according to UAX#29 defaults.
     *                   If this is true, all Han+Hiragana+Katakana words will be tagged as
     *                   IDEOGRAPHIC.
 +   * @param myanmarAsWords true if Myanmar text should undergo dictionary-based segmentation,
 +   *                       otherwise it will be tokenized as syllables.
     */
 -  public DefaultICUTokenizerConfig(boolean cjkAsWords) {
 +  public DefaultICUTokenizerConfig(boolean cjkAsWords, boolean myanmarAsWords) {
      this.cjkAsWords = cjkAsWords;
 +    this.myanmarAsWords = myanmarAsWords;
    }

    @Override
 @@ -88,6 +94,12 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
    public BreakIterator getBreakIterator(int script) {
      switch(script) {
        case UScript.JAPANESE: return (BreakIterator)cjkBreakIterator.clone();
 +      case UScript.MYANMAR:
 +        if (myanmarAsWords) {
 +          return (BreakIterator)defaultBreakIterator.clone();
 +        } else {
 +          return (BreakIterator)myanmarSyllableIterator.clone();
 +        }
        default: return (BreakIterator)defaultBreakIterator.clone();
      }
    }
 diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java
 index 64c6785..0941551 100644
 --- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java
 +++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java
 @@ -68,7 +68,7 @@ public final class ICUTokenizer extends Tokenizer {
     * @see DefaultICUTokenizerConfig
     */
    public ICUTokenizer() {
 -    this(new DefaultICUTokenizerConfig(true));
 +    this(new DefaultICUTokenizerConfig(true, true));
    }

    /**
 diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java
 index deb5d4f..974e719 100644
 --- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java
 +++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java
 @@ -79,6 +79,7 @@ public class ICUTokenizerFactory extends TokenizerFactory implements ResourceLoa
    private final Map<Integer,String> tailored;
    private ICUTokenizerConfig config;
    private final boolean cjkAsWords;
 +  private final boolean myanmarAsWords;

    /** Creates a new ICUTokenizerFactory */
    public ICUTokenizerFactory(Map<String,String> args) {
 @@ -95,6 +96,7 @@ public class ICUTokenizerFactory extends TokenizerFactory implements ResourceLoa
        }
      }
      cjkAsWords = getBoolean(args, "cjkAsWords", true);
 +    myanmarAsWords = getBoolean(args, "myanmarAsWords", true);
      if (!args.isEmpty()) {
        throw new IllegalArgumentException("Unknown parameters: " + args);
      }
 @@ -104,7 +106,7 @@ public class ICUTokenizerFactory extends TokenizerFactory implements ResourceLoa
    public void inform(ResourceLoader loader) throws IOException {
      assert tailored != null : "init must be called first!";
      if (tailored.isEmpty()) {
 -      config = new DefaultICUTokenizerConfig(cjkAsWords);
 +      config = new DefaultICUTokenizerConfig(cjkAsWords, myanmarAsWords);
      } else {
        final BreakIterator breakers[] = new BreakIterator[UScript.CODE_LIMIT];
        for (Map.Entry<Integer,String> entry : tailored.entrySet()) {
 @@ -112,7 +114,7 @@ public class ICUTokenizerFactory extends TokenizerFactory implements ResourceLoa
          String resourcePath = entry.getValue();
          breakers[code] = parseRules(resourcePath, loader);
        }
 -      config = new DefaultICUTokenizerConfig(cjkAsWords) {
 +      config = new DefaultICUTokenizerConfig(cjkAsWords, myanmarAsWords) {

          @Override
          public BreakIterator getBreakIterator(int script) {
 diff --git a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/MyanmarSyllable.brk b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/MyanmarSyllable.brk
 new file mode 100644
 index 0000000..41b977b
 Binary files /dev/null and b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/MyanmarSyllable.brk differ
 diff --git a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java
 index 6398b2c..027baa3 100644
 --- a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java
 +++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java
 @@ -42,7 +42,7 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
      sb.append(whitespace);
      sb.append("testing 1234");
      String input = sb.toString();
 -    ICUTokenizer tokenizer = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false));
 +    ICUTokenizer tokenizer = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false, true));
      tokenizer.setReader(new StringReader(input));
      assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });
    }
 @@ -53,7 +53,7 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
        sb.append('a');
      }
      String input = sb.toString();
 -    ICUTokenizer tokenizer = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false));
 +    ICUTokenizer tokenizer = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false, true));
      tokenizer.setReader(new StringReader(input));
      char token[] = new char[4096];
      Arrays.fill(token, 'a');
 @@ -75,7 +75,7 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
      a = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
 -        Tokenizer tokenizer = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false));
 +        Tokenizer tokenizer = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false, true));
          TokenFilter filter = new ICUNormalizer2Filter(tokenizer);
          return new TokenStreamComponents(tokenizer, filter);
        }
 diff --git a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerCJK.java b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerCJK.java
 index a29686c..96f44d6 100644
 --- a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerCJK.java
 +++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerCJK.java
 @@ -34,7 +34,7 @@ public class TestICUTokenizerCJK extends BaseTokenStreamTestCase {
      a = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
 -        return new TokenStreamComponents(new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(true)));
 +        return new TokenStreamComponents(new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(true, true)));
        }
      };
    }
 diff --git a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestMyanmarSyllable.java b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestMyanmarSyllable.java
 new file mode 100644
 index 0000000..a3b608e
 --- /dev/null
 +++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestMyanmarSyllable.java
 @@ -0,0 +1,156 @@
 +/*
 + * Licensed to the Apache Software Foundation (ASF) under one or more
 + * contributor license agreements.  See the NOTICE file distributed with
 + * this work for additional information regarding copyright ownership.
 + * The ASF licenses this file to You under the Apache License, Version 2.0
 + * (the "License"); you may not use this file except in compliance with
 + * the License.  You may obtain a copy of the License at
 + *
 + *     http://www.apache.org/licenses/LICENSE-2.0
 + *
 + * Unless required by applicable law or agreed to in writing, software
 + * distributed under the License is distributed on an "AS IS" BASIS,
 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 + * See the License for the specific language governing permissions and
 + * limitations under the License.
 + */
 +package org.apache.lucene.analysis.icu.segmentation;
 +
 +import org.apache.lucene.analysis.Analyzer;
 +import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 +import org.apache.lucene.analysis.Tokenizer;
 +
 +/** Test tokenizing Myanmar text into syllables */
 +public class TestMyanmarSyllable extends BaseTokenStreamTestCase {
 +
 +  Analyzer a;
 +
 +  @Override
 +  public void setUp() throws Exception {
 +    super.setUp();
 +    a = new Analyzer() {
 +      @Override
 +      protected TokenStreamComponents createComponents(String fieldName) {
 +        Tokenizer tokenizer = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false, false));
 +        return new TokenStreamComponents(tokenizer);
 +      }
 +    };
 +  }
 +
 +  @Override
 +  public void tearDown() throws Exception {
 +    a.close();
 +    super.tearDown();
 +  }
 +
 +  /** as opposed to dictionary break of သက်ဝင်|လှုပ်ရှား|စေ|ပြီး */
 +  public void testBasics() throws Exception {
 +    assertAnalyzesTo(a, "သက်ဝင်လှုပ်ရှားစေပြီး", new String[] { "သက်", "ဝင်", "လှုပ်", "ရှား", "စေ", "ပြီး" });
 +  }
 +
 +  // simple tests from "A Rule-based Syllable Segmentation of Myanmar Text"
 +  // * http://www.aclweb.org/anthology/I08-3010
 +  // (see also the presentation: http://gii2.nagaokaut.ac.jp/gii/media/share/20080901-ZMM%20Presentation.pdf)
 +  // The words are fake, we just test the categories.
 +  // note that currently our algorithm is not sophisticated enough to handle some of the special cases!
 +
 +  /** constant */
 +  public void testC() throws Exception {
 +    assertAnalyzesTo(a, "ကက", new String[] { "က", "က" });
 +  }
 +
 +  /** consonant + sign */
 +  public void testCF() throws Exception {
 +    assertAnalyzesTo(a, "ကံကံ", new String[] { "ကံ", "ကံ" });
 +  }
 +
 +  /** consonant + consonant + asat */
 +  public void testCCA() throws Exception {
 +    assertAnalyzesTo(a, "ကင်ကင်", new String[] { "ကင်", "ကင်" });
 +  }
 +
 +  /** consonant + consonant + asat + sign */
 +  public void testCCAF() throws Exception {
 +    assertAnalyzesTo(a, "ကင်းကင်း", new String[] { "ကင်း", "ကင်း" });
 +  }
 +
 +  /** consonant + vowel */
 +  public void testCV() throws Exception {
 +    assertAnalyzesTo(a, "ကာကာ", new String[] { "ကာ", "ကာ" });
 +  }
 +
 +  /** consonant + vowel + sign */
 +  public void testCVF() throws Exception {
 +    assertAnalyzesTo(a, "ကားကား", new String[] { "ကား", "ကား" });
 +  }
 +
 +  /** consonant + vowel + vowel + asat */
 +  public void testCVVA() throws Exception {
 +    assertAnalyzesTo(a, "ကော်ကော်", new String[] { "ကော်", "ကော်" });
 +  }
 +
 +  /** consonant + vowel + vowel + consonant + asat */
 +  public void testCVVCA() throws Exception {
 +    assertAnalyzesTo(a, "ကောင်ကောင်", new String[] { "ကောင်", "ကောင်" });
 +  }
 +
 +  /** consonant + vowel + vowel + consonant + asat + sign */
 +  public void testCVVCAF() throws Exception {
 +    assertAnalyzesTo(a, "ကောင်းကောင်း", new String[] { "ကောင်း", "ကောင်း" });
 +  }
 +
 +  /** consonant + medial */
 +  public void testCM() throws Exception {
 +    assertAnalyzesTo(a, "ကျကျ", new String[] { "ကျ", "ကျ" });
 +  }
 +
 +  /** consonant + medial + sign */
 +  public void testCMF() throws Exception {
 +    assertAnalyzesTo(a, "ကျံကျံ", new String[] { "ကျံ", "ကျံ" });
 +  }
 +
 +  /** consonant + medial + consonant + asat */
 +  public void testCMCA() throws Exception {
 +    assertAnalyzesTo(a, "ကျင်ကျင်", new String[] { "ကျင်", "ကျင်" });
 +  }
 +
 +  /** consonant + medial + consonant + asat + sign */
 +  public void testCMCAF() throws Exception {
 +    assertAnalyzesTo(a, "ကျင်းကျင်း", new String[] { "ကျင်း", "ကျင်း" });
 +  }
 +
 +  /** consonant + medial + vowel */
 +  public void testCMV() throws Exception {
 +    assertAnalyzesTo(a, "ကျာကျာ", new String[] { "ကျာ", "ကျာ" });
 +  }
 +
 +  /** consonant + medial + vowel + sign */
 +  public void testCMVF() throws Exception {
 +    assertAnalyzesTo(a, "ကျားကျား", new String[] { "ကျား", "ကျား" });
 +  }
 +
 +  /** consonant + medial + vowel + vowel + asat */
 +  public void testCMVVA() throws Exception {
 +    assertAnalyzesTo(a, "ကျော်ကျော်", new String[] { "ကျော်", "ကျော်" });
 +  }
 +
 +  /** consonant + medial + vowel + vowel + consonant + asat */
 +  public void testCMVVCA() throws Exception {
 +    assertAnalyzesTo(a, "ကြောင်ကြောင်", new String[] { "ကြောင်", "ကြောင်"});
 +  }
 +
 +  /** consonant + medial + vowel + vowel + consonant + asat + sign */
 +  public void testCMVVCAF() throws Exception {
 +    assertAnalyzesTo(a, "ကြောင်းကြောင်း", new String[] { "ကြောင်း", "ကြောင်း"});
 +  }
 +
 +  /** independent vowel */
 +  public void testI() throws Exception {
 +    assertAnalyzesTo(a, "ဪဪ", new String[] { "ဪ", "ဪ" });
 +  }
 +
 +  /** independent vowel */
 +  public void testE() throws Exception {
 +    assertAnalyzesTo(a, "ဣဣ", new String[] { "ဣ", "ဣ" });
 +  }
 +}
 diff --git a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestWithCJKBigramFilter.java b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestWithCJKBigramFilter.java
 index 17ea967..411b85e 100644
 --- a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestWithCJKBigramFilter.java
 +++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestWithCJKBigramFilter.java
 @@ -46,7 +46,7 @@ public class TestWithCJKBigramFilter extends BaseTokenStreamTestCase {
      analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
 -        Tokenizer source = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false));
 +        Tokenizer source = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false, true));
          TokenStream result = new CJKBigramFilter(source);
          return new TokenStreamComponents(source, new StopFilter(result, CharArraySet.EMPTY_SET));
        }
 @@ -60,7 +60,7 @@ public class TestWithCJKBigramFilter extends BaseTokenStreamTestCase {
      analyzer2 = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
 -        Tokenizer source = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false));
 +        Tokenizer source = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false, true));
          // we put this before the CJKBigramFilter, because the normalization might combine
          // some halfwidth katakana forms, which will affect the bigramming.
          TokenStream result = new ICUNormalizer2Filter(source);
	diff --git a/lucene/analysis/icu/src/data/uax29/MyanmarSyllable.rbbi b/lucene/analysis/icu/src/data/uax29/MyanmarSyllable.rbbi
	new file mode 100644
	index 0000000..1840803
	--- /dev/null
	+++ b/lucene/analysis/icu/src/data/uax29/MyanmarSyllable.rbbi
	@@ -0,0 +1,50 @@
	+#
	+# Licensed to the Apache Software Foundation (ASF) under one or more
	+# contributor license agreements. See the NOTICE file distributed with
	+# this work for additional information regarding copyright ownership.
	+# The ASF licenses this file to You under the Apache License, Version 2.0
	+# (the "License"); you may not use this file except in compliance with
	+# the License. You may obtain a copy of the License at
	+#
	+# http://www.apache.org/licenses/LICENSE-2.0
	+#
	+# Unless required by applicable law or agreed to in writing, software
	+# distributed under the License is distributed on an "AS IS" BASIS,
	+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	+# See the License for the specific language governing permissions and
	+# limitations under the License.
	+#
	+#
	+# Parses Myanmar text, with syllable as token.
	+#
	+
	+$Cons = [[:Other_Letter:]&[:Myanmar:]];
	+$Virama = [\u1039];
	+$Asat = [\u103A];
	+
	+$WordJoin = [:Line_Break=Word_Joiner:];
	+
	+#
	+# default numerical definitions
	+#
	+$Extend = [\p{Word_Break = Extend}];
	+$Format = [\p{Word_Break = Format}];
	+$MidNumLet = [\p{Word_Break = MidNumLet}];
	+$MidNum = [\p{Word_Break = MidNum}];
	+$Numeric = [\p{Word_Break = Numeric}];
	+$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
	+$MidNumLetEx = $MidNumLet ($Extend \| $Format)*;
	+$MidNumEx = $MidNum ($Extend \| $Format)*;
	+$NumericEx = $Numeric ($Extend \| $Format)*;
	+$ExtendNumLetEx = $ExtendNumLet ($Extend \| $Format)*;
	+
	+$ConsEx = $Cons ($Extend \| $Format)*;
	+$AsatEx = $Cons $Asat ($Virama $ConsEx)? ($Extend \| $Format)*;
	+$MyanmarSyllableEx = $ConsEx ($Virama $ConsEx)? ($AsatEx)*;
	+$MyanmarJoinedSyllableEx = $MyanmarSyllableEx ($WordJoin $MyanmarSyllableEx)*;
	+
	+!!forward;
	+$MyanmarJoinedSyllableEx {200};
	+
	+# default numeric rules
	+$NumericEx $ExtendNumLetEx? (($MidNumEx \| $MidNumLetEx)? $NumericEx $ExtendNumLetEx?)* {100};
	diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java
	index b33663d..3cd62c8 100644
	--- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java
	+++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java
	@@ -63,9 +63,12 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
	// the same as ROOT, except no dictionary segmentation for cjk
	private static final BreakIterator defaultBreakIterator =
	readBreakIterator("Default.brk");
	+ private static final BreakIterator myanmarSyllableIterator =
	+ readBreakIterator("MyanmarSyllable.brk");

	// TODO: deprecate this boolean? you only care if you are doing super-expert stuff...
	private final boolean cjkAsWords;
	+ private final boolean myanmarAsWords;

	/**
	* Creates a new config. This object is lightweight, but the first
	@@ -74,9 +77,12 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
	* otherwise text will be segmented according to UAX#29 defaults.
	* If this is true, all Han+Hiragana+Katakana words will be tagged as
	* IDEOGRAPHIC.
	+ * @param myanmarAsWords true if Myanmar text should undergo dictionary-based segmentation,
	+ * otherwise it will be tokenized as syllables.
	*/
	- public DefaultICUTokenizerConfig(boolean cjkAsWords) {
	+ public DefaultICUTokenizerConfig(boolean cjkAsWords, boolean myanmarAsWords) {
	this.cjkAsWords = cjkAsWords;
	+ this.myanmarAsWords = myanmarAsWords;
	}

	@Override
	@@ -88,6 +94,12 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
	public BreakIterator getBreakIterator(int script) {
	switch(script) {
	case UScript.JAPANESE: return (BreakIterator)cjkBreakIterator.clone();
	+ case UScript.MYANMAR:
	+ if (myanmarAsWords) {
	+ return (BreakIterator)defaultBreakIterator.clone();
	+ } else {
	+ return (BreakIterator)myanmarSyllableIterator.clone();
	+ }
	default: return (BreakIterator)defaultBreakIterator.clone();
	}
	}
	diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java
	index 64c6785..0941551 100644
	--- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java
	+++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java
	@@ -68,7 +68,7 @@ public final class ICUTokenizer extends Tokenizer {
	* @see DefaultICUTokenizerConfig
	*/
	public ICUTokenizer() {
	- this(new DefaultICUTokenizerConfig(true));
	+ this(new DefaultICUTokenizerConfig(true, true));
	}

	/**
	diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java
	index deb5d4f..974e719 100644
	--- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java
	+++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java
	@@ -79,6 +79,7 @@ public class ICUTokenizerFactory extends TokenizerFactory implements ResourceLoa
	private final Map<Integer,String> tailored;
	private ICUTokenizerConfig config;
	private final boolean cjkAsWords;
	+ private final boolean myanmarAsWords;

	/** Creates a new ICUTokenizerFactory */
	public ICUTokenizerFactory(Map<String,String> args) {
	@@ -95,6 +96,7 @@ public class ICUTokenizerFactory extends TokenizerFactory implements ResourceLoa
	}
	}
	cjkAsWords = getBoolean(args, "cjkAsWords", true);
	+ myanmarAsWords = getBoolean(args, "myanmarAsWords", true);
	if (!args.isEmpty()) {
	throw new IllegalArgumentException("Unknown parameters: " + args);
	}
	@@ -104,7 +106,7 @@ public class ICUTokenizerFactory extends TokenizerFactory implements ResourceLoa
	public void inform(ResourceLoader loader) throws IOException {
	assert tailored != null : "init must be called first!";
	if (tailored.isEmpty()) {
	- config = new DefaultICUTokenizerConfig(cjkAsWords);
	+ config = new DefaultICUTokenizerConfig(cjkAsWords, myanmarAsWords);
	} else {
	final BreakIterator breakers[] = new BreakIterator[UScript.CODE_LIMIT];
	for (Map.Entry<Integer,String> entry : tailored.entrySet()) {
	@@ -112,7 +114,7 @@ public class ICUTokenizerFactory extends TokenizerFactory implements ResourceLoa
	String resourcePath = entry.getValue();
	breakers[code] = parseRules(resourcePath, loader);
	}
	- config = new DefaultICUTokenizerConfig(cjkAsWords) {
	+ config = new DefaultICUTokenizerConfig(cjkAsWords, myanmarAsWords) {

	@Override
	public BreakIterator getBreakIterator(int script) {
	diff --git a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/MyanmarSyllable.brk b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/MyanmarSyllable.brk
	new file mode 100644
	index 0000000..41b977b
	Binary files /dev/null and b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/MyanmarSyllable.brk differ
	diff --git a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java
	index 6398b2c..027baa3 100644
	--- a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java
	+++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java
	@@ -42,7 +42,7 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
	sb.append(whitespace);
	sb.append("testing 1234");
	String input = sb.toString();
	- ICUTokenizer tokenizer = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false));
	+ ICUTokenizer tokenizer = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false, true));
	tokenizer.setReader(new StringReader(input));
	assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });
	}
	@@ -53,7 +53,7 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
	sb.append('a');
	}
	String input = sb.toString();
	- ICUTokenizer tokenizer = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false));
	+ ICUTokenizer tokenizer = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false, true));
	tokenizer.setReader(new StringReader(input));
	char token[] = new char[4096];
	Arrays.fill(token, 'a');
	@@ -75,7 +75,7 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
	a = new Analyzer() {
	@Override
	protected TokenStreamComponents createComponents(String fieldName) {
	- Tokenizer tokenizer = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false));
	+ Tokenizer tokenizer = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false, true));
	TokenFilter filter = new ICUNormalizer2Filter(tokenizer);
	return new TokenStreamComponents(tokenizer, filter);
	}
	diff --git a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerCJK.java b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerCJK.java
	index a29686c..96f44d6 100644
	--- a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerCJK.java
	+++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerCJK.java
	@@ -34,7 +34,7 @@ public class TestICUTokenizerCJK extends BaseTokenStreamTestCase {
	a = new Analyzer() {
	@Override
	protected TokenStreamComponents createComponents(String fieldName) {
	- return new TokenStreamComponents(new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(true)));
	+ return new TokenStreamComponents(new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(true, true)));
	}
	};
	}
	diff --git a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestMyanmarSyllable.java b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestMyanmarSyllable.java
	new file mode 100644
	index 0000000..a3b608e
	--- /dev/null
	+++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestMyanmarSyllable.java
	@@ -0,0 +1,156 @@
	+/*
	+ * Licensed to the Apache Software Foundation (ASF) under one or more
	+ * contributor license agreements. See the NOTICE file distributed with
	+ * this work for additional information regarding copyright ownership.
	+ * The ASF licenses this file to You under the Apache License, Version 2.0
	+ * (the "License"); you may not use this file except in compliance with
	+ * the License. You may obtain a copy of the License at
	+ *
	+ * http://www.apache.org/licenses/LICENSE-2.0
	+ *
	+ * Unless required by applicable law or agreed to in writing, software
	+ * distributed under the License is distributed on an "AS IS" BASIS,
	+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	+ * See the License for the specific language governing permissions and
	+ * limitations under the License.
	+ */
	+package org.apache.lucene.analysis.icu.segmentation;
	+
	+import org.apache.lucene.analysis.Analyzer;
	+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
	+import org.apache.lucene.analysis.Tokenizer;
	+
	+/** Test tokenizing Myanmar text into syllables */
	+public class TestMyanmarSyllable extends BaseTokenStreamTestCase {
	+
	+ Analyzer a;
	+
	+ @Override
	+ public void setUp() throws Exception {
	+ super.setUp();
	+ a = new Analyzer() {
	+ @Override
	+ protected TokenStreamComponents createComponents(String fieldName) {
	+ Tokenizer tokenizer = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false, false));
	+ return new TokenStreamComponents(tokenizer);
	+ }
	+ };
	+ }
	+
	+ @Override
	+ public void tearDown() throws Exception {
	+ a.close();
	+ super.tearDown();
	+ }
	+
	+ /** as opposed to dictionary break of သက်ဝင်\|လှုပ်ရှား\|စေ\|ပြီး */
	+ public void testBasics() throws Exception {
	+ assertAnalyzesTo(a, "သက်ဝင်လှုပ်ရှားစေပြီး", new String[] { "သက်", "ဝင်", "လှုပ်", "ရှား", "စေ", "ပြီး" });
	+ }
	+
	+ // simple tests from "A Rule-based Syllable Segmentation of Myanmar Text"
	+ // * http://www.aclweb.org/anthology/I08-3010
	+ // (see also the presentation: http://gii2.nagaokaut.ac.jp/gii/media/share/20080901-ZMM%20Presentation.pdf)
	+ // The words are fake, we just test the categories.
	+ // note that currently our algorithm is not sophisticated enough to handle some of the special cases!
	+
	+ /** constant */
	+ public void testC() throws Exception {
	+ assertAnalyzesTo(a, "ကက", new String[] { "က", "က" });
	+ }
	+
	+ /** consonant + sign */
	+ public void testCF() throws Exception {
	+ assertAnalyzesTo(a, "ကံကံ", new String[] { "ကံ", "ကံ" });
	+ }
	+
	+ /** consonant + consonant + asat */
	+ public void testCCA() throws Exception {
	+ assertAnalyzesTo(a, "ကင်ကင်", new String[] { "ကင်", "ကင်" });
	+ }
	+
	+ /** consonant + consonant + asat + sign */
	+ public void testCCAF() throws Exception {
	+ assertAnalyzesTo(a, "ကင်းကင်း", new String[] { "ကင်း", "ကင်း" });
	+ }
	+
	+ /** consonant + vowel */
	+ public void testCV() throws Exception {
	+ assertAnalyzesTo(a, "ကာကာ", new String[] { "ကာ", "ကာ" });
	+ }
	+
	+ /** consonant + vowel + sign */
	+ public void testCVF() throws Exception {
	+ assertAnalyzesTo(a, "ကားကား", new String[] { "ကား", "ကား" });
	+ }
	+
	+ /** consonant + vowel + vowel + asat */
	+ public void testCVVA() throws Exception {
	+ assertAnalyzesTo(a, "ကော်ကော်", new String[] { "ကော်", "ကော်" });
	+ }
	+
	+ /** consonant + vowel + vowel + consonant + asat */
	+ public void testCVVCA() throws Exception {
	+ assertAnalyzesTo(a, "ကောင်ကောင်", new String[] { "ကောင်", "ကောင်" });
	+ }
	+
	+ /** consonant + vowel + vowel + consonant + asat + sign */
	+ public void testCVVCAF() throws Exception {
	+ assertAnalyzesTo(a, "ကောင်းကောင်း", new String[] { "ကောင်း", "ကောင်း" });
	+ }
	+
	+ /** consonant + medial */
	+ public void testCM() throws Exception {
	+ assertAnalyzesTo(a, "ကျကျ", new String[] { "ကျ", "ကျ" });
	+ }
	+
	+ /** consonant + medial + sign */
	+ public void testCMF() throws Exception {
	+ assertAnalyzesTo(a, "ကျံကျံ", new String[] { "ကျံ", "ကျံ" });
	+ }
	+
	+ /** consonant + medial + consonant + asat */
	+ public void testCMCA() throws Exception {
	+ assertAnalyzesTo(a, "ကျင်ကျင်", new String[] { "ကျင်", "ကျင်" });
	+ }
	+
	+ /** consonant + medial + consonant + asat + sign */
	+ public void testCMCAF() throws Exception {
	+ assertAnalyzesTo(a, "ကျင်းကျင်း", new String[] { "ကျင်း", "ကျင်း" });
	+ }
	+
	+ /** consonant + medial + vowel */
	+ public void testCMV() throws Exception {
	+ assertAnalyzesTo(a, "ကျာကျာ", new String[] { "ကျာ", "ကျာ" });
	+ }
	+
	+ /** consonant + medial + vowel + sign */
	+ public void testCMVF() throws Exception {
	+ assertAnalyzesTo(a, "ကျားကျား", new String[] { "ကျား", "ကျား" });
	+ }
	+
	+ /** consonant + medial + vowel + vowel + asat */
	+ public void testCMVVA() throws Exception {
	+ assertAnalyzesTo(a, "ကျော်ကျော်", new String[] { "ကျော်", "ကျော်" });
	+ }
	+
	+ /** consonant + medial + vowel + vowel + consonant + asat */
	+ public void testCMVVCA() throws Exception {
	+ assertAnalyzesTo(a, "ကြောင်ကြောင်", new String[] { "ကြောင်", "ကြောင်"});
	+ }
	+
	+ /** consonant + medial + vowel + vowel + consonant + asat + sign */
	+ public void testCMVVCAF() throws Exception {
	+ assertAnalyzesTo(a, "ကြောင်းကြောင်း", new String[] { "ကြောင်း", "ကြောင်း"});
	+ }
	+
	+ /** independent vowel */
	+ public void testI() throws Exception {
	+ assertAnalyzesTo(a, "ဪဪ", new String[] { "ဪ", "ဪ" });
	+ }
	+
	+ /** independent vowel */
	+ public void testE() throws Exception {
	+ assertAnalyzesTo(a, "ဣဣ", new String[] { "ဣ", "ဣ" });
	+ }
	+}
	diff --git a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestWithCJKBigramFilter.java b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestWithCJKBigramFilter.java
	index 17ea967..411b85e 100644
	--- a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestWithCJKBigramFilter.java
	+++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestWithCJKBigramFilter.java
	@@ -46,7 +46,7 @@ public class TestWithCJKBigramFilter extends BaseTokenStreamTestCase {
	analyzer = new Analyzer() {
	@Override
	protected TokenStreamComponents createComponents(String fieldName) {
	- Tokenizer source = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false));
	+ Tokenizer source = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false, true));
	TokenStream result = new CJKBigramFilter(source);
	return new TokenStreamComponents(source, new StopFilter(result, CharArraySet.EMPTY_SET));
	}
	@@ -60,7 +60,7 @@ public class TestWithCJKBigramFilter extends BaseTokenStreamTestCase {
	analyzer2 = new Analyzer() {
	@Override
	protected TokenStreamComponents createComponents(String fieldName) {
	- Tokenizer source = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false));
	+ Tokenizer source = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false, true));
	// we put this before the CJKBigramFilter, because the normalization might combine
	// some halfwidth katakana forms, which will affect the bigramming.
	TokenStream result = new ICUNormalizer2Filter(source);