LUCENE-7393: restore old Myanmar syllable tokenization as an option.
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 917dfa2..6958660 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -140,6 +140,10 @@
   buffer, instead of a fixed 16.0 MB.  A custom codec can still
   override the buffer size itself. (Mike McCandless)
 
+* LUCENE-7393: Add an ICUTokenizer option to parse Myanmar text as syllables instead of
+  words, because the ICU word-breaking algorithm has known issues with Myanmar text.
+  This restores the syllable tokenization used before Lucene 5. (AM, Robert Muir)
+
 Optimizations
 
 * LUCENE-7330, LUCENE-7339: Speed up conjunction queries. (Adrien Grand)
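
For reviewers, a minimal sketch of the two modes. The constructor wiring mirrors the tests in this patch, and both segmentations of the sample string are taken from TestMyanmarSyllable below:

```java
import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;

// myanmarAsWords=false selects the new syllable rules (the pre-Lucene 5 behavior):
// သက်ဝင်လှုပ်ရှားစေပြီး -> သက် | ဝင် | လှုပ် | ရှား | စေ | ပြီး
Tokenizer syllables = new ICUTokenizer(new DefaultICUTokenizerConfig(true, false));
syllables.setReader(new StringReader("သက်ဝင်လှုပ်ရှားစေပြီး"));

// myanmarAsWords=true (the default) keeps ICU's dictionary-based word break:
// သက်ဝင်လှုပ်ရှားစေပြီး -> သက်ဝင် | လှုပ်ရှား | စေ | ပြီး
Tokenizer words = new ICUTokenizer(new DefaultICUTokenizerConfig(true, true));
words.setReader(new StringReader("သက်ဝင်လှုပ်ရှားစေပြီး"));
```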
diff --git a/lucene/analysis/icu/src/data/uax29/MyanmarSyllable.rbbi b/lucene/analysis/icu/src/data/uax29/MyanmarSyllable.rbbi
new file mode 100644
index 0000000..1840803
--- /dev/null
+++ b/lucene/analysis/icu/src/data/uax29/MyanmarSyllable.rbbi
@@ -0,0 +1,50 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# 
+# Parses Myanmar text, with syllables as tokens.
+#
+
+$Cons = [[:Other_Letter:]&[:Myanmar:]];
+$Virama = [\u1039];
+$Asat = [\u103A];
+
+$WordJoin = [:Line_Break=Word_Joiner:]; 
+
+#
+# default numerical definitions
+#
+$Extend       = [\p{Word_Break = Extend}];
+$Format       = [\p{Word_Break = Format}];
+$MidNumLet    = [\p{Word_Break = MidNumLet}];
+$MidNum       = [\p{Word_Break = MidNum}];
+$Numeric      = [\p{Word_Break = Numeric}];
+$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
+$MidNumLetEx    = $MidNumLet    ($Extend |  $Format)*;
+$MidNumEx       = $MidNum       ($Extend |  $Format)*;
+$NumericEx      = $Numeric      ($Extend |  $Format)*;
+$ExtendNumLetEx = $ExtendNumLet ($Extend |  $Format)*;
+
+$ConsEx = $Cons ($Extend | $Format)*;
+$AsatEx = $Cons $Asat ($Virama $ConsEx)? ($Extend | $Format)*;
+$MyanmarSyllableEx = $ConsEx ($Virama $ConsEx)? ($AsatEx)*;
+$MyanmarJoinedSyllableEx = $MyanmarSyllableEx ($WordJoin $MyanmarSyllableEx)*;
+
+!!forward;
+$MyanmarJoinedSyllableEx {200};
+
+# default numeric rules
+$NumericEx $ExtendNumLetEx? (($MidNumEx | $MidNumLetEx)? $NumericEx $ExtendNumLetEx?)*  {100};
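
The tagged rules above (200 for a Myanmar syllable, 100 for a numeric run) can be exercised directly with ICU4J before they are compiled into a .brk. A sketch, assuming the rules file is readable from the working directory:

```java
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.RuleBasedBreakIterator;

// Compile the rule source at runtime and walk the boundaries; getRuleStatus()
// reports the {200}/{100} tag of the rule that produced each boundary.
String rules = new String(Files.readAllBytes(Paths.get("MyanmarSyllable.rbbi")),
    StandardCharsets.UTF_8);
String text = "သက်ဝင်လှုပ်ရှားစေပြီး";
RuleBasedBreakIterator bi = new RuleBasedBreakIterator(rules);
bi.setText(text);
int start = bi.first();
for (int end = bi.next(); end != BreakIterator.DONE; start = end, end = bi.next()) {
  System.out.println(text.substring(start, end) + " (status=" + bi.getRuleStatus() + ")");
}
```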
diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java
index b33663d..3cd62c8 100644
--- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java
+++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java
@@ -63,9 +63,12 @@
   // the same as ROOT, except no dictionary segmentation for cjk
   private static final BreakIterator defaultBreakIterator = 
     readBreakIterator("Default.brk");
+  private static final BreakIterator myanmarSyllableIterator = 
+    readBreakIterator("MyanmarSyllable.brk");
   
   // TODO: deprecate this boolean? you only care if you are doing super-expert stuff...
   private final boolean cjkAsWords;
+  private final boolean myanmarAsWords;
   
   /** 
    * Creates a new config. This object is lightweight, but the first
@@ -74,9 +77,12 @@
    *                   otherwise text will be segmented according to UAX#29 defaults.
    *                   If this is true, all Han+Hiragana+Katakana words will be tagged as
    *                   IDEOGRAPHIC.
+   * @param myanmarAsWords true if Myanmar text should undergo dictionary-based segmentation,
+   *                       otherwise it will be tokenized as syllables.
    */
-  public DefaultICUTokenizerConfig(boolean cjkAsWords) { 
+  public DefaultICUTokenizerConfig(boolean cjkAsWords, boolean myanmarAsWords) { 
     this.cjkAsWords = cjkAsWords;
+    this.myanmarAsWords = myanmarAsWords;
   }
   
   @Override
@@ -88,6 +94,12 @@
   public BreakIterator getBreakIterator(int script) {
     switch(script) {
       case UScript.JAPANESE: return (BreakIterator)cjkBreakIterator.clone();
+      case UScript.MYANMAR: 
+        if (myanmarAsWords) {
+          return (BreakIterator)defaultBreakIterator.clone();
+        } else {
+          return (BreakIterator)myanmarSyllableIterator.clone();
+        }
       default: return (BreakIterator)defaultBreakIterator.clone();
     }
   }
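
The dispatch is per script run: ICUTokenizer splits the input into runs of a single script and asks the config for a break iterator for each run, so with myanmarAsWords=false only Myanmar runs use the syllable rules, while everything else keeps the default UAX#29 behavior. A small sketch; the expected tokens follow from the rules and tests in this patch:

```java
import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Mixed-script input: the Latin run is unaffected by the Myanmar option.
Tokenizer tok = new ICUTokenizer(new DefaultICUTokenizerConfig(true, false));
tok.setReader(new StringReader("testing သက်ဝင်"));
CharTermAttribute term = tok.addAttribute(CharTermAttribute.class);
tok.reset();
while (tok.incrementToken()) {
  System.out.println(term);  // prints: testing, သက်, ဝင်
}
tok.end();
tok.close();
```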
diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java
index 64c6785..0941551 100644
--- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java
+++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java
@@ -68,7 +68,7 @@
    * @see DefaultICUTokenizerConfig
    */
   public ICUTokenizer() {
-    this(new DefaultICUTokenizerConfig(true));
+    this(new DefaultICUTokenizerConfig(true, true));
   }
 
   /**
diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java
index deb5d4f..974e719 100644
--- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java
+++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java
@@ -79,6 +79,7 @@
   private final Map<Integer,String> tailored;
   private ICUTokenizerConfig config;
   private final boolean cjkAsWords;
+  private final boolean myanmarAsWords;
   
   /** Creates a new ICUTokenizerFactory */
   public ICUTokenizerFactory(Map<String,String> args) {
@@ -95,6 +96,7 @@
       }
     }
     cjkAsWords = getBoolean(args, "cjkAsWords", true);
+    myanmarAsWords = getBoolean(args, "myanmarAsWords", true);
     if (!args.isEmpty()) {
       throw new IllegalArgumentException("Unknown parameters: " + args);
     }
@@ -104,7 +106,7 @@
   public void inform(ResourceLoader loader) throws IOException {
     assert tailored != null : "init must be called first!";
     if (tailored.isEmpty()) {
-      config = new DefaultICUTokenizerConfig(cjkAsWords);
+      config = new DefaultICUTokenizerConfig(cjkAsWords, myanmarAsWords);
     } else {
       final BreakIterator breakers[] = new BreakIterator[UScript.CODE_LIMIT];
       for (Map.Entry<Integer,String> entry : tailored.entrySet()) {
@@ -112,7 +114,7 @@
         String resourcePath = entry.getValue();
         breakers[code] = parseRules(resourcePath, loader);
       }
-      config = new DefaultICUTokenizerConfig(cjkAsWords) {
+      config = new DefaultICUTokenizerConfig(cjkAsWords, myanmarAsWords) {
         
         @Override
         public BreakIterator getBreakIterator(int script) {
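
For factory users the new parameter rides along with cjkAsWords. A usage sketch, where ClasspathResourceLoader stands in for whatever ResourceLoader the container supplies (inform() throws IOException):

```java
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.ClasspathResourceLoader;

Map<String,String> args = new HashMap<>();
args.put("myanmarAsWords", "false");  // opt back into syllable tokenization
ICUTokenizerFactory factory = new ICUTokenizerFactory(args);
factory.inform(new ClasspathResourceLoader(ICUTokenizerFactory.class));
Tokenizer tok = factory.create();
```

In a Solr schema the same switch should look like `<tokenizer class="solr.ICUTokenizerFactory" myanmarAsWords="false"/>`, since tokenizer attributes are passed through as the args map.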
diff --git a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/MyanmarSyllable.brk b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/MyanmarSyllable.brk
new file mode 100644
index 0000000..41b977b
--- /dev/null
+++ b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/MyanmarSyllable.brk
Binary files differ
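
The binary .brk is the compiled form of the .rbbi source above, in the format that readBreakIterator() loads at class initialization. A sketch of regenerating it with ICU4J (the paths are illustrative):

```java
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import com.ibm.icu.text.RuleBasedBreakIterator;

// Compile the rule source into the binary state table shipped as a resource.
// (Both calls throw IOException.)
String rules = new String(
    Files.readAllBytes(Paths.get("lucene/analysis/icu/src/data/uax29/MyanmarSyllable.rbbi")),
    StandardCharsets.UTF_8);
try (OutputStream out = Files.newOutputStream(Paths.get("MyanmarSyllable.brk"))) {
  RuleBasedBreakIterator.compileRules(rules, out);
}
```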
diff --git a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java
index 6398b2c..027baa3 100644
--- a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java
+++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java
@@ -42,7 +42,7 @@
     sb.append(whitespace);
     sb.append("testing 1234");
     String input = sb.toString();
-    ICUTokenizer tokenizer = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false));
+    ICUTokenizer tokenizer = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false, true));
     tokenizer.setReader(new StringReader(input));
     assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });
   }
@@ -53,7 +53,7 @@
       sb.append('a');
     }
     String input = sb.toString();
-    ICUTokenizer tokenizer = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false));
+    ICUTokenizer tokenizer = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false, true));
     tokenizer.setReader(new StringReader(input));
     char token[] = new char[4096];
     Arrays.fill(token, 'a');
@@ -75,7 +75,7 @@
     a = new Analyzer() {
       @Override
       protected TokenStreamComponents createComponents(String fieldName) {
-        Tokenizer tokenizer = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false));
+        Tokenizer tokenizer = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false, true));
         TokenFilter filter = new ICUNormalizer2Filter(tokenizer);
         return new TokenStreamComponents(tokenizer, filter);
       }
diff --git a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerCJK.java b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerCJK.java
index a29686c..96f44d6 100644
--- a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerCJK.java
+++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerCJK.java
@@ -34,7 +34,7 @@
     a = new Analyzer() {
       @Override
       protected TokenStreamComponents createComponents(String fieldName) {
-        return new TokenStreamComponents(new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(true)));
+        return new TokenStreamComponents(new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(true, true)));
       }
     };
   }
diff --git a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestMyanmarSyllable.java b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestMyanmarSyllable.java
new file mode 100644
index 0000000..a3b608e
--- /dev/null
+++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestMyanmarSyllable.java
@@ -0,0 +1,156 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.icu.segmentation;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.Tokenizer;
+
+/** Test tokenizing Myanmar text into syllables */
+public class TestMyanmarSyllable extends BaseTokenStreamTestCase {
+
+  Analyzer a;
+
+  @Override
+  public void setUp() throws Exception {
+    super.setUp();
+    a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer tokenizer = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false, false));
+        return new TokenStreamComponents(tokenizer);
+      }
+    };
+  }
+  
+  @Override
+  public void tearDown() throws Exception {
+    a.close();
+    super.tearDown();
+  }
+  
+  /** as opposed to dictionary break of သက်ဝင်|လှုပ်ရှား|စေ|ပြီး */
+  public void testBasics() throws Exception {
+    assertAnalyzesTo(a, "သက်ဝင်လှုပ်ရှားစေပြီး", new String[] { "သက်", "ဝင်", "လှုပ်", "ရှား", "စေ", "ပြီး" });
+  }
+  
+  // simple tests from "A Rule-based Syllable Segmentation of Myanmar Text" 
+  // * http://www.aclweb.org/anthology/I08-3010
+  // (see also the presentation: http://gii2.nagaokaut.ac.jp/gii/media/share/20080901-ZMM%20Presentation.pdf)
+  // The words are fake; we just test the categories.
+  // note that currently our algorithm is not sophisticated enough to handle some of the special cases!
+  
+  /** consonant */
+  public void testC() throws Exception {
+    assertAnalyzesTo(a, "ကက", new String[] { "က", "က" });
+  }
+  
+  /** consonant + sign */
+  public void testCF() throws Exception {
+    assertAnalyzesTo(a, "ကံကံ", new String[] { "ကံ", "ကံ" });
+  }
+  
+  /** consonant + consonant + asat */
+  public void testCCA() throws Exception {
+    assertAnalyzesTo(a, "ကင်ကင်", new String[] { "ကင်", "ကင်" });
+  }
+  
+  /** consonant + consonant + asat + sign */
+  public void testCCAF() throws Exception {
+    assertAnalyzesTo(a, "ကင်းကင်း", new String[] { "ကင်း", "ကင်း" });
+  }
+  
+  /** consonant + vowel */
+  public void testCV() throws Exception {
+    assertAnalyzesTo(a, "ကာကာ", new String[] { "ကာ", "ကာ" });
+  }
+  
+  /** consonant + vowel + sign */
+  public void testCVF() throws Exception {
+    assertAnalyzesTo(a, "ကားကား", new String[] { "ကား", "ကား" });
+  }
+  
+  /** consonant + vowel + vowel + asat */
+  public void testCVVA() throws Exception {
+    assertAnalyzesTo(a, "ကော်ကော်", new String[] { "ကော်", "ကော်" });
+  }
+  
+  /** consonant + vowel + vowel + consonant + asat */
+  public void testCVVCA() throws Exception {
+    assertAnalyzesTo(a, "ကောင်ကောင်", new String[] { "ကောင်", "ကောင်" });
+  }
+  
+  /** consonant + vowel + vowel + consonant + asat + sign */
+  public void testCVVCAF() throws Exception {
+    assertAnalyzesTo(a, "ကောင်းကောင်း", new String[] { "ကောင်း", "ကောင်း" });
+  }
+  
+  /** consonant + medial */
+  public void testCM() throws Exception {
+    assertAnalyzesTo(a, "ကျကျ", new String[] { "ကျ", "ကျ" });
+  }
+  
+  /** consonant + medial + sign */
+  public void testCMF() throws Exception {
+    assertAnalyzesTo(a, "ကျံကျံ", new String[] { "ကျံ", "ကျံ" });
+  }
+  
+  /** consonant + medial + consonant + asat */
+  public void testCMCA() throws Exception {
+    assertAnalyzesTo(a, "ကျင်ကျင်", new String[] { "ကျင်", "ကျင်" });
+  }
+  
+  /** consonant + medial + consonant + asat + sign */
+  public void testCMCAF() throws Exception {
+    assertAnalyzesTo(a, "ကျင်းကျင်း", new String[] { "ကျင်း", "ကျင်း" });
+  }
+  
+  /** consonant + medial + vowel */
+  public void testCMV() throws Exception {
+    assertAnalyzesTo(a, "ကျာကျာ", new String[] { "ကျာ", "ကျာ" });
+  }
+  
+  /** consonant + medial + vowel + sign */
+  public void testCMVF() throws Exception {
+    assertAnalyzesTo(a, "ကျားကျား", new String[] { "ကျား", "ကျား" });
+  }
+  
+  /** consonant + medial + vowel + vowel + asat */
+  public void testCMVVA() throws Exception {
+    assertAnalyzesTo(a, "ကျော်ကျော်", new String[] { "ကျော်", "ကျော်" });
+  }
+  
+  /** consonant + medial + vowel + vowel + consonant + asat */
+  public void testCMVVCA() throws Exception {
+    assertAnalyzesTo(a, "ကြောင်ကြောင်", new String[] { "ကြောင်", "ကြောင်"});
+  }
+  
+  /** consonant + medial + vowel + vowel + consonant + asat + sign */
+  public void testCMVVCAF() throws Exception {
+    assertAnalyzesTo(a, "ကြောင်းကြောင်း", new String[] { "ကြောင်း", "ကြောင်း"});
+  }
+  
+  /** independent vowel */
+  public void testI() throws Exception {
+    assertAnalyzesTo(a, "ဪဪ", new String[] { "ဪ", "ဪ" });
+  }
+  
+  /** independent vowel */
+  public void testE() throws Exception {
+    assertAnalyzesTo(a, "ဣဣ", new String[] { "ဣ", "ဣ" });
+  }
+}
diff --git a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestWithCJKBigramFilter.java b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestWithCJKBigramFilter.java
index 17ea967..411b85e 100644
--- a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestWithCJKBigramFilter.java
+++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestWithCJKBigramFilter.java
@@ -46,7 +46,7 @@
     analyzer = new Analyzer() {
       @Override
       protected TokenStreamComponents createComponents(String fieldName) {
-        Tokenizer source = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false));
+        Tokenizer source = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false, true));
         TokenStream result = new CJKBigramFilter(source);
         return new TokenStreamComponents(source, new StopFilter(result, CharArraySet.EMPTY_SET));
       }
@@ -60,7 +60,7 @@
     analyzer2 = new Analyzer() {
       @Override
       protected TokenStreamComponents createComponents(String fieldName) {
-        Tokenizer source = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false));
+        Tokenizer source = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false, true));
         // we put this before the CJKBigramFilter, because the normalization might combine
         // some halfwidth katakana forms, which will affect the bigramming.
         TokenStream result = new ICUNormalizer2Filter(source);