LUCENE-4956: more refactoring of decompounding

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene4956@1536231 13f79535-47bb-0310-9956-ffa450edef68

commit: e4d7743542ef3cae273685fd1f067113ed4d3d97 [log] [tgz]
author: Robert Muir <rmuir@apache.org> Mon Oct 28 00:43:30 2013 +0000
committer: Robert Muir <rmuir@apache.org> Mon Oct 28 00:43:30 2013 +0000
tree: c8a3b67857f6b8d3966e9249f88c2e093f38fad4
parent: 220720bed762abfb4aa3d4f3f034ed30a0142d0c [diff]
diff --git a/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java b/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java
index a2be18e..5f8dae0 100644
--- a/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java
+++ b/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java

@@ -148,12 +148,12 @@
   }

   

   /** return list of irregular compounds for word class. */

-  static List<CompoundEntry> getIrregularCompounds(byte clazz) {

+  static CompoundEntry[] getIrregularCompounds(byte clazz) {

     return dictionary.getIrregularCompounds(clazz);

   }

   

   /** return list of compounds for key and word class. */

-  static List<CompoundEntry> getCompounds(String key, byte clazz) {

+  static CompoundEntry[] getCompounds(String key, byte clazz) {

     return dictionary.getCompounds(key, clazz);

   }

   


diff --git a/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/HangulDictionary.java b/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/HangulDictionary.java
index 953fa80..b4440d4 100644
--- a/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/HangulDictionary.java
+++ b/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/HangulDictionary.java

@@ -110,24 +110,24 @@
   
   /** return list of compounds for key and word class.
    * this retrieves the splits for the class and applies them to the key */
-  List<CompoundEntry> getCompounds(String word, byte clazz) {
+  CompoundEntry[] getCompounds(String word, byte clazz) {
     int off = clazz * RECORD_SIZE;
     int numSplits = metadata[off+2];
     assert numSplits > 0;
-    List<CompoundEntry> compounds = new ArrayList<>(numSplits+1);
+    CompoundEntry compounds[] = new CompoundEntry[numSplits+1];
     int last = 0;
     for (int i = 0; i < numSplits; i++) {
       int split = metadata[off+3+i];
-      compounds.add(new CompoundEntry(word.substring(last, split), true));
+      compounds[i] = new CompoundEntry(word.substring(last, split), true);
       last = split;
     }
-    compounds.add(new CompoundEntry(word.substring(last), true));
+    compounds[numSplits] = new CompoundEntry(word.substring(last), true);
     return compounds;
   }
   
   /** return list of compounds for key and word class.
    * this retrieves the decompounded data for this irregular class */
-  List<CompoundEntry> getIrregularCompounds(byte clazz) {
+  CompoundEntry[] getIrregularCompounds(byte clazz) {
     int off = clazz * RECORD_SIZE;
     int numChars = metadata[off+2];
     // TODO: more efficient
@@ -144,7 +144,7 @@
       }
     }
     compounds.add(new CompoundEntry(sb.toString(), true));
-    return compounds;
+    return compounds.toArray(new CompoundEntry[compounds.size()]);
   }
   
   /** walks the fst for prefix and returns true if it his no dead end */

diff --git a/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/WordEntry.java b/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/WordEntry.java
index e95c677..84fae0d 100644
--- a/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/WordEntry.java
+++ b/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/WordEntry.java

@@ -92,7 +92,7 @@
   }

   

   /** Returns List of compounds for word */

-  public List<CompoundEntry> getCompounds() {

+  public CompoundEntry[] getCompounds() {

     assert isCompoundNoun();

     // TODO: should we cache this here? see if someone is calling this repeatedly? i hope not.

     if ((features & COMPOUND_IRREGULAR) != 0) {


diff --git a/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/AnalysisOutput.java b/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/AnalysisOutput.java
index 3c63f54..a4247b3 100644
--- a/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/AnalysisOutput.java
+++ b/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/AnalysisOutput.java

@@ -188,6 +188,19 @@
     compound.addAll(cnoun);

   }

   

+  // nocommit

+  public void setCNounList(CompoundEntry[] cnoun) {

+    compound.clear();

+    addCNouns(cnoun);

+  }

+  

+  // nocommit

+  public void addCNouns(CompoundEntry[] cnoun) {

+    for (CompoundEntry e : cnoun) {

+      compound.add(e);

+    }

+  }

+  

   /**

    * @return the source

    */


diff --git a/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/CompoundNounAnalyzer.java b/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/CompoundNounAnalyzer.java
index 8228129..a1dc380 100644
--- a/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/CompoundNounAnalyzer.java
+++ b/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/CompoundNounAnalyzer.java

@@ -37,11 +37,16 @@
 

   /** Returns decompounded list for word, or null */

   public List<CompoundEntry> analyze(String input) {

+    if (input.length() < 3 || input.length() > 20) {

+      // ignore less than 3 letters or more than 20 letters.

+      return null;

+    }

     WordEntry entry = DictionaryUtil.getCompoundNoun(input);

     if (entry != null) {

-      return entry.getCompounds();

-    } else if (input.length() < 3) {

-      return null;

+      // nocommit

+      ArrayList<CompoundEntry> l = new ArrayList<CompoundEntry>();

+      l.addAll(Arrays.asList(entry.getCompounds()));

+      return l;

     } else {

       CompoundEntry[] compounds = analyze(input, true);

       if (compounds == null) {

@@ -56,18 +61,11 @@
   }

     

   private CompoundEntry[] analyze(String input, boolean isFirst) {    

-    switch(input.length()) {

+    switch (input.length()) {

       case 3: return analyze3Word(input, isFirst);

       case 4: return analyze4Word(input, isFirst);

       case 5: return analyze5Word(input, isFirst);

-      default:

-        List<CompoundEntry> outputs = new ArrayList<>();

-        boolean success = analyzeLongText(input, outputs, isFirst);

-        if (success) {

-          return outputs.toArray(new CompoundEntry[0]); // nocommit

-        } else {

-          return null;

-        }

+      default: return analyzeLongText(input, isFirst);

     }

   }

   

@@ -182,81 +180,62 @@
     

     return res;

   }

+  

+  /** 

+   * analyzes one part of the input recursively: called by analyzeLongText

+   */

+  private CompoundEntry[] analyzeLongPart(String input) {

+    WordEntry e = DictionaryUtil.getAllNoun(input);

+    if (e == null) {

+      return analyze(input, false);

+    } else {

+      if (e.isCompoundNoun()) {

+        return e.getCompounds();

+      } else {

+        return new CompoundEntry[] { new CompoundEntry(input, true) };

+      }

+    }

+  }

    

-  private boolean analyzeLongText(String input,List<CompoundEntry> outputs, boolean isFirst) {

-    

+  private CompoundEntry[] analyzeLongText(String input, boolean isFirst) {

     int len = input.length();

-    

-    // ignore less than 3 letters or more than 20 letters.

-    if(len>20) return false; 

 

     boolean hasSuffix = isFirst && DictionaryUtil.existSuffix(input.substring(len-1));        

-    int pos = caculatePos(input, hasSuffix);

-    if(pos<1) return false; // fail to search a valid word segment

-    

-    if(pos==input.length()) {     

-      if(hasSuffix) {

-        outputs.add(

-            new CompoundEntry(input.substring(0,len-1), true));

-        outputs.add(

-            new CompoundEntry(input.substring(len-1), true));

-      } else {

-        outputs.add(

-            new CompoundEntry(input, true));

-

-      } 

-      

-      return true;

+    int pos = calculatePos(input, hasSuffix);

+    if (pos < 1) {

+      return null; // fail to search a valid word segment

     }

     

-    List<CompoundEntry> results = new ArrayList<CompoundEntry>();

-        

-    String prev = input.substring(0,pos);

+    // whole word (or word+suffix)

+    if (pos == input.length()) {     

+      if (hasSuffix) {

+        return new CompoundEntry[] { 

+            new CompoundEntry(input.substring(0, len-1), true),

+            new CompoundEntry(input.substring(len-1), true)

+        };

+      } else {

+        return new CompoundEntry[] { new CompoundEntry(input, true) };

+      } 

+    }

+    

+    String prev = input.substring(0, pos);

     String rear = input.substring(pos);

     

-    boolean pSucess = false;

-    boolean rSuccess = false;

-    WordEntry prvEntry = DictionaryUtil.getAllNoun(prev);

-    if(prvEntry==null) {

-      CompoundEntry res[] = analyze(prev, false);

-      if (res == null) {

-        results.add(new CompoundEntry(prev, false));

-      } else {

-        results.addAll(Arrays.asList(res));

-        pSucess = true;

-      }

-    } else {

-      pSucess = true;

-      if(prvEntry.isCompoundNoun())

-        results.addAll(prvEntry.getCompounds());

-      else

-        results.add(new CompoundEntry(prev, true));

+    CompoundEntry[] pRes = analyzeLongPart(prev);

+    CompoundEntry[] rRes = analyzeLongPart(rear);

+    

+    if (pRes == null && rRes == null) {

+      return null; // no good split

+    } else if (pRes == null) {

+      pRes = new CompoundEntry[] { new CompoundEntry(prev, false) };

+    } else if (rRes == null) {

+      rRes = new CompoundEntry[] { new CompoundEntry(rear, false) };

     }

     

-    WordEntry rearEntry = DictionaryUtil.getAllNoun(rear);

-    if(rearEntry==null) {

-      CompoundEntry res[] = analyze(rear, false);

-      if (res == null) {

-        results.add(new CompoundEntry(rear, false));

-      } else {

-        results.addAll(Arrays.asList(res));

-        rSuccess = true;

-      }

-    } else {

-      rSuccess = true;

-      if(rearEntry.isCompoundNoun())

-        results.addAll(rearEntry.getCompounds());

-      else

-        results.add(new CompoundEntry(rear, true));

-    }

-    

-    if(!pSucess&&!rSuccess) {

-      return false;

-    }

-    

-    outputs.addAll(results);

-    

-    return true;

+    CompoundEntry result[] = new CompoundEntry[pRes.length + rRes.length];

+    System.arraycopy(pRes, 0, result, 0, pRes.length);

+    System.arraycopy(rRes, 0, result, pRes.length, rRes.length);

+    return result;

   }

   

   /**

@@ -264,7 +243,7 @@
    * @param input the input string

    * @return  the position

    */

-  private int caculatePos(String input, boolean hasSuffix) {

+  private int calculatePos(String input, boolean hasSuffix) {

   

     int pos = -1;

     int len = input.length();

@@ -317,17 +296,17 @@
   }

   

   private CompoundEntry[] analysisBySplited(int[] units, String input, boolean isFirst) {

-  

     CompoundEntry[] entries = new CompoundEntry[units.length];

     

     int pos = 0;

     String prev = null;

     

-    for(int i=0;i<units.length;i++) {

-      

-      String str = input.substring(pos,pos+units[i]);

+    for (int i = 0; i < units.length; i++) {

+      String str = input.substring(pos, pos + units[i]);

 

-      if(i!=0&&!validCompound(prev,str,isFirst&&(i==1),i)) return null;

+      if (i != 0 && !validCompound(prev, str, isFirst && (i==1), i)) {

+        return null;

+      }

       

       entries[i] = analyzeSingle(str); // CompoundEntry 로 변환

 

@@ -336,7 +315,6 @@
     }

     

     return entries;

-    

   }

   

   /**

@@ -345,7 +323,9 @@
    * @return compound entry

    */

   private CompoundEntry analyzeSingle(String input) {

-    if(input.length()==1) return  new CompoundEntry(input, true);

+    if (input.length() == 1) {

+      return new CompoundEntry(input, true);

+    }

     

     WordEntry entry = DictionaryUtil.getWordExceptVerb(input);

 


diff --git a/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/WSOutput.java b/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/WSOutput.java
index 9ea3482..4501a82 100644
--- a/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/WSOutput.java
+++ b/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/WSOutput.java

@@ -24,17 +24,11 @@
 

 class WSOutput  implements Cloneable {

 

-  private int lastStart = 0;

-  

+  private int lastStart = 0; // nocommit, seems unused?

   private int lastEnd = 0;  

-  

   private List<AnalysisOutput> phrases = new ArrayList<AnalysisOutput>();

   

-  int getLastStart() {

-    return lastStart;

-  }

-

-  void setLastStart(int start) {

+  private void setLastStart(int start) {

     this.lastStart = start;

   }

 

@@ -42,33 +36,29 @@
     return lastEnd;

   }

 

-  void setLastEnd(int end) {

+  private void setLastEnd(int end) {

     this.lastStart = end;

   }

   

-

   List<AnalysisOutput> getPhrases() {

     return phrases;

   }

 

   void removeLast() {

+    if (phrases.isEmpty()) {

+      return;

+    }

         

-    if(this.phrases.size()==0) return;

+    AnalysisOutput o = phrases.remove(phrases.size()-1);

     

-    AnalysisOutput o = this.phrases.remove(this.phrases.size()-1);

-    

-    if(this.phrases.size()==0) {

-      

-      this.lastStart = 0;

-      this.lastEnd = 0;

-      

+    if (phrases.isEmpty()) {

+      lastStart = 0;

+      lastEnd = 0;

     } else {

-      

-      this.lastEnd -= o.getSource().length();

-      

-      if(this.phrases.size()>1) {

-        AnalysisOutput o1 = this.phrases.get(this.phrases.size()-1);

-        this.lastStart = lastEnd-o1.getSource().length();

+      lastEnd -= o.getSource().length();

+      if (phrases.size() > 1) {

+        AnalysisOutput o1 = phrases.get(phrases.size()-1);

+        this.lastStart = lastEnd - o1.getSource().length();

       } else {

         this.lastStart = 0;

       }

@@ -76,76 +66,74 @@
   }

   

   void addPhrase(AnalysisOutput o) {

-

-    this.lastStart = this.lastEnd;

-    this.lastEnd += o.getSource().length();

+    lastStart = lastEnd;

+    lastEnd += o.getSource().length();

     

-    if(o.getCNounList().size()==0)

-      this.phrases.add(o);

+    if (o.getCNounList().size() == 0)

+      phrases.add(o);

     else

       addCompounds(o);

   }

   

-  void addCompounds(AnalysisOutput o) {

+  private void addCompounds(AnalysisOutput o) {

     

     List<CompoundEntry> cnouns = o.getCNounList();

-      

     String source = o.getSource();    

-    int rmstemlen = 0;

-    

-//    for(int i=0;i<cnouns.size();i++) {

-//      System.out.println(cnouns.get(i).getWord());

-//    }

-    for(int i=0;i<cnouns.size()-1;i++) {

+

+    for (int i = 0; i < cnouns.size() - 1; i++) {

       

       String noun = cnouns.get(i).getWord();      

       boolean isOnechar = false;

     

       // 접두어는 처음 음절에만 온다. 복합명사 분해규칙

       // 처음이 아닌 경우 1글자는 앞 문자와 결합한다.

-      if(cnouns.get(i).getWord().length()==1 ||

-          cnouns.get(i+1).getWord().length()==1) { // 접두어는 처음 음절에만 온다. 복합명사 분해규칙

+      if (cnouns.get(i).getWord().length() == 1 ||

+          cnouns.get(i+1).getWord().length() == 1) { // 접두어는 처음 음절에만 온다. 복합명사 분해규칙

         noun += cnouns.get(i+1).getWord();      

         isOnechar = true;

       }

       

-      if(isOnechar && i>=cnouns.size()-2) break;

+      if (isOnechar && i >= cnouns.size()-2) {

+        break;

+      }

             

       int score = AnalysisOutput.SCORE_CORRECT;

-      if(!cnouns.get(i).isExist()) score=AnalysisOutput.SCORE_CANDIDATE;

+      if (!cnouns.get(i).isExist()) {

+        score = AnalysisOutput.SCORE_CANDIDATE;

+      }

       

-      AnalysisOutput o1 = new AnalysisOutput(noun, null, null, 

-          PatternConstants.POS_NOUN, PatternConstants.PTN_N, score);

+      AnalysisOutput o1 = new AnalysisOutput(noun, null, null, PatternConstants.POS_NOUN, PatternConstants.PTN_N, score);

       

       o1.setSource(noun);

       

-      if(isOnechar) {

+      if (isOnechar) {

         o1.addCNoun(cnouns.get(i));

         o1.addCNoun(cnouns.get(i+1));

       }

     

-      if(source.length()>noun.length())

+      if (source.length()>noun.length()) {

         source = source.substring(noun.length());

+      }

       

-      this.phrases.add(o1);

+      phrases.add(o1);

       cnouns.remove(cnouns.get(0));

       i--;

       

-      if(isOnechar) {

+      if (isOnechar) {

         cnouns.remove(cnouns.get(0));

       }

-

     }

     

-    o.setStem(o.getStem().substring(o.getSource().length()-source.length()));

+    o.setStem(o.getStem().substring(o.getSource().length() - source.length()));

     o.setSource(source);

-    if(cnouns.size()==1) cnouns.remove(0);

+    if (cnouns.size() == 1) { 

+      cnouns.clear();

+    }

   

-    this.phrases.add(o);

-

+    phrases.add(o);

   }

   

-  void setPhrases(List<AnalysisOutput> phrases) {

+  private void setPhrases(List<AnalysisOutput> phrases) {

     this.phrases = phrases;

   }

   

@@ -154,7 +142,6 @@
     WSOutput candidate = (WSOutput)super.clone(); // FIXME: What's this? -Christian

     

     candidate.setLastStart(lastStart);

-    

     candidate.setLastEnd(lastEnd);

     

     List<AnalysisOutput> list = new ArrayList<AnalysisOutput>();


diff --git a/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/WordSpaceAnalyzer.java b/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/WordSpaceAnalyzer.java
index 6df6620..8ab00d0 100644
--- a/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/WordSpaceAnalyzer.java
+++ b/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/WordSpaceAnalyzer.java

@@ -37,9 +37,7 @@
     WSOutput output = new WSOutput();

     

     int wStart = 0;

-    

-    int sgCount;

-    

+        

     Map<Integer, Integer> fCounter = new HashMap<Integer, Integer>();

     

     for(int i=0;i<input.length();i++) {           

@@ -61,7 +59,6 @@
       // 이 경우는 다음 음절을 조사하여 

       } else if(i!= input.length()-1 && prefixExists) { 

         // 아무짓도 하지 않음.

-        sgCount = i;

       } else if(!prefixExists && 

           (entry=DictionaryUtil.getBusa(input.substring(wStart,i+1)))!=null) {        

         candidates.add(buildSingleOutput(entry));

@@ -210,8 +207,7 @@
     boolean hasJosa = false;

     for(int i=start;i>=jstart;i--) {

       String str = snippet.substring(jstart,i+1);

-      if(DictionaryUtil.existJosa(str) && !findNounWithinStr(snippet,i,i+2) &&

-          !isNounPart(snippet,jstart)) {

+      if(DictionaryUtil.existJosa(str) && !findNounWithinStr(snippet,i,i+2)) {

         jend = i;

         hasJosa = true;

         break;

@@ -550,19 +546,4 @@
     

     return false;

   }

-  

-  private boolean isNounPart(String str, int jstart)  {

-    

-    if(true) return false;

-    

-    for(int i=jstart-1;i>=0;i--) {      

-      if(DictionaryUtil.getWordExceptVerb(str.substring(i,jstart+1))!=null)

-        return true;

-      

-    }

-    

-    

-    return false;

-    

-  }

 }
commit	e4d7743542ef3cae273685fd1f067113ed4d3d97	[log] [tgz]
author	Robert Muir <rmuir@apache.org>	Mon Oct 28 00:43:30 2013 +0000
committer	Robert Muir <rmuir@apache.org>	Mon Oct 28 00:43:30 2013 +0000
tree	c8a3b67857f6b8d3966e9249f88c2e093f38fad4
parent	220720bed762abfb4aa3d4f3f034ed30a0142d0c [diff]