LUCENE-4956: more refactoring of decompounding
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene4956@1536231 13f79535-47bb-0310-9956-ffa450edef68
diff --git a/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java b/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java
index a2be18e..5f8dae0 100644
--- a/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java
+++ b/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java
@@ -148,12 +148,12 @@
}
/** return list of irregular compounds for word class. */
- static List<CompoundEntry> getIrregularCompounds(byte clazz) {
+ static CompoundEntry[] getIrregularCompounds(byte clazz) {
return dictionary.getIrregularCompounds(clazz);
}
/** return list of compounds for key and word class. */
- static List<CompoundEntry> getCompounds(String key, byte clazz) {
+ static CompoundEntry[] getCompounds(String key, byte clazz) {
return dictionary.getCompounds(key, clazz);
}
diff --git a/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/HangulDictionary.java b/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/HangulDictionary.java
index 953fa80..b4440d4 100644
--- a/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/HangulDictionary.java
+++ b/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/HangulDictionary.java
@@ -110,24 +110,24 @@
/** return list of compounds for key and word class.
* this retrieves the splits for the class and applies them to the key */
- List<CompoundEntry> getCompounds(String word, byte clazz) {
+ CompoundEntry[] getCompounds(String word, byte clazz) {
int off = clazz * RECORD_SIZE;
int numSplits = metadata[off+2];
assert numSplits > 0;
- List<CompoundEntry> compounds = new ArrayList<>(numSplits+1);
+ CompoundEntry compounds[] = new CompoundEntry[numSplits+1];
int last = 0;
for (int i = 0; i < numSplits; i++) {
int split = metadata[off+3+i];
- compounds.add(new CompoundEntry(word.substring(last, split), true));
+ compounds[i] = new CompoundEntry(word.substring(last, split), true);
last = split;
}
- compounds.add(new CompoundEntry(word.substring(last), true));
+ compounds[numSplits] = new CompoundEntry(word.substring(last), true);
return compounds;
}
/** return list of compounds for key and word class.
* this retrieves the decompounded data for this irregular class */
- List<CompoundEntry> getIrregularCompounds(byte clazz) {
+ CompoundEntry[] getIrregularCompounds(byte clazz) {
int off = clazz * RECORD_SIZE;
int numChars = metadata[off+2];
// TODO: more efficient
@@ -144,7 +144,7 @@
}
}
compounds.add(new CompoundEntry(sb.toString(), true));
- return compounds;
+ return compounds.toArray(new CompoundEntry[compounds.size()]);
}
/** walks the fst for prefix and returns true if it his no dead end */
diff --git a/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/WordEntry.java b/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/WordEntry.java
index e95c677..84fae0d 100644
--- a/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/WordEntry.java
+++ b/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/WordEntry.java
@@ -92,7 +92,7 @@
}
/** Returns List of compounds for word */
- public List<CompoundEntry> getCompounds() {
+ public CompoundEntry[] getCompounds() {
assert isCompoundNoun();
// TODO: should we cache this here? see if someone is calling this repeatedly? i hope not.
if ((features & COMPOUND_IRREGULAR) != 0) {
diff --git a/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/AnalysisOutput.java b/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/AnalysisOutput.java
index 3c63f54..a4247b3 100644
--- a/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/AnalysisOutput.java
+++ b/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/AnalysisOutput.java
@@ -188,6 +188,19 @@
compound.addAll(cnoun);
}
+ // nocommit
+ public void setCNounList(CompoundEntry[] cnoun) {
+ compound.clear();
+ addCNouns(cnoun);
+ }
+
+ // nocommit
+ public void addCNouns(CompoundEntry[] cnoun) {
+ for (CompoundEntry e : cnoun) {
+ compound.add(e);
+ }
+ }
+
/**
* @return the source
*/
diff --git a/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/CompoundNounAnalyzer.java b/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/CompoundNounAnalyzer.java
index 8228129..a1dc380 100644
--- a/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/CompoundNounAnalyzer.java
+++ b/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/CompoundNounAnalyzer.java
@@ -37,11 +37,16 @@
/** Returns decompounded list for word, or null */
public List<CompoundEntry> analyze(String input) {
+ if (input.length() < 3 || input.length() > 20) {
+ // ignore inputs shorter than 3 characters or longer than 20 characters.
+ return null;
+ }
WordEntry entry = DictionaryUtil.getCompoundNoun(input);
if (entry != null) {
- return entry.getCompounds();
- } else if (input.length() < 3) {
- return null;
+ // nocommit
+ ArrayList<CompoundEntry> l = new ArrayList<CompoundEntry>();
+ l.addAll(Arrays.asList(entry.getCompounds()));
+ return l;
} else {
CompoundEntry[] compounds = analyze(input, true);
if (compounds == null) {
@@ -56,18 +61,11 @@
}
private CompoundEntry[] analyze(String input, boolean isFirst) {
- switch(input.length()) {
+ switch (input.length()) {
case 3: return analyze3Word(input, isFirst);
case 4: return analyze4Word(input, isFirst);
case 5: return analyze5Word(input, isFirst);
- default:
- List<CompoundEntry> outputs = new ArrayList<>();
- boolean success = analyzeLongText(input, outputs, isFirst);
- if (success) {
- return outputs.toArray(new CompoundEntry[0]); // nocommit
- } else {
- return null;
- }
+ default: return analyzeLongText(input, isFirst);
}
}
@@ -182,81 +180,62 @@
return res;
}
+
+ /**
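+ * analyzes one part of a split made by analyzeLongText: uses the dictionary entry when getAllNoun finds one, otherwise falls back to analyze(input, false), which may return null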
+ */
+ private CompoundEntry[] analyzeLongPart(String input) {
+ WordEntry e = DictionaryUtil.getAllNoun(input);
+ if (e == null) {
+ return analyze(input, false);
+ } else {
+ if (e.isCompoundNoun()) {
+ return e.getCompounds();
+ } else {
+ return new CompoundEntry[] { new CompoundEntry(input, true) };
+ }
+ }
+ }
- private boolean analyzeLongText(String input,List<CompoundEntry> outputs, boolean isFirst) {
-
+ private CompoundEntry[] analyzeLongText(String input, boolean isFirst) {
int len = input.length();
-
- // ignore less than 3 letters or more than 20 letters.
- if(len>20) return false;
boolean hasSuffix = isFirst && DictionaryUtil.existSuffix(input.substring(len-1));
- int pos = caculatePos(input, hasSuffix);
- if(pos<1) return false; // fail to search a valid word segment
-
- if(pos==input.length()) {
- if(hasSuffix) {
- outputs.add(
- new CompoundEntry(input.substring(0,len-1), true));
- outputs.add(
- new CompoundEntry(input.substring(len-1), true));
- } else {
- outputs.add(
- new CompoundEntry(input, true));
-
- }
-
- return true;
+ int pos = calculatePos(input, hasSuffix);
+ if (pos < 1) {
+ return null; // fail to search a valid word segment
}
- List<CompoundEntry> results = new ArrayList<CompoundEntry>();
-
- String prev = input.substring(0,pos);
+ // whole word (or word+suffix)
+ if (pos == input.length()) {
+ if (hasSuffix) {
+ return new CompoundEntry[] {
+ new CompoundEntry(input.substring(0, len-1), true),
+ new CompoundEntry(input.substring(len-1), true)
+ };
+ } else {
+ return new CompoundEntry[] { new CompoundEntry(input, true) };
+ }
+ }
+
+ String prev = input.substring(0, pos);
String rear = input.substring(pos);
- boolean pSucess = false;
- boolean rSuccess = false;
- WordEntry prvEntry = DictionaryUtil.getAllNoun(prev);
- if(prvEntry==null) {
- CompoundEntry res[] = analyze(prev, false);
- if (res == null) {
- results.add(new CompoundEntry(prev, false));
- } else {
- results.addAll(Arrays.asList(res));
- pSucess = true;
- }
- } else {
- pSucess = true;
- if(prvEntry.isCompoundNoun())
- results.addAll(prvEntry.getCompounds());
- else
- results.add(new CompoundEntry(prev, true));
+ CompoundEntry[] pRes = analyzeLongPart(prev);
+ CompoundEntry[] rRes = analyzeLongPart(rear);
+
+ if (pRes == null && rRes == null) {
+ return null; // no good split
+ } else if (pRes == null) {
+ pRes = new CompoundEntry[] { new CompoundEntry(prev, false) };
+ } else if (rRes == null) {
+ rRes = new CompoundEntry[] { new CompoundEntry(rear, false) };
}
- WordEntry rearEntry = DictionaryUtil.getAllNoun(rear);
- if(rearEntry==null) {
- CompoundEntry res[] = analyze(rear, false);
- if (res == null) {
- results.add(new CompoundEntry(rear, false));
- } else {
- results.addAll(Arrays.asList(res));
- rSuccess = true;
- }
- } else {
- rSuccess = true;
- if(rearEntry.isCompoundNoun())
- results.addAll(rearEntry.getCompounds());
- else
- results.add(new CompoundEntry(rear, true));
- }
-
- if(!pSucess&&!rSuccess) {
- return false;
- }
-
- outputs.addAll(results);
-
- return true;
+ CompoundEntry result[] = new CompoundEntry[pRes.length + rRes.length];
+ System.arraycopy(pRes, 0, result, 0, pRes.length);
+ System.arraycopy(rRes, 0, result, pRes.length, rRes.length);
+ return result;
}
/**
@@ -264,7 +243,7 @@
* @param input the input string
* @return the position
*/
- private int caculatePos(String input, boolean hasSuffix) {
+ private int calculatePos(String input, boolean hasSuffix) {
int pos = -1;
int len = input.length();
@@ -317,17 +296,17 @@
}
private CompoundEntry[] analysisBySplited(int[] units, String input, boolean isFirst) {
-
CompoundEntry[] entries = new CompoundEntry[units.length];
int pos = 0;
String prev = null;
- for(int i=0;i<units.length;i++) {
-
- String str = input.substring(pos,pos+units[i]);
+ for (int i = 0; i < units.length; i++) {
+ String str = input.substring(pos, pos + units[i]);
- if(i!=0&&!validCompound(prev,str,isFirst&&(i==1),i)) return null;
+ if (i != 0 && !validCompound(prev, str, isFirst && (i==1), i)) {
+ return null;
+ }
entries[i] = analyzeSingle(str); // CompoundEntry 로 변환
@@ -336,7 +315,6 @@
}
return entries;
-
}
/**
@@ -345,7 +323,9 @@
* @return compound entry
*/
private CompoundEntry analyzeSingle(String input) {
- if(input.length()==1) return new CompoundEntry(input, true);
+ if (input.length() == 1) {
+ return new CompoundEntry(input, true);
+ }
WordEntry entry = DictionaryUtil.getWordExceptVerb(input);
diff --git a/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/WSOutput.java b/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/WSOutput.java
index 9ea3482..4501a82 100644
--- a/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/WSOutput.java
+++ b/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/WSOutput.java
@@ -24,17 +24,11 @@
class WSOutput implements Cloneable {
- private int lastStart = 0;
-
+ private int lastStart = 0; // nocommit, seems unused?
private int lastEnd = 0;
-
private List<AnalysisOutput> phrases = new ArrayList<AnalysisOutput>();
- int getLastStart() {
- return lastStart;
- }
-
- void setLastStart(int start) {
+ private void setLastStart(int start) {
this.lastStart = start;
}
@@ -42,33 +36,29 @@
return lastEnd;
}
- void setLastEnd(int end) {
- this.lastStart = end;
+ private void setLastEnd(int end) { // sets the lastEnd field; only used inside WSOutput
+ this.lastEnd = end; // fix: previously assigned lastStart, so lastEnd was never written here
}
-
List<AnalysisOutput> getPhrases() {
return phrases;
}
void removeLast() {
+ if (phrases.isEmpty()) {
+ return;
+ }
- if(this.phrases.size()==0) return;
+ AnalysisOutput o = phrases.remove(phrases.size()-1);
- AnalysisOutput o = this.phrases.remove(this.phrases.size()-1);
-
- if(this.phrases.size()==0) {
-
- this.lastStart = 0;
- this.lastEnd = 0;
-
+ if (phrases.isEmpty()) {
+ lastStart = 0;
+ lastEnd = 0;
} else {
-
- this.lastEnd -= o.getSource().length();
-
- if(this.phrases.size()>1) {
- AnalysisOutput o1 = this.phrases.get(this.phrases.size()-1);
- this.lastStart = lastEnd-o1.getSource().length();
+ lastEnd -= o.getSource().length();
+ if (phrases.size() > 1) {
+ AnalysisOutput o1 = phrases.get(phrases.size()-1);
+ this.lastStart = lastEnd - o1.getSource().length();
} else {
this.lastStart = 0;
}
@@ -76,76 +66,74 @@
}
void addPhrase(AnalysisOutput o) {
-
- this.lastStart = this.lastEnd;
- this.lastEnd += o.getSource().length();
+ lastStart = lastEnd;
+ lastEnd += o.getSource().length();
- if(o.getCNounList().size()==0)
- this.phrases.add(o);
+ if (o.getCNounList().size() == 0)
+ phrases.add(o);
else
addCompounds(o);
}
- void addCompounds(AnalysisOutput o) {
+ private void addCompounds(AnalysisOutput o) {
List<CompoundEntry> cnouns = o.getCNounList();
-
String source = o.getSource();
- int rmstemlen = 0;
-
-// for(int i=0;i<cnouns.size();i++) {
-// System.out.println(cnouns.get(i).getWord());
-// }
- for(int i=0;i<cnouns.size()-1;i++) {
+
+ for (int i = 0; i < cnouns.size() - 1; i++) {
String noun = cnouns.get(i).getWord();
boolean isOnechar = false;
// 접두어는 처음 음절에만 온다. 복합명사 분해규칙
// 처음이 아닌 경우 1글자는 앞 문자와 결합한다.
- if(cnouns.get(i).getWord().length()==1 ||
- cnouns.get(i+1).getWord().length()==1) { // 접두어는 처음 음절에만 온다. 복합명사 분해규칙
+ if (cnouns.get(i).getWord().length() == 1 ||
+ cnouns.get(i+1).getWord().length() == 1) { // 접두어는 처음 음절에만 온다. 복합명사 분해규칙
noun += cnouns.get(i+1).getWord();
isOnechar = true;
}
- if(isOnechar && i>=cnouns.size()-2) break;
+ if (isOnechar && i >= cnouns.size()-2) {
+ break;
+ }
int score = AnalysisOutput.SCORE_CORRECT;
- if(!cnouns.get(i).isExist()) score=AnalysisOutput.SCORE_CANDIDATE;
+ if (!cnouns.get(i).isExist()) {
+ score = AnalysisOutput.SCORE_CANDIDATE;
+ }
- AnalysisOutput o1 = new AnalysisOutput(noun, null, null,
- PatternConstants.POS_NOUN, PatternConstants.PTN_N, score);
+ AnalysisOutput o1 = new AnalysisOutput(noun, null, null, PatternConstants.POS_NOUN, PatternConstants.PTN_N, score);
o1.setSource(noun);
- if(isOnechar) {
+ if (isOnechar) {
o1.addCNoun(cnouns.get(i));
o1.addCNoun(cnouns.get(i+1));
}
- if(source.length()>noun.length())
+ if (source.length()>noun.length()) {
source = source.substring(noun.length());
+ }
- this.phrases.add(o1);
+ phrases.add(o1);
cnouns.remove(cnouns.get(0));
i--;
- if(isOnechar) {
+ if (isOnechar) {
cnouns.remove(cnouns.get(0));
}
-
}
- o.setStem(o.getStem().substring(o.getSource().length()-source.length()));
+ o.setStem(o.getStem().substring(o.getSource().length() - source.length()));
o.setSource(source);
- if(cnouns.size()==1) cnouns.remove(0);
+ if (cnouns.size() == 1) {
+ cnouns.clear();
+ }
- this.phrases.add(o);
-
+ phrases.add(o);
}
- void setPhrases(List<AnalysisOutput> phrases) {
+ private void setPhrases(List<AnalysisOutput> phrases) {
this.phrases = phrases;
}
@@ -154,7 +142,6 @@
WSOutput candidate = (WSOutput)super.clone(); // FIXME: What's this? -Christian
candidate.setLastStart(lastStart);
-
candidate.setLastEnd(lastEnd);
List<AnalysisOutput> list = new ArrayList<AnalysisOutput>();
diff --git a/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/WordSpaceAnalyzer.java b/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/WordSpaceAnalyzer.java
index 6df6620..8ab00d0 100644
--- a/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/WordSpaceAnalyzer.java
+++ b/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/WordSpaceAnalyzer.java
@@ -37,9 +37,7 @@
WSOutput output = new WSOutput();
int wStart = 0;
-
- int sgCount;
-
+
Map<Integer, Integer> fCounter = new HashMap<Integer, Integer>();
for(int i=0;i<input.length();i++) {
@@ -61,7 +59,6 @@
// 이 경우는 다음 음절을 조사하여
} else if(i!= input.length()-1 && prefixExists) {
// 아무짓도 하지 않음.
- sgCount = i;
} else if(!prefixExists &&
(entry=DictionaryUtil.getBusa(input.substring(wStart,i+1)))!=null) {
candidates.add(buildSingleOutput(entry));
@@ -210,8 +207,7 @@
boolean hasJosa = false;
for(int i=start;i>=jstart;i--) {
String str = snippet.substring(jstart,i+1);
- if(DictionaryUtil.existJosa(str) && !findNounWithinStr(snippet,i,i+2) &&
- !isNounPart(snippet,jstart)) {
+ if(DictionaryUtil.existJosa(str) && !findNounWithinStr(snippet,i,i+2)) {
jend = i;
hasJosa = true;
break;
@@ -550,19 +546,4 @@
return false;
}
-
- private boolean isNounPart(String str, int jstart) {
-
- if(true) return false;
-
- for(int i=jstart-1;i>=0;i--) {
- if(DictionaryUtil.getWordExceptVerb(str.substring(i,jstart+1))!=null)
- return true;
-
- }
-
-
- return false;
-
- }
}