blob: ec36639b3d37c680b5bb9251975110d4cacffb95 [file] [log] [blame]
package org.apache.lucene.analysis.ko.morph;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.ko.dic.DictionaryUtil;
import org.apache.lucene.analysis.ko.dic.SyllableFeatures;
class EomiUtil {
private EomiUtil() {}
/**
* 선어말어미를 분석한다.
*/
static String[] splitPomi(String stem) {
// results[0]:성공(1)/실패(0), results[1]: 어근, results[2]: 선어말어미
String[] results = new String[2];
results[0] = stem;
if(stem==null||stem.length()==0||"있".equals(stem)) return results;
char[] chrs = stem.toCharArray();
int len = chrs.length;
String pomi = "";
int index = len-1;
char[] jaso = MorphUtil.decompose(chrs[index]);
if(chrs[index]!='시'&&chrs[index]!='ㅆ'&&jaso[jaso.length-1]!='ㅆ') return results; // 선어말어미가 발견되지 않았다
if(chrs[index]=='겠') {
pomi = "겠";
setPomiResult(results,stem.substring(0,index),pomi);
if(--index<=0||
(chrs[index]!='시'&&chrs[index]!='ㅆ'&&jaso[jaso.length-1]!='ㅆ'))
return results; // 다음이거나 선어말어미가 없다면...
jaso = MorphUtil.decompose(chrs[index]);
}
if(chrs[index]=='었') { // 시었, ㅆ었, 었
pomi = chrs[index]+pomi;
setPomiResult(results,stem.substring(0,index),pomi);
if(--index<=0||
(chrs[index]!='시'&&chrs[index]!='ㅆ'&&jaso[jaso.length-1]!='ㅆ'))
return results; // 다음이거나 선어말어미가 없다면...
jaso = MorphUtil.decompose(chrs[index]);
}
if(chrs[index]=='였'){
pomi = MorphUtil.replaceJongsung('어',chrs[index])+pomi;
if(index>0&&chrs[index-1]=='하')
stem = stem.substring(0,index);
else
stem = stem.substring(0,index)+"이";
setPomiResult(results,stem,pomi);
}else if(chrs[index]=='셨'){
pomi = MorphUtil.replaceJongsung('어',chrs[index])+pomi;
stem = stem.substring(0,index);
setPomiResult(results,stem,"시"+pomi);
}else if(chrs[index]=='았'||chrs[index]=='었') {
pomi = chrs[index]+pomi;
setPomiResult(results,stem.substring(0,index),pomi);
if(--index<=0||
(chrs[index]!='시'&&chrs[index]!='으')) return results; // 다음이거나 선어말어미가 없다면...
jaso = MorphUtil.decompose(chrs[index]);
}else if(jaso.length==3&&jaso[2]=='ㅆ') {
if(jaso[0]=='ㅎ'&&jaso[1]=='ㅐ') {
pomi = MorphUtil.replaceJongsung('어',chrs[index])+pomi;
stem = stem.substring(0,index)+"하";
}else if(jaso[0]!='ㅇ'&&(jaso[1]=='ㅏ'||jaso[1]=='ㅓ'||jaso[1]=='ㅔ'||jaso[1]=='ㅐ')) {
pomi = "었"+pomi;
stem = stem.substring(0,index)+MorphUtil.makeChar(chrs[index], 0);
}else if(jaso[0]!='ㅇ'&&(jaso[1]=='ㅙ')) {
pomi = "었"+pomi;
stem = stem.substring(0,index)+MorphUtil.makeChar(chrs[index],11, 0);
} else if(jaso[1]=='ㅘ') {
pomi = MorphUtil.replaceJongsung('아',chrs[index])+pomi;
stem = stem.substring(0,index)+MorphUtil.makeChar(chrs[index],8, 0);
} else if(jaso[1]=='ㅝ') {
pomi = MorphUtil.replaceJongsung('어',chrs[index])+pomi;
stem = stem.substring(0,index)+MorphUtil.makeChar(chrs[index],13, 0);
} else if(jaso[1]=='ㅕ') {
pomi = MorphUtil.replaceJongsung('어',chrs[index])+pomi;
stem = stem.substring(0,index)+MorphUtil.makeChar(chrs[index],20, 0);
} else if(jaso[1]=='ㅐ') {
pomi = MorphUtil.replaceJongsung('어',chrs[index])+pomi;
stem = stem.substring(0,index);
} else if(jaso[1]=='ㅒ') {
pomi = MorphUtil.replaceJongsung('애',chrs[index])+pomi;
stem = stem.substring(0,index);
} else {
pomi = "었"+pomi;
}
setPomiResult(results,stem,pomi);
if(chrs[index]!='시'&&chrs[index]!='으') return results; // 다음이거나 선어말어미가 없다면...
jaso = MorphUtil.decompose(chrs[index]);
}
char[] nChrs = null;
if(index>0) nChrs = MorphUtil.decompose(chrs[index-1]);
else nChrs = new char[2];
if(nChrs.length==2&&chrs[index]=='시'&&(chrs.length<=index+1||
(chrs.length>index+1&&chrs[index+1]!='셨'))) {
if (DictionaryUtil.hasWord(results[0])) {
return results; //'시'가 포함된 단어가 있다. 성가시다/도시다/들쑤시다
}
pomi = chrs[index]+pomi;
setPomiResult(results,stem.substring(0,index),pomi);
if(--index==0||chrs[index]!='으') return results; // 다음이거나 선어말어미가 없다면...
jaso = MorphUtil.decompose(chrs[index]);
}
if(index>0) nChrs = MorphUtil.decompose(chrs[index-1]);
else nChrs = new char[2];
if(chrs.length>index+1&&nChrs.length==3&&(chrs[index+1]=='셨'||chrs[index+1]=='시')&&chrs[index]=='으') {
pomi = chrs[index]+pomi;
setPomiResult(results,stem.substring(0,index),pomi);
}
return results;
}
private static void setPomiResult(String[] results,String stem, String pomi ) {
results[0] = stem;
results[1] = pomi;
}
static boolean IsNLMBSyl(char ech, char lch) {
switch(lch) {
case 'ㄴ' :
return SyllableFeatures.hasFeature(ech, SyllableFeatures.YNPNA) || SyllableFeatures.hasFeature(ech, SyllableFeatures.YNPLN);
case 'ㄹ' :
return SyllableFeatures.hasFeature(ech, SyllableFeatures.YNPLA);
case 'ㅁ' :
return SyllableFeatures.hasFeature(ech, SyllableFeatures.YNPMA);
case 'ㅂ' :
return SyllableFeatures.hasFeature(ech, SyllableFeatures.YNPBA);
default:
return false;
}
}
/**
* 어미를 분리한다.
*
* 1. 규칙용언과 어간만 바뀌는 불규칙 용언
* 2. 어미가 종성 'ㄴ/ㄹ/ㅁ/ㅂ'으로 시작되는 어절
* 3. '여/거라/너라'의 불규칙 어절
* 4. 어미 '아/어'가 탈락되는 어절
* 5. '아/어'의 변이체 분리
*/
static String[] splitEomi(String stem, String end) {
int strlen = stem.length();
if (strlen == 0) {
return null;
}
char estem = stem.charAt(strlen-1);
char[] chrs = MorphUtil.decompose(estem);
if (chrs.length == 1) {
return null; // 한글이 아니라면...
}
if (chrs.length == 3 &&
(chrs[2] == 'ㄴ' || chrs[2] == 'ㄹ' || chrs[2] == 'ㅁ' || chrs[2] == 'ㅂ') &&
EomiUtil.IsNLMBSyl(estem,chrs[2]) &&
combineAndEomiCheck(chrs[2], end)) {
String strs[] = new String[2];
strs[1] = Character.toString(chrs[2]);
if (end.length() > 0) strs[1] += end;
strs[0] = stem.substring(0,strlen-1) + MorphUtil.makeChar(estem, 0);
return strs;
} else if (chrs.length==3 &&
chrs[2]=='ㄹ' &&
DictionaryUtil.hasVerb(stem) &&
combineAndEomiCheck(chrs[2], end)) {
String strs[] = new String[2];
strs[1] = Character.toString(chrs[2]);
if (end.length() > 0) strs[1] += end;
strs[0] = stem; // "만들 때와는"에서 "만들"과 같은 경우
return strs;
} else if (estem == '해' && DictionaryUtil.existEomi("어"+end)) {
return new String[] { stem.substring(0,strlen-1)+"하", "어"+end };
} else if (estem == '히' && DictionaryUtil.existEomi("이"+end)) {
return new String[] { stem.substring(0,strlen-1)+"하", "이"+end };
} else if (chrs[0] != 'ㅇ' &&
(chrs[1] == 'ㅏ' || chrs[1] == 'ㅓ' || chrs[1] == 'ㅔ' || chrs[1] == 'ㅐ') &&
(chrs.length == 2 || SyllableFeatures.hasFeature(estem, SyllableFeatures.YNPAH)) &&
combineAndEomiCheck('어', end)) {
if (chrs.length == 2) {
return new String[] { stem, "어"+end };
} else {
return new String[] { stem, end };
}
} else if (estem == '하' &&
end.startsWith("여") &&
combineAndEomiCheck('어', end.substring(1))) {
return new String[] { stem, "어"+end.substring(1) };
} else if (estem == '려' &&
combineAndEomiCheck('어', end)) {
// 꺼려=>꺼리어, 꺼려서=>꺼리어서
return new String[] {
stem.substring(0,stem.length()-1)+"리",
"어"+end
};
} else if ((chrs.length == 2) &&
(chrs[1] == 'ㅘ' || chrs[1] == 'ㅙ' || chrs[1] == 'ㅝ' || chrs[1] == 'ㅕ' || chrs[1] == 'ㅐ' || chrs[1] == 'ㅒ') &&
combineAndEomiCheck('어', end)) {
StringBuilder sb = new StringBuilder();
if (strlen > 1) {
sb.append(stem, 0, strlen-1);
}
switch (chrs[1]) {
case 'ㅘ': sb.append(MorphUtil.makeChar(estem, 8, 0));
sb.append(MorphUtil.replaceJongsung('아', estem));
break;
case 'ㅝ': sb.append(MorphUtil.makeChar(estem, 13, 0));
sb.append(MorphUtil.replaceJongsung('어', estem));
break;
case 'ㅙ': sb.append(MorphUtil.makeChar(estem, 11, 0));
sb.append(MorphUtil.replaceJongsung('어', estem));
break;
case 'ㅕ': sb.append(MorphUtil.makeChar(estem, 20, 0));
sb.append(MorphUtil.replaceJongsung('어',estem));
break;
case 'ㅐ': sb.append(MorphUtil.makeChar(estem, 0, 0));
sb.append(MorphUtil.replaceJongsung('어',estem));
break;
case 'ㅒ': sb.append(MorphUtil.makeChar(estem, 20, 0));
sb.append(MorphUtil.replaceJongsung('애',estem));
break;
}
String strs[] = new String[2];
strs[0] = sb.toString();
end = strs[0].substring(strs[0].length()-1)+end;
strs[0] = strs[0].substring(0,strs[0].length()-1);
strs[1] = end;
return strs;
} else if (end.length() > 0 && DictionaryUtil.existEomi(end)) {
return new String[]{stem, end};
} else {
return null;
}
}
/**
* ㄴ,ㄹ,ㅁ,ㅂ과 eomi 가 결합하여 어미가 될 수 있는지 점검한다.
*/
private static boolean combineAndEomiCheck(char s, String eomi) {
switch(s) {
case 'ㄴ': eomi = "은" + eomi;
break;
case 'ㄹ': eomi = "을" + eomi;
break;
case 'ㅁ': eomi = "음" + eomi;
break;
case 'ㅂ': eomi = "습" + eomi;
break;
default: eomi = s + eomi;
}
return DictionaryUtil.existEomi(eomi);
}
}