blob: 528eb4d5aa73f03d61ebd0343ab6223ef9641e3d [file] [log] [blame]
package opennlp.tools.parse_thicket;
import java.util.ArrayList;
import java.util.List;
public class ParseTreeNode implements IGeneralizer<ParseTreeNode>{
String word;
// this is the POS tag of the token
String pos;
// this is the NER label of the token
String ne;
Integer id;
//PhraseType
String phraseType;
public enum PhraseType {NP("NP"), VP("VP"), PRP("PRP");
private PhraseType(final String text) {
this.text = text;
}
private final String text;
}
public ParseTreeNode(String word, String pos, String ne, Integer id) {
super();
this.word = word;
this.pos = pos;
this.ne = ne;
this.id = id;
}
public ParseTreeNode(String word, String pos) {
super();
this.word = word;
this.pos = pos;
this.ne = ne;
this.id = id;
}
public String getPhraseType() {
return phraseType;
}
public void setPhraseType(String pt) {
this.phraseType=pt;
}
public String getWord() {
return word;
}
public void setWord(String word) {
this.word = word;
}
public String getPos() {
return pos;
}
public void setPos(String pos) {
this.pos = pos;
}
public String getNe() {
return ne;
}
public void setNe(String ne) {
this.ne = ne;
}
public Integer getId() {
return id;
}
public void setId(Integer id) {
this.id = id;
}
public String toString(){
StringBuffer buf = new StringBuffer();
if (id!=null)
buf.append("<"+id+">");
if(phraseType!=null)
buf.append(phraseType);
if(word!=null)
buf.append("'"+word+"'");
if (pos!=null)
buf.append(":"+pos);
return buf.toString();
}
@Override
public List<ParseTreeNode> generalize(Object o1, Object o2) {
List<ParseTreeNode> result = new ArrayList<ParseTreeNode>();
ParseTreeNode w1 = (ParseTreeNode) o1;
ParseTreeNode w2 = (ParseTreeNode) o2;
String posGen = generalizePOS(w1.pos, w2.pos);
if (posGen ==null)
return result;
ParseTreeNode newNode = new ParseTreeNode(generalizeWord(w1.word, w2.word),
posGen, "O", -1);
result.add(newNode);
return result;
}
public String generalizeWord(String lemma1, String lemma2){
if (lemma1.equals(lemma2))
return lemma1;
if (lemma1.equals("*"))
return "*";
if (lemma2.equals("*"))
return "*";
//TODO
return "*";
}
public String generalizePOS(String pos1, String pos2) {
if ((pos1.startsWith("NN") && pos2.equals("NP") || pos2.startsWith("NN")
&& pos1.equals("NP"))) {
return "NN";
}
if ((pos1.startsWith("NN") && pos2.equals("VBG") || pos2.startsWith("VBG")
&& pos1.equals("NN"))) {
return "NN";
}
if ((pos1.startsWith("NN") && pos2.equals("ADJP") || pos2.startsWith("NN")
&& pos1.equals("ADJP"))) {
return "NN";
}
if ((pos1.equals("IN") && pos2.equals("TO") || pos1.equals("TO")
&& pos2.equals("IN"))) {
return "IN";
}
// VBx vs VBx = VB (does not matter which form for verb)
if (pos1.startsWith("VB") && pos2.startsWith("VB")) {
return "VB";
}
// ABx vs ABy always gives AB
if (pos1.equalsIgnoreCase(pos2)) {
return pos1;
}
if (pos1.length() > 2) {
pos1 = pos1.substring(0, 2);
}
if (pos2.length() > 2) {
pos2 = pos2.substring(0, 2);
}
if (pos1.equalsIgnoreCase(pos2)) {
return pos1 + "*";
}
return null;
}
};