opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ParseTreeNode.java - opennlp-sandbox - Git at Google

 package opennlp.tools.parse_thicket;

 import java.util.ArrayList;
 import java.util.List;

 /**
  * A single parse-tree node: a word together with its POS tag, NER label,
  * numeric id and an optional phrase type.
  *
  * <p>Two nodes can be reduced to what they have in common via
  * {@link #generalize(Object, Object)}, which relies on
  * {@link #generalizeWord(String, String)} and
  * {@link #generalizePOS(String, String)}.
  */
 public class ParseTreeNode implements IGeneralizer<ParseTreeNode> {
     /** The word of the token; may be {@code null}. */
     String word;
     /** The POS tag of the token; may be {@code null}. */
     String pos;
     /** The NER label of the token; {@code null} when built with the two-argument constructor. */
     String ne;
     /** The id of the node; {@code null} when built with the two-argument constructor. */
     Integer id;
     /** The phrase type; {@code null} until {@link #setPhraseType(String)} is called. */
     String phraseType;

     /** Phrase type constants, each carrying a text label equal to its name. */
     public enum PhraseType {
         NP("NP"), VP("VP"), PRP("PRP");

         // Text label of the constant; not read anywhere inside this class.
         private final String text;

         private PhraseType(final String text) {
             this.text = text;
         }
     }

     /**
      * Creates a fully specified node.
      *
      * @param word the word of the token
      * @param pos  the POS tag of the token
      * @param ne   the NER label of the token
      * @param id   the id of the node
      */
     public ParseTreeNode(String word, String pos, String ne, Integer id) {
         super();
         this.word = word;
         this.pos = pos;
         this.ne = ne;
         this.id = id;
     }

     /**
      * Creates a node that has only a word and a POS tag; the NER label and
      * the id are left {@code null}.
      *
      * @param word the word of the token
      * @param pos  the POS tag of the token
      */
     public ParseTreeNode(String word, String pos) {
         this(word, pos, null, null);
     }

     public String getPhraseType() {
         return phraseType;
     }

     public void setPhraseType(String pt) {
         this.phraseType = pt;
     }

     public String getWord() {
         return word;
     }

     public void setWord(String word) {
         this.word = word;
     }

     public String getPos() {
         return pos;
     }

     public void setPos(String pos) {
         this.pos = pos;
     }

     public String getNe() {
         return ne;
     }

     public void setNe(String ne) {
         this.ne = ne;
     }

     public Integer getId() {
         return id;
     }

     public void setId(Integer id) {
         this.id = id;
     }

     /**
      * Renders the node as {@code <id>phraseType'word':pos}, leaving out every
      * part whose field is {@code null}. The NER label is not included.
      *
      * @return the textual form of this node
      */
     @Override
     public String toString() {
         StringBuilder buf = new StringBuilder();
         if (id != null) {
             buf.append('<').append(id).append('>');
         }
         if (phraseType != null) {
             buf.append(phraseType);
         }
         if (word != null) {
             buf.append('\'').append(word).append('\'');
         }
         if (pos != null) {
             buf.append(':').append(pos);
         }
         return buf.toString();
     }

     /**
      * Generalizes two nodes into at most one common node.
      *
      * <p>The result is empty when the POS tags of the two nodes cannot be
      * generalized. Otherwise it holds one new node with the generalized word,
      * the generalized POS tag, the NER label {@code "O"} and the id {@code -1}.
      *
      * @param o1 the first node; must be a {@code ParseTreeNode}
      * @param o2 the second node; must be a {@code ParseTreeNode}
      * @return a list with zero or one element, never {@code null}
      * @throws ClassCastException if an argument is not a {@code ParseTreeNode}
      * @throws NullPointerException if an argument is {@code null}
      */
     @Override
     public List<ParseTreeNode> generalize(Object o1, Object o2) {
         List<ParseTreeNode> result = new ArrayList<ParseTreeNode>();

         ParseTreeNode w1 = (ParseTreeNode) o1;
         ParseTreeNode w2 = (ParseTreeNode) o2;
         String posGen = generalizePOS(w1.pos, w2.pos);
         if (posGen == null) {
             return result;
         }
         result.add(new ParseTreeNode(generalizeWord(w1.word, w2.word), posGen, "O", -1));
         return result;
     }

     /**
      * Generalizes two words: identical words are kept, anything else becomes
      * the wildcard {@code "*"}.
      *
      * @param lemma1 the first word; may be {@code null}
      * @param lemma2 the second word; may be {@code null}
      * @return {@code lemma1} when both words are equal and non-null, otherwise {@code "*"}
      */
     public String generalizeWord(String lemma1, String lemma2) {
         if (lemma1 != null && lemma1.equals(lemma2)) {
             return lemma1;
         }
         // TODO: words that differ (or are missing) currently always collapse to the wildcard.
         return "*";
     }

     /**
      * Generalizes two POS tags. The rules are tried in this order:
      * <ol>
      *   <li>a tag starting with {@code NN} paired, in either order, with
      *       {@code NP}, {@code VBG} or {@code ADJP} gives {@code "NN"};</li>
      *   <li>{@code IN} paired with {@code TO}, in either order, gives {@code "IN"};</li>
      *   <li>two tags that both start with {@code VB} give {@code "VB"};</li>
      *   <li>tags equal ignoring case give {@code pos1};</li>
      *   <li>tags whose first two characters are equal ignoring case give that
      *       prefix of {@code pos1} followed by {@code "*"}.</li>
      * </ol>
      *
      * @param pos1 the first POS tag; may be {@code null}
      * @param pos2 the second POS tag; may be {@code null}
      * @return the generalized tag, or {@code null} when no rule applies or a tag is {@code null}
      */
     public String generalizePOS(String pos1, String pos2) {
         // A missing tag cannot be generalized; null already means "no common tag".
         if (pos1 == null || pos2 == null) {
             return null;
         }
         if (isNounPairedWith(pos1, pos2, "NP")
                 || isNounPairedWith(pos1, pos2, "VBG")
                 || isNounPairedWith(pos1, pos2, "ADJP")) {
             return "NN";
         }
         if ((pos1.equals("IN") && pos2.equals("TO"))
                 || (pos1.equals("TO") && pos2.equals("IN"))) {
             return "IN";
         }
         // VBx vs VBy = VB (does not matter which form for verb)
         if (pos1.startsWith("VB") && pos2.startsWith("VB")) {
             return "VB";
         }
         if (pos1.equalsIgnoreCase(pos2)) {
             return pos1;
         }
         // ABx vs ABy gives "AB*": compare only the first two characters.
         String prefix1 = pos1.length() > 2 ? pos1.substring(0, 2) : pos1;
         String prefix2 = pos2.length() > 2 ? pos2.substring(0, 2) : pos2;
         if (prefix1.equalsIgnoreCase(prefix2)) {
             return prefix1 + "*";
         }
         return null;
     }

     /** Tells whether one tag starts with {@code NN} while the other equals {@code tag}, in either order. */
     private static boolean isNounPairedWith(String pos1, String pos2, String tag) {
         return (pos1.startsWith("NN") && pos2.equals(tag))
                 || (pos2.startsWith("NN") && pos1.equals(tag));
     }
 }
	package opennlp.tools.parse_thicket;

	import java.util.ArrayList;
	import java.util.List;

	public class ParseTreeNode implements IGeneralizer<ParseTreeNode>{
	String word;
	// this is the POS tag of the token
	String pos;
	// this is the NER label of the token
	String ne;
	Integer id;
	//PhraseType
	String phraseType;

	public enum PhraseType {NP("NP"), VP("VP"), PRP("PRP");
	private PhraseType(final String text) {
	this.text = text;
	}
	private final String text;

	}

	public ParseTreeNode(String word, String pos, String ne, Integer id) {
	super();
	this.word = word;
	this.pos = pos;
	this.ne = ne;
	this.id = id;
	}

	public ParseTreeNode(String word, String pos) {
	super();
	this.word = word;
	this.pos = pos;
	this.ne = ne;
	this.id = id;
	}

	public String getPhraseType() {
	return phraseType;
	}
	public void setPhraseType(String pt) {
	this.phraseType=pt;
	}
	public String getWord() {
	return word;
	}
	public void setWord(String word) {
	this.word = word;
	}
	public String getPos() {
	return pos;
	}
	public void setPos(String pos) {
	this.pos = pos;
	}
	public String getNe() {
	return ne;
	}
	public void setNe(String ne) {
	this.ne = ne;
	}
	public Integer getId() {
	return id;
	}
	public void setId(Integer id) {
	this.id = id;
	}

	public String toString(){
	StringBuffer buf = new StringBuffer();
	if (id!=null)
	buf.append("<"+id+">");
	if(phraseType!=null)
	buf.append(phraseType);
	if(word!=null)
	buf.append("'"+word+"'");
	if (pos!=null)
	buf.append(":"+pos);
	return buf.toString();
	}

	@Override
	public List<ParseTreeNode> generalize(Object o1, Object o2) {
	List<ParseTreeNode> result = new ArrayList<ParseTreeNode>();

	ParseTreeNode w1 = (ParseTreeNode) o1;
	ParseTreeNode w2 = (ParseTreeNode) o2;
	String posGen = generalizePOS(w1.pos, w2.pos);
	if (posGen ==null)
	return result;
	ParseTreeNode newNode = new ParseTreeNode(generalizeWord(w1.word, w2.word),
	posGen, "O", -1);
	result.add(newNode);
	return result;
	}

	public String generalizeWord(String lemma1, String lemma2){
	if (lemma1.equals(lemma2))
	return lemma1;
	if (lemma1.equals("*"))
	return "*";
	if (lemma2.equals("*"))
	return "*";
	//TODO
	return "*";

	}

	public String generalizePOS(String pos1, String pos2) {
	if ((pos1.startsWith("NN") && pos2.equals("NP") \|\| pos2.startsWith("NN")
	&& pos1.equals("NP"))) {
	return "NN";
	}
	if ((pos1.startsWith("NN") && pos2.equals("VBG") \|\| pos2.startsWith("VBG")
	&& pos1.equals("NN"))) {
	return "NN";
	}

	if ((pos1.startsWith("NN") && pos2.equals("ADJP") \|\| pos2.startsWith("NN")
	&& pos1.equals("ADJP"))) {
	return "NN";
	}
	if ((pos1.equals("IN") && pos2.equals("TO") \|\| pos1.equals("TO")
	&& pos2.equals("IN"))) {
	return "IN";
	}
	// VBx vs VBx = VB (does not matter which form for verb)
	if (pos1.startsWith("VB") && pos2.startsWith("VB")) {
	return "VB";
	}

	// ABx vs ABy always gives AB
	if (pos1.equalsIgnoreCase(pos2)) {
	return pos1;
	}
	if (pos1.length() > 2) {
	pos1 = pos1.substring(0, 2);
	}

	if (pos2.length() > 2) {
	pos2 = pos2.substring(0, 2);
	}
	if (pos1.equalsIgnoreCase(pos2)) {
	return pos1 + "*";
	}
	return null;
	}


	};