opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ParseTreeNode.java - opennlp-sandbox - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */

 package opennlp.tools.parse_thicket;

 import java.util.ArrayList;
 import java.util.List;
 import java.util.Map;

 /**
  * A token-level node of a parse tree.
  *
  * <p>Each node carries a lemma, a POS tag, an NER label and a numeric id, plus
  * an optional phrase type and several auxiliary string fields. Two nodes can be
  * generalized into a new node that keeps only what their lemmas and POS tags
  * have in common.
  *
  * <p>Instances are mutable; every field has a getter and a setter.
  */
 public class ParseTreeNode implements IGeneralizer<ParseTreeNode>{
 	/** Lemma used for a generalized node whose source lemmas differ. */
 	private static final String WILDCARD = "*";

 	String word; // word in normal form, lemma
 	// this is the POS tag of the token
 	String pos;
 	// this is the NER label of the token
 	String ne;
 	Integer id;
 	//PhraseType
 	String phraseType;
 	Map<String, Object> attributes;
 	String normalizedWord;
 	String syntacticDependence;
 	String originalWord; //what actually occurs in a sentence

 	String head;
 	String label;
 	String modifier;

 	/** Phrase categories that have a named constant. */
 	public enum PhraseType {
 		NP("NP"), VP("VP"), PRP("PRP");

 		private final String text;

 		PhraseType(final String text) {
 			this.text = text;
 		}

 		/**
 		 * @return the textual form passed to this constant's constructor
 		 */
 		public String getText() {
 			return text;
 		}
 	}

 	/**
 	 * Creates a node with a lemma, POS tag, NER label and id.
 	 *
 	 * @param word lemma of the token
 	 * @param pos  POS tag of the token
 	 * @param ne   NER label of the token
 	 * @param id   identifier of the node
 	 */
 	public ParseTreeNode(String word, String pos, String ne, Integer id) {
 		super();
 		this.word = word;
 		this.pos = pos;
 		this.ne = ne;
 		this.id = id;
 	}

 	/**
 	 * Creates a node with only a lemma and a POS tag; the NER label and the id
 	 * stay {@code null}.
 	 *
 	 * @param word lemma of the token
 	 * @param pos  POS tag of the token
 	 */
 	public ParseTreeNode(String word, String pos) {
 		super();
 		this.word = word;
 		this.pos = pos;
 	}

 	/** @return the word as it actually occurs in the sentence */
 	public String getOriginalWord() {
 		return originalWord;
 	}

 	/** @param originalWord the word as it actually occurs in the sentence */
 	public void setOriginalWord(String originalWord) {
 		this.originalWord = originalWord;
 	}

 	/** @return the head string stored on this node */
 	public String getHead() {
 		return head;
 	}

 	/** @param head the head string to store on this node */
 	public void setHead(String head) {
 		this.head = head;
 	}

 	/** @return the label string stored on this node */
 	public String getLabel() {
 		return label;
 	}

 	/** @param label the label string to store on this node */
 	public void setLabel(String label) {
 		this.label = label;
 	}

 	/** @return the modifier string stored on this node */
 	public String getModifier() {
 		return modifier;
 	}

 	/** @param modifier the modifier string to store on this node */
 	public void setModifier(String modifier) {
 		this.modifier = modifier;
 	}

 	/** @return the normalized word stored on this node */
 	public String getNormalizedWord() {
 		return normalizedWord;
 	}

 	/** @param normalizedWord the normalized word to store on this node */
 	public void setNormalizedWord(String normalizedWord) {
 		this.normalizedWord = normalizedWord;
 	}

 	/** @return the syntactic dependence string stored on this node */
 	public String getSyntacticDependence() {
 		return syntacticDependence;
 	}

 	/** @param syntacticDependence the syntactic dependence string to store */
 	public void setSyntacticDependence(String syntacticDependence) {
 		this.syntacticDependence = syntacticDependence;
 	}

 	/** @return the attribute map held by this node (the same instance, not a copy) */
 	public Map<String, Object> getAttributes() {
 		return attributes;
 	}

 	/** @param attributes the attribute map to hold (stored as is, not copied) */
 	public void setAttributes(Map<String, Object> attributes) {
 		this.attributes = attributes;
 	}

 	/** @return the phrase type of this node, or {@code null} if none was set */
 	public String getPhraseType() {
 		return phraseType;
 	}

 	/** @param pt the phrase type to store on this node */
 	public void setPhraseType(String pt) {
 		this.phraseType = pt;
 	}

 	/** @return the lemma of the token */
 	public String getWord() {
 		return word;
 	}

 	/** @param word the lemma of the token */
 	public void setWord(String word) {
 		this.word = word;
 	}

 	/** @return the POS tag of the token */
 	public String getPos() {
 		return pos;
 	}

 	/** @param pos the POS tag of the token */
 	public void setPos(String pos) {
 		this.pos = pos;
 	}

 	/** @return the NER label of the token */
 	public String getNe() {
 		return ne;
 	}

 	/** @param ne the NER label of the token */
 	public void setNe(String ne) {
 		this.ne = ne;
 	}

 	/** @return the id of this node */
 	public Integer getId() {
 		return id;
 	}

 	/** @param id the id of this node */
 	public void setId(Integer id) {
 		this.id = id;
 	}

 	/**
 	 * Renders the node as {@code <id>phraseType'word':pos}; every part whose
 	 * field is {@code null} is left out.
 	 */
 	@Override
 	public String toString(){
 		StringBuilder buf = new StringBuilder();
 		if (id != null) {
 			buf.append("<").append(id).append(">");
 		}
 		if (phraseType != null) {
 			buf.append(phraseType);
 		}
 		if (word != null) {
 			buf.append("'").append(word).append("'");
 		}
 		if (pos != null) {
 			buf.append(":").append(pos);
 		}
 		return buf.toString();
 	}

 	/**
 	 * Concatenates the nodes as {@code (word pos)} groups, skipping every node
 	 * whose POS tag starts with {@code .}, {@code ,}, {@code ;} or {@code !}.
 	 *
 	 * @param chList nodes to render, in order
 	 * @return the concatenated groups, trimmed
 	 * @throws NullPointerException if a node has no POS tag
 	 */
 	public static String toTreeRepresentationString(List<ParseTreeNode> chList){
 		StringBuilder buf = new StringBuilder();
 		for (ParseTreeNode ch : chList) {
 			String tag = ch.getPos();
 			if (tag.startsWith(".") || tag.startsWith(",") || tag.startsWith(";") || tag.startsWith("!")) {
 				continue;
 			}
 			buf.append("(").append(ch.getWord()).append(" ").append(tag).append(")");
 		}
 		return buf.toString().trim();
 	}

 	/**
 	 * Joins the lemmas of the nodes with single spaces.
 	 *
 	 * @param chList nodes to render, in order
 	 * @return the space-separated lemmas, trimmed
 	 */
 	public static String toWordString(List<ParseTreeNode> chList){
 		StringBuilder buf = new StringBuilder();
 		for (ParseTreeNode ch : chList) {
 			buf.append(ch.getWord()).append(" ");
 		}
 		return buf.toString().trim();
 	}

 	/**
 	 * Generalizes two nodes into at most one node.
 	 *
 	 * <p>The result is empty when the POS tags have no generalization (this
 	 * includes a missing tag on either node). Otherwise it holds one new node
 	 * with the generalized lemma and POS tag, NER label {@code "O"} and id
 	 * {@code -1}.
 	 *
 	 * @param o1 first node; must be a {@code ParseTreeNode}
 	 * @param o2 second node; must be a {@code ParseTreeNode}
 	 * @return a list with zero or one generalized node
 	 * @throws ClassCastException if an argument is not a {@code ParseTreeNode}
 	 */
 	@Override
 	public List<ParseTreeNode> generalize(Object o1, Object o2) {
 		List<ParseTreeNode> result = new ArrayList<>();

 		ParseTreeNode w1 = (ParseTreeNode) o1;
 		ParseTreeNode w2 = (ParseTreeNode) o2;
 		String posGen = generalizePOS(w1.pos, w2.pos);
 		if (posGen == null) {
 			return result;
 		}
 		result.add(new ParseTreeNode(generalizeWord(w1.word, w2.word), posGen, "O", -1));
 		return result;
 	}

 	/**
 	 * Generalizes two lemmas: equal lemmas generalize to themselves, anything
 	 * else (including a {@code null} lemma) to the wildcard {@code "*"}.
 	 *
 	 * @param lemma1 first lemma, may be {@code null}
 	 * @param lemma2 second lemma, may be {@code null}
 	 * @return {@code lemma1} if both lemmas are equal, otherwise {@code "*"}
 	 */
 	public String generalizeWord(String lemma1, String lemma2){
 		if (lemma1 != null && lemma1.equals(lemma2)) {
 			return lemma1;
 		}
 		//TODO: partial generalization of different lemmas
 		return WILDCARD;
 	}

 	/**
 	 * Generalizes two POS tags. The rules are tried in this order:
 	 * <ol>
 	 *   <li>a noun tag ({@code NN*}) against {@code NP}, {@code VBG} or
 	 *       {@code ADJP}, in either argument order, gives {@code "NN"};</li>
 	 *   <li>{@code IN} against {@code TO}, in either order, gives {@code "IN"};</li>
 	 *   <li>two verb tags ({@code VB*}) give {@code "VB"};</li>
 	 *   <li>tags equal ignoring case give {@code pos1};</li>
 	 *   <li>tags whose first two characters are equal ignoring case give those
 	 *       two characters of {@code pos1} followed by {@code "*"}.</li>
 	 * </ol>
 	 *
 	 * @param pos1 first POS tag, may be {@code null}
 	 * @param pos2 second POS tag, may be {@code null}
 	 * @return the generalized tag, or {@code null} if no rule applies or a tag
 	 *         is missing
 	 */
 	public String generalizePOS(String pos1, String pos2) {
 		if (pos1 == null || pos2 == null) {
 			return null;
 		}
 		if (isNounPairedWith(pos1, pos2, "NP")
 				|| isNounPairedWith(pos1, pos2, "VBG")
 				|| isBareNounPairedWithGerund(pos1, pos2)
 				|| isNounPairedWith(pos1, pos2, "ADJP")) {
 			return "NN";
 		}
 		if ((pos1.equals("IN") && pos2.equals("TO"))
 				|| (pos1.equals("TO") && pos2.equals("IN"))) {
 			return "IN";
 		}
 		// VBx vs VBy = VB (does not matter which form for verb)
 		if (pos1.startsWith("VB") && pos2.startsWith("VB")) {
 			return "VB";
 		}
 		if (pos1.equalsIgnoreCase(pos2)) {
 			return pos1;
 		}

 		// ABx vs ABy gives AB*: compare only the two-character tag prefixes
 		String prefix1 = pos1.length() > 2 ? pos1.substring(0, 2) : pos1;
 		String prefix2 = pos2.length() > 2 ? pos2.substring(0, 2) : pos2;
 		if (prefix1.equalsIgnoreCase(prefix2)) {
 			return prefix1 + "*";
 		}
 		return null;
 	}

 	/** Tells whether one tag starts with "NN" while the other equals {@code other}. */
 	private static boolean isNounPairedWith(String a, String b, String other) {
 		return (a.startsWith("NN") && b.equals(other))
 				|| (b.startsWith("NN") && a.equals(other));
 	}

 	/** Tells whether one tag is exactly "NN" while the other starts with "VBG". */
 	private static boolean isBareNounPairedWithGerund(String a, String b) {
 		return (a.equals("NN") && b.startsWith("VBG"))
 				|| (b.equals("NN") && a.startsWith("VBG"));
 	}

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/

	package opennlp.tools.parse_thicket;

	import java.util.ArrayList;
	import java.util.List;
	import java.util.Map;

	public class ParseTreeNode implements IGeneralizer<ParseTreeNode>{
	String word; // word in normal form, lemma
	// this is the POS tag of the token
	String pos;
	// this is the NER label of the token
	String ne;
	Integer id;
	//PhraseType
	String phraseType;
	Map<String, Object> attributes;
	String normalizedWord;
	String syntacticDependence;
	String originalWord; //what actually occurs in a sentence

	String head;
	String label;
	String modifier;



	public String getOriginalWord() {
	return originalWord;
	}

	public void setOriginalWord(String originalWord) {
	this.originalWord = originalWord;
	}

	public String getHead() {
	return head;
	}

	public void setHead(String head) {
	this.head = head;
	}

	public String getLabel() {
	return label;
	}

	public void setLabel(String label) {
	this.label = label;
	}

	public String getModifier() {
	return modifier;
	}

	public void setModifier(String modifier) {
	this.modifier = modifier;
	}

	public String getNormalizedWord() {
	return normalizedWord;
	}

	public void setNormalizedWord(String normalizedWord) {
	this.normalizedWord = normalizedWord;
	}

	public String getSyntacticDependence() {
	return syntacticDependence;
	}

	public void setSyntacticDependence(String syntacticDependence) {
	this.syntacticDependence = syntacticDependence;
	}

	public Map<String, Object> getAttributes() {
	return attributes;
	}

	public void setAttributes(Map<String, Object> attributes) {
	this.attributes = attributes;
	}

	public enum PhraseType {NP("NP"), VP("VP"), PRP("PRP");
	PhraseType(final String text) {
	this.text = text;
	}
	private final String text;

	}

	public ParseTreeNode(String word, String pos, String ne, Integer id) {
	super();
	this.word = word;
	this.pos = pos;
	this.ne = ne;
	this.id = id;
	}

	public ParseTreeNode(String word, String pos) {
	super();
	this.word = word;
	this.pos = pos;

	}

	public String getPhraseType() {
	return phraseType;
	}
	public void setPhraseType(String pt) {
	this.phraseType=pt;
	}
	public String getWord() {
	return word;
	}
	public void setWord(String word) {
	this.word = word;
	}
	public String getPos() {
	return pos;
	}
	public void setPos(String pos) {
	this.pos = pos;
	}
	public String getNe() {
	return ne;
	}
	public void setNe(String ne) {
	this.ne = ne;
	}
	public Integer getId() {
	return id;
	}
	public void setId(Integer id) {
	this.id = id;
	}

	public String toString(){
	StringBuilder buf = new StringBuilder();
	if (id!=null)
	buf.append("<").append(id).append(">");
	if(phraseType!=null)
	buf.append(phraseType);
	if(word!=null)
	buf.append("'").append(word).append("'");
	if (pos!=null)
	buf.append(":").append(pos);
	return buf.toString();
	}

	public static String toTreeRepresentationString(List<ParseTreeNode> chList){
	StringBuilder buf = new StringBuilder();
	for(ParseTreeNode ch: chList){
	if (ch.getPos().startsWith(".") \|\| ch.getPos().startsWith(",") \|\| ch.getPos().startsWith(";") \|\| ch.getPos().startsWith("!"))
	continue;
	buf.append("(").append(ch.getWord()).append(" ").append(ch.getPos()).append(")");
	}
	return buf.toString().trim();
	}
	public static String toWordString(List<ParseTreeNode> chList){
	StringBuilder buf = new StringBuilder();
	for(ParseTreeNode ch: chList){
	buf.append(ch.getWord()).append(" ");
	}
	return buf.toString().trim();
	}

	@Override
	public List<ParseTreeNode> generalize(Object o1, Object o2) {
	List<ParseTreeNode> result = new ArrayList<>();

	ParseTreeNode w1 = (ParseTreeNode) o1;
	ParseTreeNode w2 = (ParseTreeNode) o2;
	String posGen = generalizePOS(w1.pos, w2.pos);
	if (posGen ==null)
	return result;
	ParseTreeNode newNode = new ParseTreeNode(generalizeWord(w1.word, w2.word),
	posGen, "O", -1);
	result.add(newNode);
	return result;
	}

	public String generalizeWord(String lemma1, String lemma2){
	if (lemma1.equals(lemma2))
	return lemma1;
	if (lemma1.equals("*"))
	return "*";
	//TODO
	return "*";

	}

	public String generalizePOS(String pos1, String pos2) {
	if ((pos1.startsWith("NN") && pos2.equals("NP") \|\| pos2.startsWith("NN")
	&& pos1.equals("NP"))) {
	return "NN";
	}
	if ((pos1.startsWith("NN") && pos2.equals("VBG") \|\| pos2.startsWith("VBG")
	&& pos1.equals("NN"))) {
	return "NN";
	}

	if ((pos1.startsWith("NN") && pos2.equals("ADJP") \|\| pos2.startsWith("NN")
	&& pos1.equals("ADJP"))) {
	return "NN";
	}
	if ((pos1.equals("IN") && pos2.equals("TO") \|\| pos1.equals("TO")
	&& pos2.equals("IN"))) {
	return "IN";
	}
	// VBx vs VBx = VB (does not matter which form for verb)
	if (pos1.startsWith("VB") && pos2.startsWith("VB")) {
	return "VB";
	}

	// ABx vs ABy always gives AB
	if (pos1.equalsIgnoreCase(pos2)) {
	return pos1;
	}
	if (pos1.length() > 2) {
	pos1 = pos1.substring(0, 2);
	}

	if (pos2.length() > 2) {
	pos2 = pos2.substring(0, 2);
	}
	if (pos1.equalsIgnoreCase(pos2)) {
	return pos1 + "*";
	}
	return null;
	}


	}