opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDSample.java - opennlp-sandbox - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License. You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package opennlp.tools.disambiguator;

 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.List;

 import net.sf.extjwnl.JWNLException;
 import net.sf.extjwnl.data.POS;
 import net.sf.extjwnl.data.Synset;
 import net.sf.extjwnl.dictionary.Dictionary;
 import opennlp.tools.tokenize.WhitespaceTokenizer;
 import opennlp.tools.util.InvalidFormatException;

 public class WSDSample {

   private List<String> sentence;
   private List<String> tags;
   private List<String> lemmas;
   private int senseID;
   private List<String> senseIDs;
   private int targetPosition;

   public WSDSample(String sentence[], String tags[], String[] lemmas,
       int targetPosition, int senseID) {
     this.sentence = Collections.unmodifiableList(new ArrayList<String>(Arrays
         .asList(sentence)));
     this.tags = Collections.unmodifiableList(new ArrayList<String>(Arrays
         .asList(tags)));
     this.targetPosition = targetPosition;
     this.lemmas = Collections.unmodifiableList(new ArrayList<String>(Arrays
         .asList(lemmas)));
     ;
     this.senseID = senseID;
     checkArguments();
   }

   public WSDSample(String sentence[], String tags[], String[] lemmas,
       int targetPosition) {
     this.sentence = Collections.unmodifiableList(new ArrayList<String>(Arrays
         .asList(sentence)));
     this.tags = Collections.unmodifiableList(new ArrayList<String>(Arrays
         .asList(tags)));
     this.targetPosition = targetPosition;
     this.lemmas = Collections.unmodifiableList(new ArrayList<String>(Arrays
         .asList(lemmas)));
     ;
     checkArguments();
   }

   public WSDSample(String sentence[], String tags[], String[] lemmas,
       int targetPosition, List<String> senseIDs) {
     this.sentence = Collections.unmodifiableList(new ArrayList<String>(Arrays
         .asList(sentence)));
     this.tags = Collections.unmodifiableList(new ArrayList<String>(Arrays
         .asList(tags)));
     this.targetPosition = targetPosition;
     this.lemmas = Collections.unmodifiableList(new ArrayList<String>(Arrays
         .asList(lemmas)));
     this.senseIDs = senseIDs;
     checkArguments();
   }

   private void checkArguments() {
     if (sentence.size() != tags.size() || targetPosition < 0
         || targetPosition >= tags.size())
       throw new IllegalArgumentException(
           "There must be exactly one tag for each token!");

     if (sentence.contains(null) || tags.contains(null))
       throw new IllegalArgumentException("null elements are not allowed!");
   }

   public String[] getSentence() {
     return sentence.toArray(new String[sentence.size()]);
   }

   public String[] getTags() {
     return tags.toArray(new String[tags.size()]);
   }

   public String[] getLemmas() {
     return lemmas.toArray(new String[lemmas.size()]);
   }

   public int getTargetPosition() {
     return targetPosition;
   }

   public int getSenseID() {
     return senseID;
   }

   public List<String> getSenseIDs() {
     return senseIDs;
   }

   public String getTargetWord() {
     return sentence.get(targetPosition);
   }

   public String getTargetTag() {
     return tags.get(targetPosition);
   }

   public void setSentence(List<String> sentence) {
     this.sentence = sentence;
   }

   public void setTags(List<String> tags) {
     this.tags = tags;
   }

   public void setLemmas(List<String> lemmas) {
     this.lemmas = lemmas;
   }

   public void setSenseID(int senseID) {
     this.senseID = senseID;
   }

   public void setSenseIDs(List<String> senseIDs) {
     this.senseIDs = senseIDs;
   }

   public void setTargetPosition(int targetPosition) {
     this.targetPosition = targetPosition;
   }

   @Override
   public String toString() {

     StringBuilder result = new StringBuilder();
     result.append("target at : " + this.targetPosition + " in : ");
     for (int i = 0; i < getSentence().length; i++) {
       result.append(i);
       result.append(".");
       result.append(getSentence()[i]);
       result.append('_');
       result.append(getTags()[i]);
       result.append(' ');
     }

     if (result.length() > 0) {
       // get rid of last space
       result.setLength(result.length() - 1);
     }

     return result.toString();
   }

   /*
    * Parses a sample of format : TargetIndex TargetLemma Token Tag Token Tag ...
    */
   public static WSDSample parse(String sentenceString)
       throws InvalidFormatException {

     String tokenTags[] = WhitespaceTokenizer.INSTANCE.tokenize(sentenceString);

     int position = Integer.parseInt(tokenTags[0]);
     String sentence[] = new String[tokenTags.length - 1];
     String tags[] = new String[tokenTags.length - 1];
     String lemmas[] = new String[tokenTags.length - 1];

     for (int i = 1; i < tokenTags.length; i++) {
       int split = tokenTags[i].lastIndexOf("_");

       if (split == -1) {
         throw new InvalidFormatException("Cannot find \"_\" inside token!");
       }

       sentence[i] = tokenTags[i].substring(0, split);
       tags[i] = tokenTags[i].substring(split + 1);
       lemmas[i] = tokenTags[i].substring(split + 2);
     }

     return new WSDSample(sentence, tags, lemmas, position);
   }

   @Override
   public boolean equals(Object obj) {
     if (this == obj) {
       return true;
     } else if (obj instanceof WSDSample) {
       WSDSample a = (WSDSample) obj;

       return Arrays.equals(getSentence(), a.getSentence())
           && Arrays.equals(getTags(), a.getTags())
           && getTargetPosition() == a.getTargetPosition();
     } else {
       return false;
     }
   }

   // Return the synsets (thus the senses) of the current target word
   public List<Synset> getSynsets() {
     try {
       return Dictionary
           .getDefaultResourceInstance()
           .lookupIndexWord(WSDHelper.getPOS(this.getTargetTag()),
               this.getTargetWord()).getSenses();
     } catch (JWNLException e) {
       e.printStackTrace();
     }
     return null;
   }

   public String getTargetWordTag() {

     String wordBaseForm = this.getLemmas()[this.getTargetPosition()];

     String ref = "";

     if ((WSDHelper.getPOS(this.getTargetTag()) != null)) {
       if (WSDHelper.getPOS(this.getTargetTag()).equals(POS.VERB)) {
         ref = wordBaseForm + ".v";
       } else if (WSDHelper.getPOS(this.getTargetTag()).equals(POS.NOUN)) {
         ref = wordBaseForm + ".n";
       } else if (WSDHelper.getPOS(this.getTargetTag()).equals(POS.ADJECTIVE)) {
         ref = wordBaseForm + ".a";
       } else if (WSDHelper.getPOS(this.getTargetTag()).equals(POS.ADVERB)) {
         ref = wordBaseForm + ".r";
       }
     }

     return ref;
   }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package opennlp.tools.disambiguator;

	import java.util.ArrayList;
	import java.util.Arrays;
	import java.util.Collections;
	import java.util.List;

	import net.sf.extjwnl.JWNLException;
	import net.sf.extjwnl.data.POS;
	import net.sf.extjwnl.data.Synset;
	import net.sf.extjwnl.dictionary.Dictionary;
	import opennlp.tools.tokenize.WhitespaceTokenizer;
	import opennlp.tools.util.InvalidFormatException;

	public class WSDSample {

	private List<String> sentence;
	private List<String> tags;
	private List<String> lemmas;
	private int senseID;
	private List<String> senseIDs;
	private int targetPosition;

	public WSDSample(String sentence[], String tags[], String[] lemmas,
	int targetPosition, int senseID) {
	this.sentence = Collections.unmodifiableList(new ArrayList<String>(Arrays
	.asList(sentence)));
	this.tags = Collections.unmodifiableList(new ArrayList<String>(Arrays
	.asList(tags)));
	this.targetPosition = targetPosition;
	this.lemmas = Collections.unmodifiableList(new ArrayList<String>(Arrays
	.asList(lemmas)));
	;
	this.senseID = senseID;
	checkArguments();
	}

	public WSDSample(String sentence[], String tags[], String[] lemmas,
	int targetPosition) {
	this.sentence = Collections.unmodifiableList(new ArrayList<String>(Arrays
	.asList(sentence)));
	this.tags = Collections.unmodifiableList(new ArrayList<String>(Arrays
	.asList(tags)));
	this.targetPosition = targetPosition;
	this.lemmas = Collections.unmodifiableList(new ArrayList<String>(Arrays
	.asList(lemmas)));
	;
	checkArguments();
	}

	public WSDSample(String sentence[], String tags[], String[] lemmas,
	int targetPosition, List<String> senseIDs) {
	this.sentence = Collections.unmodifiableList(new ArrayList<String>(Arrays
	.asList(sentence)));
	this.tags = Collections.unmodifiableList(new ArrayList<String>(Arrays
	.asList(tags)));
	this.targetPosition = targetPosition;
	this.lemmas = Collections.unmodifiableList(new ArrayList<String>(Arrays
	.asList(lemmas)));
	this.senseIDs = senseIDs;
	checkArguments();
	}

	private void checkArguments() {
	if (sentence.size() != tags.size() \|\| targetPosition < 0
	\|\| targetPosition >= tags.size())
	throw new IllegalArgumentException(
	"There must be exactly one tag for each token!");

	if (sentence.contains(null) \|\| tags.contains(null))
	throw new IllegalArgumentException("null elements are not allowed!");
	}

	public String[] getSentence() {
	return sentence.toArray(new String[sentence.size()]);
	}

	public String[] getTags() {
	return tags.toArray(new String[tags.size()]);
	}

	public String[] getLemmas() {
	return lemmas.toArray(new String[lemmas.size()]);
	}

	public int getTargetPosition() {
	return targetPosition;
	}

	public int getSenseID() {
	return senseID;
	}

	public List<String> getSenseIDs() {
	return senseIDs;
	}

	public String getTargetWord() {
	return sentence.get(targetPosition);
	}

	public String getTargetTag() {
	return tags.get(targetPosition);
	}

	public void setSentence(List<String> sentence) {
	this.sentence = sentence;
	}

	public void setTags(List<String> tags) {
	this.tags = tags;
	}

	public void setLemmas(List<String> lemmas) {
	this.lemmas = lemmas;
	}

	public void setSenseID(int senseID) {
	this.senseID = senseID;
	}

	public void setSenseIDs(List<String> senseIDs) {
	this.senseIDs = senseIDs;
	}

	public void setTargetPosition(int targetPosition) {
	this.targetPosition = targetPosition;
	}

	@Override
	public String toString() {

	StringBuilder result = new StringBuilder();
	result.append("target at : " + this.targetPosition + " in : ");
	for (int i = 0; i < getSentence().length; i++) {
	result.append(i);
	result.append(".");
	result.append(getSentence()[i]);
	result.append('_');
	result.append(getTags()[i]);
	result.append(' ');
	}

	if (result.length() > 0) {
	// get rid of last space
	result.setLength(result.length() - 1);
	}

	return result.toString();
	}

	/*
	* Parses a sample of format : TargetIndex TargetLemma Token Tag Token Tag ...
	*/
	public static WSDSample parse(String sentenceString)
	throws InvalidFormatException {

	String tokenTags[] = WhitespaceTokenizer.INSTANCE.tokenize(sentenceString);

	int position = Integer.parseInt(tokenTags[0]);
	String sentence[] = new String[tokenTags.length - 1];
	String tags[] = new String[tokenTags.length - 1];
	String lemmas[] = new String[tokenTags.length - 1];

	for (int i = 1; i < tokenTags.length; i++) {
	int split = tokenTags[i].lastIndexOf("_");

	if (split == -1) {
	throw new InvalidFormatException("Cannot find \"_\" inside token!");
	}

	sentence[i] = tokenTags[i].substring(0, split);
	tags[i] = tokenTags[i].substring(split + 1);
	lemmas[i] = tokenTags[i].substring(split + 2);
	}

	return new WSDSample(sentence, tags, lemmas, position);
	}

	@Override
	public boolean equals(Object obj) {
	if (this == obj) {
	return true;
	} else if (obj instanceof WSDSample) {
	WSDSample a = (WSDSample) obj;

	return Arrays.equals(getSentence(), a.getSentence())
	&& Arrays.equals(getTags(), a.getTags())
	&& getTargetPosition() == a.getTargetPosition();
	} else {
	return false;
	}
	}

	// Return the synsets (thus the senses) of the current target word
	public List<Synset> getSynsets() {
	try {
	return Dictionary
	.getDefaultResourceInstance()
	.lookupIndexWord(WSDHelper.getPOS(this.getTargetTag()),
	this.getTargetWord()).getSenses();
	} catch (JWNLException e) {
	e.printStackTrace();
	}
	return null;
	}

	public String getTargetWordTag() {

	String wordBaseForm = this.getLemmas()[this.getTargetPosition()];

	String ref = "";

	if ((WSDHelper.getPOS(this.getTargetTag()) != null)) {
	if (WSDHelper.getPOS(this.getTargetTag()).equals(POS.VERB)) {
	ref = wordBaseForm + ".v";
	} else if (WSDHelper.getPOS(this.getTargetTag()).equals(POS.NOUN)) {
	ref = wordBaseForm + ".n";
	} else if (WSDHelper.getPOS(this.getTargetTag()).equals(POS.ADJECTIVE)) {
	ref = wordBaseForm + ".a";
	} else if (WSDHelper.getPOS(this.getTargetTag()).equals(POS.ADVERB)) {
	ref = wordBaseForm + ".r";
	}
	}

	return ref;
	}

	}