/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package opennlp.tools.disambiguator; | |
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import net.sf.extjwnl.JWNLException;
import net.sf.extjwnl.data.IndexWord;
import net.sf.extjwnl.data.POS;
import net.sf.extjwnl.data.Synset;
import net.sf.extjwnl.dictionary.Dictionary;
import opennlp.tools.tokenize.WhitespaceTokenizer;
import opennlp.tools.util.InvalidFormatException;
public class WSDSample { | |
private List<String> sentence; | |
private List<String> tags; | |
private List<String> lemmas; | |
private int senseID; | |
private List<String> senseIDs; | |
private int targetPosition; | |
public WSDSample(String sentence[], String tags[], String[] lemmas, | |
int targetPosition, int senseID) { | |
this.sentence = Collections.unmodifiableList(new ArrayList<String>(Arrays | |
.asList(sentence))); | |
this.tags = Collections.unmodifiableList(new ArrayList<String>(Arrays | |
.asList(tags))); | |
this.targetPosition = targetPosition; | |
this.lemmas = Collections.unmodifiableList(new ArrayList<String>(Arrays | |
.asList(lemmas))); | |
; | |
this.senseID = senseID; | |
checkArguments(); | |
} | |
public WSDSample(String sentence[], String tags[], String[] lemmas, | |
int targetPosition) { | |
this.sentence = Collections.unmodifiableList(new ArrayList<String>(Arrays | |
.asList(sentence))); | |
this.tags = Collections.unmodifiableList(new ArrayList<String>(Arrays | |
.asList(tags))); | |
this.targetPosition = targetPosition; | |
this.lemmas = Collections.unmodifiableList(new ArrayList<String>(Arrays | |
.asList(lemmas))); | |
; | |
checkArguments(); | |
} | |
public WSDSample(String sentence[], String tags[], String[] lemmas, | |
int targetPosition, List<String> senseIDs) { | |
this.sentence = Collections.unmodifiableList(new ArrayList<String>(Arrays | |
.asList(sentence))); | |
this.tags = Collections.unmodifiableList(new ArrayList<String>(Arrays | |
.asList(tags))); | |
this.targetPosition = targetPosition; | |
this.lemmas = Collections.unmodifiableList(new ArrayList<String>(Arrays | |
.asList(lemmas))); | |
this.senseIDs = senseIDs; | |
checkArguments(); | |
} | |
private void checkArguments() { | |
if (sentence.size() != tags.size() || targetPosition < 0 | |
|| targetPosition >= tags.size()) | |
throw new IllegalArgumentException( | |
"There must be exactly one tag for each token!"); | |
if (sentence.contains(null) || tags.contains(null)) | |
throw new IllegalArgumentException("null elements are not allowed!"); | |
} | |
public String[] getSentence() { | |
return sentence.toArray(new String[sentence.size()]); | |
} | |
public String[] getTags() { | |
return tags.toArray(new String[tags.size()]); | |
} | |
public String[] getLemmas() { | |
return lemmas.toArray(new String[lemmas.size()]); | |
} | |
public int getTargetPosition() { | |
return targetPosition; | |
} | |
public int getSenseID() { | |
return senseID; | |
} | |
public List<String> getSenseIDs() { | |
return senseIDs; | |
} | |
public String getTargetWord() { | |
return sentence.get(targetPosition); | |
} | |
public String getTargetTag() { | |
return tags.get(targetPosition); | |
} | |
public void setSentence(List<String> sentence) { | |
this.sentence = sentence; | |
} | |
public void setTags(List<String> tags) { | |
this.tags = tags; | |
} | |
public void setLemmas(List<String> lemmas) { | |
this.lemmas = lemmas; | |
} | |
public void setSenseID(int senseID) { | |
this.senseID = senseID; | |
} | |
public void setSenseIDs(List<String> senseIDs) { | |
this.senseIDs = senseIDs; | |
} | |
public void setTargetPosition(int targetPosition) { | |
this.targetPosition = targetPosition; | |
} | |
@Override | |
public String toString() { | |
StringBuilder result = new StringBuilder(); | |
result.append("target at : " + this.targetPosition + " in : "); | |
for (int i = 0; i < getSentence().length; i++) { | |
result.append(i); | |
result.append("."); | |
result.append(getSentence()[i]); | |
result.append('_'); | |
result.append(getTags()[i]); | |
result.append(' '); | |
} | |
if (result.length() > 0) { | |
// get rid of last space | |
result.setLength(result.length() - 1); | |
} | |
return result.toString(); | |
} | |
/* | |
* Parses a sample of format : TargetIndex TargetLemma Token Tag Token Tag ... | |
*/ | |
public static WSDSample parse(String sentenceString) | |
throws InvalidFormatException { | |
String tokenTags[] = WhitespaceTokenizer.INSTANCE.tokenize(sentenceString); | |
int position = Integer.parseInt(tokenTags[0]); | |
String sentence[] = new String[tokenTags.length - 1]; | |
String tags[] = new String[tokenTags.length - 1]; | |
String lemmas[] = new String[tokenTags.length - 1]; | |
for (int i = 1; i < tokenTags.length; i++) { | |
int split = tokenTags[i].lastIndexOf("_"); | |
if (split == -1) { | |
throw new InvalidFormatException("Cannot find \"_\" inside token!"); | |
} | |
sentence[i] = tokenTags[i].substring(0, split); | |
tags[i] = tokenTags[i].substring(split + 1); | |
lemmas[i] = tokenTags[i].substring(split + 2); | |
} | |
return new WSDSample(sentence, tags, lemmas, position); | |
} | |
@Override | |
public boolean equals(Object obj) { | |
if (this == obj) { | |
return true; | |
} else if (obj instanceof WSDSample) { | |
WSDSample a = (WSDSample) obj; | |
return Arrays.equals(getSentence(), a.getSentence()) | |
&& Arrays.equals(getTags(), a.getTags()) | |
&& getTargetPosition() == a.getTargetPosition(); | |
} else { | |
return false; | |
} | |
} | |
// Return the synsets (thus the senses) of the current target word | |
public List<Synset> getSynsets() { | |
try { | |
return Dictionary | |
.getDefaultResourceInstance() | |
.lookupIndexWord(WSDHelper.getPOS(this.getTargetTag()), | |
this.getTargetWord()).getSenses(); | |
} catch (JWNLException e) { | |
e.printStackTrace(); | |
} | |
return null; | |
} | |
public String getTargetWordTag() { | |
String wordBaseForm = this.getLemmas()[this.getTargetPosition()]; | |
String ref = ""; | |
if ((WSDHelper.getPOS(this.getTargetTag()) != null)) { | |
if (WSDHelper.getPOS(this.getTargetTag()).equals(POS.VERB)) { | |
ref = wordBaseForm + ".v"; | |
} else if (WSDHelper.getPOS(this.getTargetTag()).equals(POS.NOUN)) { | |
ref = wordBaseForm + ".n"; | |
} else if (WSDHelper.getPOS(this.getTargetTag()).equals(POS.ADJECTIVE)) { | |
ref = wordBaseForm + ".a"; | |
} else if (WSDHelper.getPOS(this.getTargetTag()).equals(POS.ADVERB)) { | |
ref = wordBaseForm + ".r"; | |
} | |
} | |
return ref; | |
} | |
} |