/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
| |
| package opennlp.tools.disambiguator; |
| |
import java.util.Arrays;
import java.util.List;
import java.util.Objects;

import net.sf.extjwnl.JWNLException;
import net.sf.extjwnl.data.IndexWord;
import net.sf.extjwnl.data.POS;
import net.sf.extjwnl.data.Synset;
import net.sf.extjwnl.dictionary.Dictionary;
import opennlp.tools.tokenize.WhitespaceTokenizer;
import opennlp.tools.util.InvalidFormatException;
| |
| public class WSDSample { |
| |
| private String[] sentence; |
| private String[] tags; |
| private String[] lemmas; |
| private int senseID; |
| private String[] senseIDs; |
| private int targetPosition; |
| |
| public WSDSample(String[] sentence, String[] tags, String[] lemmas, |
| int targetPosition, int senseID) { |
| this.sentence = sentence; |
| this.tags = tags; |
| this.targetPosition = targetPosition; |
| this.lemmas = lemmas; |
| this.senseID = senseID; |
| checkArguments(); |
| } |
| |
| public WSDSample(String[] sentence, String[] tags, String[] lemmas, |
| int targetPosition) { |
| this(sentence, tags, lemmas, targetPosition, null); |
| } |
| |
| public WSDSample(String[] sentence, String[] tags, String[] lemmas, |
| int targetPosition, String[] senseIDs) { |
| this.sentence = sentence; |
| this.tags = tags; |
| this.targetPosition = targetPosition; |
| this.lemmas = lemmas; |
| this.senseIDs = senseIDs; |
| checkArguments(); |
| } |
| |
| private void checkArguments() { |
| if (sentence.length != tags.length || tags.length != lemmas.length |
| || targetPosition < 0 || targetPosition >= tags.length) |
| throw new IllegalArgumentException("Some inputs are not correct"); |
| } |
| |
| public String[] getSentence() { |
| return sentence; |
| } |
| |
| public String[] getTags() { |
| return tags; |
| } |
| |
| public String[] getLemmas() { |
| return lemmas; |
| } |
| |
| public int getTargetPosition() { |
| return targetPosition; |
| } |
| |
| public int getSenseID() { |
| return senseID; |
| } |
| |
| public String[] getSenseIDs() { |
| return senseIDs; |
| } |
| |
| public String getTargetWord() { |
| return sentence[targetPosition]; |
| } |
| |
| public String getTargetTag() { |
| return tags[targetPosition]; |
| } |
| |
| public void setSentence(String[] sentence) { |
| this.sentence = sentence; |
| } |
| |
| public void setTags(String[] tags) { |
| this.tags = tags; |
| } |
| |
| public void setLemmas(String[] lemmas) { |
| this.lemmas = lemmas; |
| } |
| |
| public void setSenseID(int senseID) { |
| this.senseID = senseID; |
| } |
| |
| public void setSenseIDs(String[] senseIDs) { |
| this.senseIDs = senseIDs; |
| } |
| |
| public void setTargetPosition(int targetPosition) { |
| this.targetPosition = targetPosition; |
| } |
| |
| @Override |
| public String toString() { |
| |
| StringBuilder result = new StringBuilder(); |
| result.append("target at : ").append(this.targetPosition).append(" in : "); |
| for (int i = 0; i < getSentence().length; i++) { |
| result.append(i); |
| result.append("."); |
| result.append(getSentence()[i]); |
| result.append('_'); |
| result.append(getTags()[i]); |
| result.append(' '); |
| } |
| |
| if (result.length() > 0) { |
| // get rid of last space |
| result.setLength(result.length() - 1); |
| } |
| |
| return result.toString(); |
| } |
| |
| /* |
| * Parses a sample of format : TargetIndex TargetLemma Token Tag Token Tag ... |
| */ |
| public static WSDSample parse(String sentenceString) |
| throws InvalidFormatException { |
| |
| String[] tokenTags = WhitespaceTokenizer.INSTANCE.tokenize(sentenceString); |
| |
| int position = Integer.parseInt(tokenTags[0]); |
| String[] sentence = new String[tokenTags.length - 1]; |
| String[] tags = new String[tokenTags.length - 1]; |
| String[] lemmas = new String[tokenTags.length - 1]; |
| |
| for (int i = 1; i < tokenTags.length; i++) { |
| int split = tokenTags[i].lastIndexOf("_"); |
| |
| if (split == -1) { |
| throw new InvalidFormatException("Cannot find \"_\" inside token!"); |
| } |
| |
| sentence[i] = tokenTags[i].substring(0, split); |
| tags[i] = tokenTags[i].substring(split + 1); |
| lemmas[i] = tokenTags[i].substring(split + 2); |
| } |
| |
| return new WSDSample(sentence, tags, lemmas, position); |
| } |
| |
| @Override |
| public boolean equals(Object obj) { |
| if (this == obj) { |
| return true; |
| } else if (obj instanceof WSDSample) { |
| WSDSample a = (WSDSample) obj; |
| |
| return Arrays.equals(getSentence(), a.getSentence()) |
| && Arrays.equals(getTags(), a.getTags()) |
| && getTargetPosition() == a.getTargetPosition(); |
| } else { |
| return false; |
| } |
| } |
| |
| @Override |
| public int hashCode() { |
| int result = Objects.hash(targetPosition); |
| result = 31 * result + Arrays.hashCode(sentence); |
| result = 31 * result + Arrays.hashCode(tags); |
| return result; |
| } |
| |
| // Return the synsets (thus the senses) of the current target word |
| public List<Synset> getSynsets() { |
| try { |
| return Dictionary.getDefaultResourceInstance() |
| .lookupIndexWord(WSDHelper.getPOS(this.getTargetTag()), |
| this.getTargetWord()) |
| .getSenses(); |
| } catch (JWNLException e) { |
| e.printStackTrace(); |
| } |
| return null; |
| } |
| |
| public String getTargetWordTag() { |
| |
| String wordBaseForm = this.getLemmas()[this.getTargetPosition()]; |
| |
| String ref = ""; |
| |
| if ((WSDHelper.getPOS(this.getTargetTag()) != null)) { |
| if (WSDHelper.getPOS(this.getTargetTag()).equals(POS.VERB)) { |
| ref = wordBaseForm + ".v"; |
| } else if (WSDHelper.getPOS(this.getTargetTag()).equals(POS.NOUN)) { |
| ref = wordBaseForm + ".n"; |
| } else if (WSDHelper.getPOS(this.getTargetTag()).equals(POS.ADJECTIVE)) { |
| ref = wordBaseForm + ".a"; |
| } else if (WSDHelper.getPOS(this.getTargetTag()).equals(POS.ADVERB)) { |
| ref = wordBaseForm + ".r"; |
| } |
| } |
| |
| return ref; |
| } |
| |
| } |