blob: c48d9507aa69f60192615f56d92253ea6cc23894 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.disambiguator;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
/**
 * Context generator that serializes three groups of features for a target
 * word: the POS tags in a window around it, local collocations (word pairs)
 * around it, and one binary indicator per entry of a unigram model.
 *
 * <p>Every feature is rendered as {@code "F<position>=<value>"}, where
 * {@code <position>} is the feature's index in the returned array.
 */
public class IMSWSDContextGenerator implements WSDContextGenerator {

  /** Value emitted for a window position that has no POS tag available. */
  private static final String MISSING_TAG = "null";

  /**
   * Collects the lower-cased POS tags of the positions
   * {@code index - windowSize} to {@code index + windowSize}, both inclusive.
   *
   * @param index      the position of the target word
   * @param tags       the POS tags of the sentence; may be {@code null}
   * @param windowSize the number of positions to look at on each side
   * @return an array of exactly {@code 2 * windowSize + 1} entries; positions
   *         outside the sentence, or without a tag, hold the string
   *         {@code "null"}
   */
  private String[] extractPosOfSurroundingWords(int index, String[] tags,
      int windowSize) {
    String[] windowTags = new String[2 * windowSize + 1];
    int start = index - windowSize;

    // Drive the loop by the output slot so that every slot is filled; the
    // previous bound (i < index + windowSize) stopped one position short and
    // left the last slot unset.
    for (int slot = 0; slot < windowTags.length; slot++) {
      int position = start + slot;
      boolean available = tags != null && position >= 0
          && position < tags.length && tags[position] != null;
      windowTags[slot] = available ? tags[position].toLowerCase() : MISSING_TAG;
    }
    return windowTags;
  }

  /**
   * Collects the normalized lemmas of the words surrounding the target word.
   *
   * <p>A word is kept when it is not the target itself and its lower-cased
   * token is not contained in {@code WSDHelper.stopWords}. Its lemma is
   * lower-cased, stripped of every character other than {@code a-z} and
   * {@code _}, and kept only if more than one character remains.
   *
   * @param index      the position of the target word
   * @param toks       the tokens of the sentence
   * @param lemmas     the lemmas of the sentence; when {@code null} an empty
   *                   array is returned
   * @param windowSize currently ignored: the whole sentence is used
   * @return the retained lemmas, in sentence order
   */
  public String[] extractSurroundingContext(int index, String[] toks,
      String[] lemmas, int windowSize) {
    // TODO consider the windowSize
    ArrayList<String> contextWords = new ArrayList<String>();
    if (lemmas == null) {
      // Without lemmas there is nothing to collect.
      return new String[0];
    }

    for (int i = 0; i < toks.length; i++) {
      if (i == index || WSDHelper.stopWords.contains(toks[i].toLowerCase())) {
        continue;
      }
      String lemma = lemmas[i].toLowerCase().replaceAll("[^a-z_]", "").trim();
      if (lemma.length() > 1) {
        contextWords.add(lemma);
      }
    }
    return contextWords.toArray(new String[contextWords.size()]);
  }

  /**
   * Extracts word pairs around the target word. For every start position
   * {@code i} within {@code ngram} of the target (the target itself
   * excluded), the pairs {@code (i, i + 1)} and {@code (i, i + 2)} are
   * emitted as {@code "first second"}, provided the second position is not
   * the target, lies before {@code index + ngram}, and is inside the sentence.
   *
   * <p>Note carried over from the original implementation: the author used
   * only 11 features of this type; the range was set to 3 (bigrams extracted
   * in a way that they are at most separated by 1 word).
   *
   * @param index    the position of the target word
   * @param sentence the tokens of the sentence
   * @param ngram    the range around the target word to consider
   * @return the collocations, ordered by start position
   */
  private String[] extractLocalCollocations(int index, String[] sentence,
      int ngram) {
    ArrayList<String> localCollocations = new ArrayList<String>();
    int upper = index + ngram;
    int firstStart = Math.max(0, index - ngram);
    // A start position needs at least one following token.
    int lastStart = Math.min(upper, sentence.length - 2);

    for (int i = firstStart; i <= lastStart; i++) {
      if (i == index) {
        continue;
      }

      int adjacent = i + 1; // always inside the sentence, see lastStart
      if (adjacent != index && adjacent < upper) {
        localCollocations.add(sentence[i] + " " + sentence[adjacent]);
      }

      int skipped = i + 2;
      // The explicit length check prevents reading past the end of the
      // sentence when the target word sits close to the last token.
      if (skipped != index && skipped < upper && skipped < sentence.length) {
        localCollocations.add(sentence[i] + " " + sentence[skipped]);
      }
    }
    return localCollocations.toArray(new String[localCollocations.size()]);
  }

  /**
   * Get Context of a word To disambiguate
   *
   * <p>The returned array holds, in this order, the POS-window features, the
   * local-collocation features, and one {@code 1}/{@code 0} feature per model
   * entry telling whether that entry occurs among the surrounding lemmas.
   *
   * @param index      The index of the word to disambiguate
   * @param tokens     The tokens of the sentence / context
   * @param tags       The POS-tags of the sentence / context
   * @param lemmas     The lemmas of the sentence / context
   * @param ngram      The ngram to consider for context
   * @param windowSize The context window
   * @param model      The list of unigrams
   * @return The IMS context of the word to disambiguate
   */
  @Override
  public String[] getContext(int index, String[] tokens, String[] tags,
      String[] lemmas, int ngram, int windowSize, ArrayList<String> model) {

    // The POS window is built from the tags; passing the tokens here (as the
    // code used to) produced lower-cased words instead of POS tags.
    // NOTE(review): this changes the POS feature values, so models trained on
    // the old features presumably need retraining - confirm.
    String[] posOfSurroundingWords = extractPosOfSurroundingWords(index, tags,
        windowSize);

    HashSet<String> surroundingWords = new HashSet<>(Arrays.asList(
        extractSurroundingContext(index, tokens, lemmas, windowSize)));

    String[] localCollocations = extractLocalCollocations(index, tokens, ngram);

    String[] serializedFeatures = new String[posOfSurroundingWords.length
        + localCollocations.length + model.size()];

    int i = 0;
    for (String feature : posOfSurroundingWords) {
      serializedFeatures[i] = "F" + i + "=" + feature;
      i++;
    }
    for (String feature : localCollocations) {
      serializedFeatures[i] = "F" + i + "=" + feature;
      i++;
    }
    for (String word : model) {
      String flag = surroundingWords.contains(word) ? "1" : "0";
      serializedFeatures[i] = "F" + i + "=" + flag;
      i++;
    }
    return serializedFeatures;
  }

  /**
   * Get Context of a word To disambiguate
   *
   * <p>Delegates to the array-based overload using the sample's target
   * position, sentence, tags and lemmas.
   *
   * @param sample     The sample of the word to disambiguate
   * @param ngram      The ngram to consider for context
   * @param windowSize The context window
   * @param model      The list of unigrams
   * @return The IMS context of the word to disambiguate
   */
  @Override
  public String[] getContext(WSDSample sample, int ngram, int windowSize,
      ArrayList<String> model) {
    return getContext(sample.getTargetPosition(), sample.getSentence(),
        sample.getTags(), sample.getLemmas(), ngram, windowSize, model);
  }
}