/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package opennlp.tools.disambiguator;
import java.util.ArrayList;
import opennlp.tools.disambiguator.ims.WTDIMS;
/**
 * Extracts the features used by the IMS (It Makes Sense) disambiguation
 * algorithm and serializes them into a context array.
 *
 * <p>
 * Three families of features are extracted:
 * <ul>
 * <li>PoS of Surrounding Words: requires one parameter, the "window size";</li>
 * <li>Surrounding Words: requires no parameter;</li>
 * <li>Local Collocations: requires one parameter, the "n-gram" range.</li>
 * </ul>
 */
public class FeaturesExtractor {

  /** Placeholder emitted for window positions that fall outside the sentence. */
  private static final String OUT_OF_SENTENCE_TAG = "null";

  public FeaturesExtractor() {
  }

  /**
   * Collects the lower-cased PoS tags of the words located in a window centered
   * on the target word (the tag of the target word itself is included).
   *
   * @param sentence
   *          the tokens of the sentence
   * @param wordIndex
   *          the index of the target word in {@code sentence}
   * @param windowSize
   *          the number of positions to consider on each side of the target
   * @return an array of exactly {@code 2 * windowSize + 1} tags, ordered from
   *         {@code wordIndex - windowSize} to {@code wordIndex + windowSize};
   *         positions outside the sentence hold the string {@code "null"}
   */
  private String[] extractPosOfSurroundingWords(String[] sentence,
      int wordIndex, int windowSize) {

    String[] taggedSentence = Loader.getTagger().tag(sentence);
    String[] tags = new String[2 * windowSize + 1];

    // Iterate over every slot of the result so that the right-most position
    // (wordIndex + windowSize) is filled too. Stopping one position early
    // would leave the last slot as a Java null instead of a real tag.
    int firstPosition = wordIndex - windowSize;
    for (int slot = 0; slot < tags.length; slot++) {
      int position = firstPosition + slot;
      if (position < 0 || position >= sentence.length) {
        tags[slot] = OUT_OF_SENTENCE_TAG;
      } else {
        tags[slot] = taggedSentence[position].toLowerCase();
      }
    }

    return tags;
  }

  /**
   * Collects the lemmas of the context words of the sentence, i.e. every word
   * except the target word, the stop words and the tokens that contain no
   * letter in the range a-z once lower-cased.
   *
   * @param sentence
   *          the tokens of the sentence
   * @param wordIndex
   *          the index of the target word in {@code sentence}
   * @return the lemmas of the retained words, in sentence order
   */
  private String[] extractSurroundingWords(String[] sentence, int wordIndex) {

    String[] posTags = Loader.getTagger().tag(sentence);
    ArrayList<String> contextWords = new ArrayList<String>();

    for (int i = 0; i < sentence.length; i++) {
      if (Constants.stopWords.contains(sentence[i].toLowerCase())
          || wordIndex == i) {
        continue;
      }

      // The stripped form is only used to reject tokens without any letter
      // (punctuation, numbers, ...); the lemma is computed from the original
      // token together with its PoS tag.
      String lettersOnly = sentence[i].toLowerCase().replaceAll("[^a-z]", "");
      if (lettersOnly.isEmpty()) {
        continue;
      }

      contextWords.add(Loader.getLemmatizer().lemmatize(sentence[i],
          posTags[i]));
    }

    return contextWords.toArray(new String[contextWords.size()]);
  }

  /**
   * Collects the local collocations around the target word: lower-cased pairs
   * of words that are either adjacent or separated by exactly one word. A pair
   * never contains the target word as one of its two members, and its second
   * member is always located before {@code wordIndex + ngram}.
   *
   * @param sentence
   *          the tokens of the sentence
   * @param wordIndex
   *          the index of the target word in {@code sentence}
   * @param ngram
   *          the range, on each side of the target word, in which the pairs
   *          are searched
   * @return the collocations, each formatted as {@code "first second"}
   */
  private String[] extractLocalCollocations(String[] sentence, int wordIndex,
      int ngram) {
    /*
     * Here the author used only 11 features of this type. The range was set to
     * 3 (bigrams extracted in a way that they are at max separated by 1 word).
     */
    ArrayList<String> localCollocations = new ArrayList<String>();

    // Exclusive upper bound for the position of the second word of a pair.
    int limit = wordIndex + ngram;

    for (int i = wordIndex - ngram; i <= wordIndex + ngram; i++) {
      if (i < 0 || i == wordIndex) {
        continue;
      }

      // Each pair is bounds-checked on its own: a single guard on i + 2 would
      // also discard the valid adjacent pair formed by the last two words of
      // the sentence.
      int adjacent = i + 1;
      if (adjacent < sentence.length && adjacent != wordIndex
          && adjacent < limit) {
        localCollocations.add(collocation(sentence[i], sentence[adjacent]));
      }

      int skipOne = i + 2;
      if (skipOne < sentence.length && skipOne != wordIndex
          && skipOne < limit) {
        localCollocations.add(collocation(sentence[i], sentence[skipOne]));
      }
    }

    return localCollocations.toArray(new String[localCollocations.size()]);
  }

  /** Joins two words with a single space and lower-cases the result. */
  private static String collocation(String first, String second) {
    return (first + " " + second).toLowerCase();
  }

  // public methods

  /**
   * This method generates the different sets of features related to the IMS
   * approach and stores them in the corresponding attributes of the WTDIMS.
   *
   * @param word
   *          the word to disambiguate [object: WTDIMS]
   * @param windowSize
   *          the parameter required to generate the features qualified of
   *          "PoS of Surrounding Words"
   * @param ngram
   *          the parameter required to generate the features qualified of
   *          "Local Collocations"
   */
  public void extractIMSFeatures(WTDIMS word, int windowSize, int ngram) {

    String[] sentence = word.getSentence();
    int wordIndex = word.getWordIndex();

    word.setPosOfSurroundingWords(extractPosOfSurroundingWords(sentence,
        wordIndex, windowSize));
    word.setSurroundingWords(extractSurroundingWords(sentence, wordIndex));
    word.setLocalCollocations(extractLocalCollocations(sentence, wordIndex,
        ngram));
  }

  /**
   * This generates the context of IMS. It supposes that the features have
   * already been extracted and stored in the WTDIMS object (see
   * {@link #extractIMSFeatures(WTDIMS, int, int)}), therefore it doesn't
   * require any other parameter.
   *
   * <p>
   * The features are emitted in the order PoS of surrounding words,
   * surrounding words, local collocations. Each entry has the form
   * {@code "F<position>=<feature>"}, where the position is the index of the
   * entry in the returned array.
   *
   * @param word
   *          the word to disambiguate, holding the extracted features
   * @return the Context of the wordToDisambiguate
   */
  public String[] serializeIMSFeatures(WTDIMS word) {

    String[] posOfSurroundingWords = word.getPosOfSurroundingWords();
    String[] surroundingWords = word.getSurroundingWords();
    String[] localCollocations = word.getLocalCollocations();

    String[] serializedFeatures = new String[posOfSurroundingWords.length
        + surroundingWords.length + localCollocations.length];

    int next = 0;
    next = appendFeatures(serializedFeatures, next, posOfSurroundingWords);
    next = appendFeatures(serializedFeatures, next, surroundingWords);
    appendFeatures(serializedFeatures, next, localCollocations);

    return serializedFeatures;
  }

  /**
   * Writes the given features into {@code target}, starting at {@code start},
   * each one prefixed by {@code "F<position>="}.
   *
   * @return the position following the last written entry
   */
  private static int appendFeatures(String[] target, int start,
      String[] features) {
    int position = start;
    for (String feature : features) {
      target[position] = "F" + position + "=" + feature;
      position++;
    }
    return position;
  }
}