blob: 6d264809de8e14c60bc40063e094abe7d254141c [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package opennlp.tools.disambiguator;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import opennlp.tools.disambiguator.ims.WTDIMS;
/**
 * Extracts the features used by the different supervised disambiguation
 * approaches.<br>
 * Each set of methods refers to one approach:
 * <ul>
 * <li>IMS (It Makes Sense): see
 * <a href="https://www.comp.nus.edu.sg/~nght/pubs/ims.pdf">the IMS paper</a>
 * for details about this approach</li>
 * <li>SST (SuperSense Tagging): see
 * <a href="http://ttic.uchicago.edu/~altun/pubs/CiaAlt_EMNLP06.pdf">the SST
 * paper</a> for details about this approach</li>
 * </ul>
 *
 * The first methods extract the features for the IMS algorithm. Three
 * families of features are extracted:
 * <ul>
 * <li>PoS of Surrounding Words: requires one parameter, the "window size"</li>
 * <li>Surrounding Words: requires no parameters</li>
 * <li>Local Collocations: requires one parameter, "the n-gram"</li>
 * </ul>
 */
public class FeaturesExtractor {

  /** Placeholder tag used for window positions that fall outside the sentence. */
  private static final String OUT_OF_SENTENCE_TAG = "null";

  /**
   * Constructor
   */
  public FeaturesExtractor() {
  }

  // IMS approach

  /**
   * Collects the lower-cased PoS tags in a window centred on the target word.
   *
   * @param sentence
   *          the tokens of the sentence
   * @param wordIndex
   *          the index of the target word in {@code sentence}
   * @param windowSize
   *          the number of positions taken on each side of the target word
   * @return an array of {@code 2 * windowSize + 1} tags covering the positions
   *         {@code wordIndex - windowSize} to {@code wordIndex + windowSize}
   *         inclusive; positions outside the sentence hold the text "null"
   */
  private String[] extractPosOfSurroundingWords(String[] sentence,
      int wordIndex, int windowSize) {
    // presumably one tag per token of the sentence - confirm against the tagger
    String[] taggedSentence = Loader.getTagger().tag(sentence);
    String[] tags = new String[2 * windowSize + 1];
    int firstPosition = wordIndex - windowSize;

    // Walk over every slot of the result so that the right edge of the window
    // (wordIndex + windowSize) is filled as well; the previous loop stopped one
    // position early and left the last slot unset.
    for (int slot = 0; slot < tags.length; slot++) {
      int position = firstPosition + slot;
      if (position < 0 || position >= sentence.length) {
        tags[slot] = OUT_OF_SENTENCE_TAG;
      } else {
        // Locale.ROOT keeps the result independent of the JVM default locale.
        tags[slot] = taggedSentence[position].toLowerCase(Locale.ROOT);
      }
    }
    return tags;
  }

  /**
   * Collects the lemmas of the context words of the target word. A token is
   * kept when it is not the target word, is not contained in
   * {@code Constants.stopWords}, and, once lower-cased and stripped of every
   * character outside {@code a-z}, is a key of {@code Loader.getEnglishWords()}.
   *
   * @param sentence
   *          the tokens of the sentence
   * @param wordIndex
   *          the index of the target word in {@code sentence}
   * @return the lemmas of the retained tokens, in sentence order
   */
  private String[] extractSurroundingWords(String[] sentence, int wordIndex) {
    String[] posTags = Loader.getTagger().tag(sentence);
    ArrayList<String> contextWords = new ArrayList<String>();

    for (int i = 0; i < sentence.length; i++) {
      if (i == wordIndex) {
        continue;
      }
      // Locale.ROOT keeps the result independent of the JVM default locale.
      String lowered = sentence[i].toLowerCase(Locale.ROOT);
      if (Constants.stopWords.contains(lowered)) {
        continue;
      }
      // Only the letters a-z survive, so no trimming is needed afterwards.
      String word = lowered.replaceAll("[^a-z]", "");
      if (Loader.getEnglishWords().containsKey(word)) {
        contextWords.add(Loader.getLemmatizer().lemmatize(word, posTags[i]));
      }
    }
    return contextWords.toArray(new String[contextWords.size()]);
  }

  /**
   * Collects the local collocations around the target word: lower-cased pairs
   * of tokens, joined by a single space, that are either adjacent or separated
   * by exactly one token. Neither member of a pair is the target word itself.
   *
   * @param sentence
   *          the tokens of the sentence
   * @param wordIndex
   *          the index of the target word in {@code sentence}
   * @param ngram
   *          the range, in tokens, explored on each side of the target word
   * @return the collocations found, ordered by the position of their first
   *         token (the adjacent pair comes before the pair with a gap)
   */
  private String[] extractLocalCollocations(String[] sentence, int wordIndex,
      int ngram) {
    // Here the author used only 11 features of this type. The range was set to
    // 3 (bigrams extracted in a way that they are at max separated by 1 word).
    //
    // NOTE(review): the second token of a pair must lie strictly below
    // wordIndex + ngram whereas the first token may start at wordIndex - ngram,
    // and the "sentence.length - 3" guard also drops the adjacent pair that
    // starts at the second-to-last token. Both are kept unchanged so that the
    // produced features stay the same - confirm whether they are intended.
    ArrayList<String> localCollocations = new ArrayList<String>();
    int upperBound = wordIndex + ngram;

    for (int first = wordIndex - ngram; first <= upperBound; first++) {
      if (first < 0 || first > sentence.length - 3 || first == wordIndex) {
        continue;
      }
      // gap 1: adjacent tokens; gap 2: tokens separated by one word
      for (int gap = 1; gap <= 2; gap++) {
        int second = first + gap;
        if (second != wordIndex && second < upperBound) {
          String lc = (sentence[first] + " " + sentence[second])
              .toLowerCase(Locale.ROOT);
          localCollocations.add(lc);
        }
      }
    }
    return localCollocations.toArray(new String[localCollocations.size()]);
  }

  /**
   * Generates the full list of surrounding words from the training data. This
   * list is later used to generate the features qualified of "Surrounding
   * Words". Words are appended in the order they are met and duplicates are
   * not removed.
   *
   * @param trainingData
   *          list of the training samples (type {@link WTDIMS})
   * @return the list of all the surrounding words from all the training data
   */
  public ArrayList<String> extractTrainingSurroundingWords(
      ArrayList<WTDIMS> trainingData) {
    ArrayList<String> list = new ArrayList<String>();
    for (WTDIMS word : trainingData) {
      list.addAll(Arrays.asList(word.getSurroundingWords()));
    }
    return list;
  }

  /**
   * Generates the different sets of features related to the IMS approach and
   * stores them in the corresponding attributes of the {@link WTDIMS}.
   *
   * @param word
   *          the word to disambiguate [object: WTDIMS]
   * @param windowSize
   *          the parameter required to generate the features qualified of
   *          "PoS of Surrounding Words"
   * @param ngram
   *          the parameter required to generate the features qualified of
   *          "Local Collocations"
   */
  public void extractIMSFeatures(WTDIMS word, int windowSize, int ngram) {
    word.setPosOfSurroundingWords(extractPosOfSurroundingWords(
        word.getSentence(), word.getWordIndex(), windowSize));
    word.setSurroundingWords(extractSurroundingWords(word.getSentence(),
        word.getWordIndex()));
    word.setLocalCollocations(extractLocalCollocations(word.getSentence(),
        word.getWordIndex(), ngram));
  }

  /**
   * Generates the context of IMS and stores it in the {@link WTDIMS} through
   * {@code setFeatures}. It supposes that the features have already been
   * extracted and stored in the WTDIMS object.
   * <p>
   * Every feature is serialized as {@code F<i>=<value>}, where {@code i} is its
   * position in the resulting array. The PoS tags come first, then the local
   * collocations, then one entry per element of {@code listSurrWords} whose
   * value is {@code 1} when that word is one of the surrounding words of
   * {@code word} and {@code 0} otherwise.
   *
   * @param word
   *          the word to disambiguate
   * @param listSurrWords
   *          the full list of surrounding words of the training data
   */
  public void serializeIMSFeatures(WTDIMS word, ArrayList<String> listSurrWords) {
    String[] posOfSurroundingWords = word.getPosOfSurroundingWords();
    // A hash set gives constant-time membership tests in the loop over the
    // (potentially large) training vocabulary below.
    Set<String> surroundingWords = new HashSet<String>(
        Arrays.asList(word.getSurroundingWords()));
    String[] localCollocations = word.getLocalCollocations();

    String[] serializedFeatures = new String[posOfSurroundingWords.length
        + localCollocations.length + listSurrWords.size()];

    int i = 0;
    for (String feature : posOfSurroundingWords) {
      serializedFeatures[i] = "F" + i + "=" + feature;
      i++;
    }
    for (String feature : localCollocations) {
      serializedFeatures[i] = "F" + i + "=" + feature;
      i++;
    }
    for (String feature : listSurrWords) {
      String value = surroundingWords.contains(feature) ? "1" : "0";
      serializedFeatures[i] = "F" + i + "=" + value;
      i++;
    }
    word.setFeatures(serializedFeatures);
  }

  // SST approach
}