| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| package opennlp.tools.disambiguator; |
| |
| |
| import java.util.ArrayList; |
| |
| import opennlp.tools.disambiguator.ims.WTDIMS; |
| |
| public class FeaturesExtractor { |
| |
| public FeaturesExtractor() { |
| super(); |
| } |
| |
| /** |
| * @Algorithm: IMS (It Makes Sense) |
| * |
| * The following methods serve to extract the features for the |
| * algorithm IMS. |
| * |
| * Three families of features are to be extracted: - PoS of |
| * Surrounding Words: it requires one parameter: "Window size" - |
| * Surrounding Words: no parameters are required - Local |
| * Collocations: it requires one parameter: "the n-gram" |
| * |
| */ |
| private String[] extractPosOfSurroundingWords(String[] sentence, |
| int wordIndex, int windowSize) { |
| |
| String[] taggedSentence = Loader.getTagger().tag(sentence); |
| |
| String[] tags = new String[2 * windowSize + 1]; |
| |
| int j = 0; |
| |
| for (int i = wordIndex - windowSize; i < wordIndex + windowSize; i++) { |
| if (i < 0 || i >= sentence.length) { |
| tags[j] = "null"; |
| } else { |
| tags[j] = taggedSentence[i].toLowerCase(); |
| } |
| j++; |
| } |
| |
| return tags; |
| } |
| |
| private String[] extractSurroundingWords(String[] sentence, int wordIndex) { |
| |
| String[] posTags = Loader.getTagger().tag(sentence); |
| |
| ArrayList<String> contextWords = new ArrayList<String>(); |
| |
| for (int i = 0; i < sentence.length; i++) { |
| |
| if (!Constants.stopWords.contains(sentence[i].toLowerCase()) |
| && (wordIndex != i)) { |
| |
| String word = sentence[i].toLowerCase().replaceAll("[^a-z]", "").trim(); |
| |
| if (!word.equals("")) { |
| String lemma = Loader.getLemmatizer().lemmatize(sentence[i], |
| posTags[i]); |
| contextWords.add(lemma); |
| } |
| |
| } |
| } |
| |
| return contextWords.toArray(new String[contextWords.size()]); |
| } |
| |
| private String[] extractLocalCollocations(String[] sentence, int wordIndex, |
| int ngram) { |
| /** |
| * Here the author used only 11 features of this type. the range was set to |
| * 3 (bigrams extracted in a way that they are at max separated by 1 word). |
| */ |
| |
| ArrayList<String> localCollocations = new ArrayList<String>(); |
| |
| for (int i = wordIndex - ngram; i <= wordIndex + ngram; i++) { |
| |
| if (!(i < 0 || i > sentence.length - 3)) { |
| if ((i != wordIndex) && (i + 1 != wordIndex) |
| && (i + 1 < wordIndex + ngram)) { |
| String lc = (sentence[i] + " " + sentence[i + 1]).toLowerCase(); |
| localCollocations.add(lc); |
| } |
| if ((i != wordIndex) && (i + 2 != wordIndex) |
| && (i + 2 < wordIndex + ngram)) { |
| String lc = (sentence[i] + " " + sentence[i + 2]).toLowerCase(); |
| localCollocations.add(lc); |
| } |
| } |
| |
| } |
| |
| String[] res = new String[localCollocations.size()]; |
| res = localCollocations.toArray(res); |
| |
| return res; |
| } |
| |
| // public method |
| /** |
| * This method generates the different set of features related to the IMS |
| * approach and store them in the corresponding attributes of the WTDIMS |
| * |
| * @param word |
| * the word to disambiguate [object: WTDIMS] |
| * @param windowSize |
| * the parameter required to generate the features qualified of |
| * "PoS of Surrounding Words" |
| * @param ngram |
| * the parameter required to generate the features qualified of |
| * "Local Collocations" |
| */ |
| public void extractIMSFeatures(WTDIMS word, int windowSize, int ngram) { |
| |
| word.setPosOfSurroundingWords(extractPosOfSurroundingWords( |
| word.getSentence(), word.getWordIndex(), windowSize)); |
| word.setSurroundingWords(extractSurroundingWords(word.getSentence(), |
| word.getWordIndex())); |
| word.setLocalCollocations(extractLocalCollocations(word.getSentence(), |
| word.getWordIndex(), ngram)); |
| |
| } |
| |
| /** |
| * This generates the context of IMS. It supposes that the features have |
| * already been extracted and stored in the WTDIMS object, therefore it |
| * doesn't require any parameters. |
| * |
| * @param word |
| * @return the Context of the wordToDisambiguate |
| */ |
| public String[] serializeIMSFeatures(WTDIMS word) { |
| |
| String[] posOfSurroundingWords = word.getPosOfSurroundingWords(); |
| String[] surroundingWords = word.getSurroundingWords(); |
| String[] localCollocations = word.getLocalCollocations(); |
| |
| String[] serializedFeatures = new String[posOfSurroundingWords.length |
| + surroundingWords.length + localCollocations.length]; |
| |
| int i = 0; |
| |
| for (String feature : posOfSurroundingWords) { |
| serializedFeatures[i] = "F" + i + "=" + feature; |
| i++; |
| } |
| |
| for (String feature : surroundingWords) { |
| serializedFeatures[i] = "F" + i + "=" + feature; |
| i++; |
| } |
| |
| for (String feature : localCollocations) { |
| serializedFeatures[i] = "F" + i + "=" + feature; |
| i++; |
| } |
| |
| return serializedFeatures; |
| |
| } |
| } |