opennlp-wsd/src/main/java/opennlp/tools/disambiguator/FeaturesExtractor.java - opennlp-sandbox - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */

 package opennlp.tools.disambiguator;


 import java.util.ArrayList;

 import opennlp.tools.disambiguator.ims.WTDIMS;

 /**
  * Extracts the features used by the IMS (It Makes Sense) word sense
  * disambiguation algorithm and serializes them into a context array.
  *
  * Three families of features are extracted:
  * - PoS of Surrounding Words: requires one parameter, the "window size";
  * - Surrounding Words: no parameters are required;
  * - Local Collocations: requires one parameter, "the n-gram".
  */
 public class FeaturesExtractor {

   public FeaturesExtractor() {
     super();
   }

   /**
    * Collects the lower-cased PoS tags of the words in a window centred on the
    * target word. The sentence is tagged with the tagger supplied by
    * {@code Loader.getTagger()}.
    *
    * @param sentence
    *          the tokens of the sentence
    * @param wordIndex
    *          the index of the target word in {@code sentence}
    * @param windowSize
    *          the number of positions considered on each side of the target
    * @return an array of {@code 2 * windowSize + 1} tags ordered from
    *         {@code wordIndex - windowSize} to {@code wordIndex + windowSize};
    *         positions that fall outside the sentence hold the string
    *         {@code "null"}
    */
   private String[] extractPosOfSurroundingWords(String[] sentence,
       int wordIndex, int windowSize) {

     String[] taggedSentence = Loader.getTagger().tag(sentence);

     String[] tags = new String[2 * windowSize + 1];

     int slot = 0;

     // The upper bound is inclusive: the array has 2 * windowSize + 1 slots, so
     // an exclusive bound would leave the last slot as a Java null and drop the
     // tag of the word at wordIndex + windowSize.
     for (int i = wordIndex - windowSize; i <= wordIndex + windowSize; i++) {
       if (i < 0 || i >= sentence.length) {
         // Placeholder for window positions outside the sentence.
         tags[slot] = "null";
       } else {
         tags[slot] = taggedSentence[i].toLowerCase();
       }
       slot++;
     }

     return tags;
   }

   /**
    * Collects the lemmas of the context words of the target word.
    *
    * A token is kept when it is not the target itself, its lower-cased form is
    * not contained in {@code Constants.stopWords}, and it still contains at
    * least one character in {@code a-z} after lower-casing. The lemma is
    * obtained from {@code Loader.getLemmatizer()} using the original token and
    * its PoS tag.
    *
    * @param sentence
    *          the tokens of the sentence
    * @param wordIndex
    *          the index of the target word in {@code sentence}
    * @return the lemmas of the retained tokens, in sentence order
    */
   private String[] extractSurroundingWords(String[] sentence, int wordIndex) {

     String[] posTags = Loader.getTagger().tag(sentence);

     ArrayList<String> contextWords = new ArrayList<String>();

     for (int i = 0; i < sentence.length; i++) {

       if (wordIndex == i
           || Constants.stopWords.contains(sentence[i].toLowerCase())) {
         continue;
       }

       // Only used to detect tokens without any letter (punctuation, digits);
       // the lemmatizer receives the untouched token.
       String lettersOnly = sentence[i].toLowerCase().replaceAll("[^a-z]", "");

       if (!lettersOnly.isEmpty()) {
         contextWords.add(Loader.getLemmatizer().lemmatize(sentence[i],
             posTags[i]));
       }
     }

     return contextWords.toArray(new String[contextWords.size()]);
   }

   /**
    * Builds the lower-cased local collocations (pairs of tokens joined by a
    * single space) found around the target word.
    *
    * For every start index {@code i} from {@code wordIndex - ngram} to
    * {@code wordIndex + ngram} two pairs are considered: the adjacent pair
    * {@code (i, i + 1)} and the pair {@code (i, i + 2)} that skips one token.
    * A pair is kept only when neither of its members is the target word and
    * its second index is lower than {@code wordIndex + ngram}.
    *
    * @param sentence
    *          the tokens of the sentence
    * @param wordIndex
    *          the index of the target word in {@code sentence}
    * @param ngram
    *          the range, on each side of the target, of the start indices
    * @return the collocations in the order they were generated
    */
   private String[] extractLocalCollocations(String[] sentence, int wordIndex,
       int ngram) {
     // Original author's note: only 11 features of this type were used; the
     // range was set to 3 (bigrams extracted in a way that they are at most
     // separated by 1 word).

     ArrayList<String> localCollocations = new ArrayList<String>();

     for (int i = wordIndex - ngram; i <= wordIndex + ngram; i++) {

       // Both i + 1 and i + 2 must be valid indices for a start index to be
       // considered at all.
       // NOTE(review): this bound also skips the adjacent pair starting at
       // sentence.length - 2 although i + 1 is still in range - confirm this
       // is intended.
       if (i < 0 || i > sentence.length - 3) {
         continue;
       }

       if ((i != wordIndex) && (i + 1 != wordIndex)
           && (i + 1 < wordIndex + ngram)) {
         localCollocations.add((sentence[i] + " " + sentence[i + 1])
             .toLowerCase());
       }
       if ((i != wordIndex) && (i + 2 != wordIndex)
           && (i + 2 < wordIndex + ngram)) {
         localCollocations.add((sentence[i] + " " + sentence[i + 2])
             .toLowerCase());
       }
     }

     return localCollocations.toArray(new String[localCollocations.size()]);
   }

   /**
    * Generates the different sets of features related to the IMS approach and
    * stores them in the corresponding attributes of the WTDIMS object.
    *
    * @param word
    *          the word to disambiguate [object: WTDIMS]
    * @param windowSize
    *          the parameter required to generate the "PoS of Surrounding
    *          Words" features
    * @param ngram
    *          the parameter required to generate the "Local Collocations"
    *          features
    */
   public void extractIMSFeatures(WTDIMS word, int windowSize, int ngram) {

     word.setPosOfSurroundingWords(extractPosOfSurroundingWords(
         word.getSentence(), word.getWordIndex(), windowSize));
     word.setSurroundingWords(extractSurroundingWords(word.getSentence(),
         word.getWordIndex()));
     word.setLocalCollocations(extractLocalCollocations(word.getSentence(),
         word.getWordIndex(), ngram));

   }

   /**
    * Generates the context of IMS. It supposes that the features have already
    * been extracted and stored in the WTDIMS object, therefore it does not
    * require any other parameter.
    *
    * Every feature is rendered as {@code F<index>=<value>}, where the index
    * runs continuously over the PoS tags, then the surrounding words, then the
    * local collocations.
    *
    * @param word
    *          the word to disambiguate, with its features already set
    * @return the context of the word to disambiguate
    */
   public String[] serializeIMSFeatures(WTDIMS word) {

     String[] posOfSurroundingWords = word.getPosOfSurroundingWords();
     String[] surroundingWords = word.getSurroundingWords();
     String[] localCollocations = word.getLocalCollocations();

     String[] serializedFeatures = new String[posOfSurroundingWords.length
         + surroundingWords.length + localCollocations.length];

     int i = 0;

     for (String feature : posOfSurroundingWords) {
       serializedFeatures[i] = "F" + i + "=" + feature;
       i++;
     }

     for (String feature : surroundingWords) {
       serializedFeatures[i] = "F" + i + "=" + feature;
       i++;
     }

     for (String feature : localCollocations) {
       serializedFeatures[i] = "F" + i + "=" + feature;
       i++;
     }

     return serializedFeatures;

   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/

	package opennlp.tools.disambiguator;


	import java.util.ArrayList;

	import opennlp.tools.disambiguator.ims.WTDIMS;

	public class FeaturesExtractor {

	public FeaturesExtractor() {
	super();
	}

	/**
	* @Algorithm: IMS (It Makes Sense)
	*
	* The following methods serve to extract the features for the
	* algorithm IMS.
	*
	* Three families of features are to be extracted: - PoS of
	* Surrounding Words: it requires one parameter: "Window size" -
	* Surrounding Words: no parameters are required - Local
	* Collocations: it requires one parameter: "the n-gram"
	*
	*/
	private String[] extractPosOfSurroundingWords(String[] sentence,
	int wordIndex, int windowSize) {

	String[] taggedSentence = Loader.getTagger().tag(sentence);

	String[] tags = new String[2 * windowSize + 1];

	int j = 0;

	for (int i = wordIndex - windowSize; i < wordIndex + windowSize; i++) {
	if (i < 0 \|\| i >= sentence.length) {
	tags[j] = "null";
	} else {
	tags[j] = taggedSentence[i].toLowerCase();
	}
	j++;
	}

	return tags;
	}

	private String[] extractSurroundingWords(String[] sentence, int wordIndex) {

	String[] posTags = Loader.getTagger().tag(sentence);

	ArrayList<String> contextWords = new ArrayList<String>();

	for (int i = 0; i < sentence.length; i++) {

	if (!Constants.stopWords.contains(sentence[i].toLowerCase())
	&& (wordIndex != i)) {

	String word = sentence[i].toLowerCase().replaceAll("[^a-z]", "").trim();

	if (!word.equals("")) {
	String lemma = Loader.getLemmatizer().lemmatize(sentence[i],
	posTags[i]);
	contextWords.add(lemma);
	}

	}
	}

	return contextWords.toArray(new String[contextWords.size()]);
	}

	private String[] extractLocalCollocations(String[] sentence, int wordIndex,
	int ngram) {
	/**
	* Here the author used only 11 features of this type. the range was set to
	* 3 (bigrams extracted in a way that they are at max separated by 1 word).
	*/

	ArrayList<String> localCollocations = new ArrayList<String>();

	for (int i = wordIndex - ngram; i <= wordIndex + ngram; i++) {

	if (!(i < 0 \|\| i > sentence.length - 3)) {
	if ((i != wordIndex) && (i + 1 != wordIndex)
	&& (i + 1 < wordIndex + ngram)) {
	String lc = (sentence[i] + " " + sentence[i + 1]).toLowerCase();
	localCollocations.add(lc);
	}
	if ((i != wordIndex) && (i + 2 != wordIndex)
	&& (i + 2 < wordIndex + ngram)) {
	String lc = (sentence[i] + " " + sentence[i + 2]).toLowerCase();
	localCollocations.add(lc);
	}
	}

	}

	String[] res = new String[localCollocations.size()];
	res = localCollocations.toArray(res);

	return res;
	}

	// public method
	/**
	* This method generates the different set of features related to the IMS
	* approach and store them in the corresponding attributes of the WTDIMS
	*
	* @param word
	* the word to disambiguate [object: WTDIMS]
	* @param windowSize
	* the parameter required to generate the features qualified of
	* "PoS of Surrounding Words"
	* @param ngram
	* the parameter required to generate the features qualified of
	* "Local Collocations"
	*/
	public void extractIMSFeatures(WTDIMS word, int windowSize, int ngram) {

	word.setPosOfSurroundingWords(extractPosOfSurroundingWords(
	word.getSentence(), word.getWordIndex(), windowSize));
	word.setSurroundingWords(extractSurroundingWords(word.getSentence(),
	word.getWordIndex()));
	word.setLocalCollocations(extractLocalCollocations(word.getSentence(),
	word.getWordIndex(), ngram));

	}

	/**
	* This generates the context of IMS. It supposes that the features have
	* already been extracted and stored in the WTDIMS object, therefore it
	* doesn't require any parameters.
	*
	* @param word
	* @return the Context of the wordToDisambiguate
	*/
	public String[] serializeIMSFeatures(WTDIMS word) {

	String[] posOfSurroundingWords = word.getPosOfSurroundingWords();
	String[] surroundingWords = word.getSurroundingWords();
	String[] localCollocations = word.getLocalCollocations();

	String[] serializedFeatures = new String[posOfSurroundingWords.length
	+ surroundingWords.length + localCollocations.length];

	int i = 0;

	for (String feature : posOfSurroundingWords) {
	serializedFeatures[i] = "F" + i + "=" + feature;
	i++;
	}

	for (String feature : surroundingWords) {
	serializedFeatures[i] = "F" + i + "=" + feature;
	i++;
	}

	for (String feature : localCollocations) {
	serializedFeatures[i] = "F" + i + "=" + feature;
	i++;
	}

	return serializedFeatures;

	}
	}