opennlp-wsd/src/main/java/opennlp/tools/disambiguator/FeaturesExtractor.java - opennlp-sandbox - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */

 package opennlp.tools.disambiguator;

 import java.util.ArrayList;
 import java.util.Arrays;

 import opennlp.tools.disambiguator.ims.WTDIMS;

 /**
  * Extracts the features used by the supervised word sense disambiguation
  * approaches.<br>
  * Each set of methods refers to one approach:
  * <ul>
  * <li>IMS (It Makes Sense): see <a
  * href="https://www.comp.nus.edu.sg/~nght/pubs/ims.pdf">the IMS paper</a> for
  * details about this approach</li>
  * <li>SST (SuperSense Tagging): see <a
  * href="http://ttic.uchicago.edu/~altun/pubs/CiaAlt_EMNLP06.pdf">the SST
  * paper</a> for details about this approach</li>
  * </ul>
  *
  * Three families of features are extracted for IMS:
  * <ul>
  * <li>PoS of Surrounding Words: requires one parameter, the window size</li>
  * <li>Surrounding Words: requires no parameter</li>
  * <li>Local Collocations: requires one parameter, the n-gram range</li>
  * </ul>
  */
 public class FeaturesExtractor {

   /**
    * Constructor
    */
   public FeaturesExtractor() {
     super();
   }

   // IMS approach

   /**
    * Collects the lower-cased PoS tags of the tokens in the inclusive window
    * {@code [wordIndex - windowSize, wordIndex + windowSize]}.
    *
    * @param posTags
    *          the PoS tags of the sentence, one per token
    * @param wordIndex
    *          the index of the word to disambiguate
    * @param windowSize
    *          the number of tokens to consider on each side of the word
    * @return an array of {@code 2 * windowSize + 1} tags; positions that fall
    *         outside the sentence hold the string {@code "null"}
    */
   private String[] extractPosOfSurroundingWords(String[] posTags,
       int wordIndex, int windowSize) {

     String[] tags = new String[2 * windowSize + 1];

     // Fill every slot of the window. The previous loop stopped one position
     // early, so the last slot was never assigned and the tag at
     // wordIndex + windowSize was lost.
     for (int j = 0; j < tags.length; j++) {
       int i = wordIndex - windowSize + j;
       if (i < 0 || i >= posTags.length) {
         tags[j] = "null";
       } else {
         tags[j] = posTags[i].toLowerCase();
       }
     }

     return tags;
   }

   /**
    * Collects the lemmas of the context words of the sentence. A token is kept
    * when it is not the word to disambiguate, is not contained in
    * {@code Constants.stopWords}, and, once lower-cased and stripped of every
    * character outside {@code a-z}, is a key of
    * {@code Loader.getEnglishWords()}.
    *
    * @param sentence
    *          the tokens of the sentence
    * @param posTags
    *          the PoS tags of the sentence, one per token
    * @param wordIndex
    *          the index of the word to disambiguate
    * @return the lemmas of the retained tokens, in sentence order
    */
   private String[] extractSurroundingWords(String[] sentence,
       String[] posTags, int wordIndex) {

     ArrayList<String> contextWords = new ArrayList<String>();

     for (int i = 0; i < sentence.length; i++) {

       String lowerCased = sentence[i].toLowerCase();

       if (!Constants.stopWords.contains(lowerCased) && (wordIndex != i)) {

         // Only the letters a-z survive, so no whitespace is left to trim.
         String word = lowerCased.replaceAll("[^a-z]", "");

         if (Loader.getEnglishWords().containsKey(word)) {
           String lemma = Loader.getLemmatizer().lemmatize(word, posTags[i]);
           contextWords.add(lemma);
         }

       }
     }

     return contextWords.toArray(new String[contextWords.size()]);
   }

   /**
    * Collects lower-cased word pairs around the word to disambiguate. For every
    * start position {@code i} in {@code [wordIndex - ngram, wordIndex + ngram]}
    * that also satisfies {@code 0 <= i <= sentence.length - 3}, the pair
    * {@code sentence[i] + " " + sentence[i + k]} is added for {@code k = 1} and
    * {@code k = 2} whenever neither member is the word itself and
    * {@code i + k < wordIndex + ngram}.
    *
    * @param sentence
    *          the tokens of the sentence
    * @param wordIndex
    *          the index of the word to disambiguate
    * @param ngram
    *          the range around the word in which pairs are collected
    * @return the collected pairs, in the order they were generated
    */
   private String[] extractLocalCollocations(String[] sentence, int wordIndex,
       int ngram) {
     // Here the author used only 11 features of this type. The range was set to
     // 3 (bigrams extracted in a way that they are at max separated by 1 word).
     //
     // NOTE(review): the bound "i > sentence.length - 3" also rejects
     // i == sentence.length - 2, although the adjacent pair (i, i + 1) would
     // still be inside the sentence there. Kept as is; confirm the intent
     // before changing it, since it alters the generated features.

     ArrayList<String> localCollocations = new ArrayList<String>();

     for (int i = wordIndex - ngram; i <= wordIndex + ngram; i++) {

       if (!(i < 0 || i > sentence.length - 3)) {
         if ((i != wordIndex) && (i + 1 != wordIndex)
             && (i + 1 < wordIndex + ngram)) {
           String lc = (sentence[i] + " " + sentence[i + 1]).toLowerCase();
           localCollocations.add(lc);
         }
         if ((i != wordIndex) && (i + 2 != wordIndex)
             && (i + 2 < wordIndex + ngram)) {
           String lc = (sentence[i] + " " + sentence[i + 2]).toLowerCase();
           localCollocations.add(lc);
         }
       }

     }

     return localCollocations.toArray(new String[localCollocations.size()]);
   }

   /**
    * Generates the full list of surrounding words from the training data.
    * These data are later used to generate the features qualified as
    * "Surrounding Words". The words are appended sample after sample and
    * duplicates are kept.
    *
    * @param trainingData
    *          list of the training samples (type {@link WTDIMS})
    * @return the list of all the surrounding words from all the training data
    */
   public ArrayList<String> extractTrainingSurroundingWords(
       ArrayList<WTDIMS> trainingData) {

     ArrayList<String> list = new ArrayList<String>();

     for (WTDIMS word : trainingData) {
       for (String sWord : word.getSurroundingWords()) {
         list.add(sWord);
       }
     }

     return list;

   }

   /**
    * Generates the different sets of features related to the IMS approach and
    * stores them in the corresponding attributes of the {@link WTDIMS}.
    *
    * @param word
    *          the word to disambiguate [object: WTDIMS]
    * @param windowSize
    *          the parameter required to generate the features qualified as
    *          "PoS of Surrounding Words"
    * @param ngram
    *          the parameter required to generate the features qualified as
    *          "Local Collocations"
    */
   public void extractIMSFeatures(WTDIMS word, int windowSize, int ngram) {

     String[] sentence = word.getSentence();
     int wordIndex = word.getWordIndex();

     // Tag the sentence once and share the tags between the two extractors
     // that need them, instead of running the tagger twice per word.
     String[] posTags = Loader.getTagger().tag(sentence);

     word.setPosOfSurroundingWords(extractPosOfSurroundingWords(posTags,
         wordIndex, windowSize));
     word.setSurroundingWords(extractSurroundingWords(sentence, posTags,
         wordIndex));
     word.setLocalCollocations(extractLocalCollocations(sentence, wordIndex,
         ngram));

   }

   /**
    * Serializes the IMS features of a word into strings of the form
    * {@code F<index>=<value>} and stores them in the {@link WTDIMS} object. It
    * supposes that the features have already been extracted and stored in the
    * object (see {@link #extractIMSFeatures(WTDIMS, int, int)}).
    * <p>
    * The PoS tags come first, then the local collocations, then one binary
    * feature per entry of {@code listSurrWords}: {@code 1} when that entry is
    * one of the surrounding words of {@code word}, {@code 0} otherwise.
    *
    * @param word
    *          the word to disambiguate
    * @param listSurrWords
    *          the full list of surrounding words of the training data
    */
   public void serializeIMSFeatures(WTDIMS word, ArrayList<String> listSurrWords) {

     String[] posOfSurroundingWords = word.getPosOfSurroundingWords();
     ArrayList<String> surroundingWords = new ArrayList<String>(
         Arrays.asList(word.getSurroundingWords()));
     String[] localCollocations = word.getLocalCollocations();

     String[] serializedFeatures = new String[posOfSurroundingWords.length
         + localCollocations.length + listSurrWords.size()];

     int i = 0;

     for (String feature : posOfSurroundingWords) {
       serializedFeatures[i] = "F" + i + "=" + feature;
       i++;
     }

     for (String feature : localCollocations) {
       serializedFeatures[i] = "F" + i + "=" + feature;
       i++;
     }

     for (String feature : listSurrWords) {
       String value = surroundingWords.contains(feature) ? "1" : "0";
       serializedFeatures[i] = "F" + i + "=" + value;
       i++;
     }

     word.setFeatures(serializedFeatures);

   }

   // SST approach

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/

	package opennlp.tools.disambiguator;

	import java.util.ArrayList;
	import java.util.Arrays;

	import opennlp.tools.disambiguator.ims.WTDIMS;

	/**
	* Class for the extraction of features for the different Supervised
	* Disambiguation apporaches.<br>
	* Each set of methods refer to one approach
	* <ul>
	* <li>IMS (It Makes Sense): check {@link https
	* ://www.comp.nus.edu.sg/~nght/pubs/ims.pdf} for details about this approach</li>
	* <li>SST (SuperSense Tagging): check {@link http
	* ://ttic.uchicago.edu/~altun/pubs/CiaAlt_EMNLP06.pdf} for details about this
	* approach</li>
	* </ul>
	*
	* The first methods serve to extract the features for the algorithm IMS. Three
	* families of features are to be extracted: - PoS of Surrounding Words: it
	* requires one parameter: "Window size" - Surrounding Words: no parameters are
	* required - Local Collocations: it requires one parameter: "the n-gram"
	*
	* check {@link https://www.comp.nus.edu.sg/~nght/pubs/ims.pdf} for details
	* about this approach
	*/

	public class FeaturesExtractor {

	/**
	* Constructor
	*/
	public FeaturesExtractor() {
	super();
	}

	// IMS approach

	private String[] extractPosOfSurroundingWords(String[] sentence,
	int wordIndex, int windowSize) {

	String[] taggedSentence = Loader.getTagger().tag(sentence);

	String[] tags = new String[2 * windowSize + 1];

	int j = 0;

	for (int i = wordIndex - windowSize; i < wordIndex + windowSize; i++) {
	if (i < 0 \|\| i >= sentence.length) {
	tags[j] = "null";
	} else {
	tags[j] = taggedSentence[i].toLowerCase();
	}
	j++;
	}

	return tags;
	}

	private String[] extractSurroundingWords(String[] sentence, int wordIndex) {

	String[] posTags = Loader.getTagger().tag(sentence);

	ArrayList<String> contextWords = new ArrayList<String>();

	for (int i = 0; i < sentence.length; i++) {

	if (!Constants.stopWords.contains(sentence[i].toLowerCase())
	&& (wordIndex != i)) {

	String word = sentence[i].toLowerCase().replaceAll("[^a-z]", "").trim();

	// if (!word.equals("") /&& Constants.isRelevant(posTags[i])/) {
	if (Loader.getEnglishWords().containsKey(word)) {
	String lemma = Loader.getLemmatizer().lemmatize(word, posTags[i]);
	contextWords.add(lemma);
	}

	}
	}

	return contextWords.toArray(new String[contextWords.size()]);
	}

	private String[] extractLocalCollocations(String[] sentence, int wordIndex,
	int ngram) {
	/**
	* Here the author used only 11 features of this type. the range was set to
	* 3 (bigrams extracted in a way that they are at max separated by 1 word).
	*/

	ArrayList<String> localCollocations = new ArrayList<String>();

	for (int i = wordIndex - ngram; i <= wordIndex + ngram; i++) {

	if (!(i < 0 \|\| i > sentence.length - 3)) {
	if ((i != wordIndex) && (i + 1 != wordIndex)
	&& (i + 1 < wordIndex + ngram)) {
	String lc = (sentence[i] + " " + sentence[i + 1]).toLowerCase();
	localCollocations.add(lc);
	}
	if ((i != wordIndex) && (i + 2 != wordIndex)
	&& (i + 2 < wordIndex + ngram)) {
	String lc = (sentence[i] + " " + sentence[i + 2]).toLowerCase();
	localCollocations.add(lc);
	}
	}

	}

	String[] res = new String[localCollocations.size()];
	res = localCollocations.toArray(res);

	return res;
	}

	/**
	* This methods generates the full list of Surrounding words, from the
	* training data. These data will be later used for the generation of the
	* features qualified of "Surrounding words
	*
	* @param trainingData
	* list of the training samples (type {@link WTDIMS}
	* @return the list of all the surrounding words from all the training data
	*/
	public ArrayList<String> extractTrainingSurroundingWords(
	ArrayList<WTDIMS> trainingData) {

	ArrayList<String> list = new ArrayList<String>();

	for (WTDIMS word : trainingData) {
	for (String sWord : word.getSurroundingWords()) {
	list.add(sWord);
	}
	}

	return list;

	}

	/**
	* This method generates the different set of features related to the IMS
	* approach and store them in the corresponding attributes of the WTDIMS
	*
	* @param word
	* the word to disambiguate [object: WTDIMS]
	* @param windowSize
	* the parameter required to generate the features qualified of
	* "PoS of Surrounding Words"
	* @param ngram
	* the parameter required to generate the features qualified of
	* "Local Collocations"
	*/
	public void extractIMSFeatures(WTDIMS word, int windowSize, int ngram) {

	word.setPosOfSurroundingWords(extractPosOfSurroundingWords(
	word.getSentence(), word.getWordIndex(), windowSize));
	word.setSurroundingWords(extractSurroundingWords(word.getSentence(),
	word.getWordIndex()));
	word.setLocalCollocations(extractLocalCollocations(word.getSentence(),
	word.getWordIndex(), ngram));

	}

	/**
	* This generates the context of IMS. It supposes that the features have
	* already been extracted and stored in the WTDIMS object, therefore it
	* doesn't require any parameters.
	*
	* @param word
	* the word to disambiguate
	* @param listSurrWords
	* the full list of surrounding words of the training data
	* @return the Context of the wordToDisambiguate
	*/
	public void serializeIMSFeatures(WTDIMS word, ArrayList<String> listSurrWords) {

	String[] posOfSurroundingWords = word.getPosOfSurroundingWords();
	ArrayList<String> surroundingWords = new ArrayList<String>(
	Arrays.asList((word.getSurroundingWords())));
	String[] localCollocations = word.getLocalCollocations();

	String[] serializedFeatures = new String[posOfSurroundingWords.length
	+ localCollocations.length + listSurrWords.size()];

	int i = 0;

	for (String feature : posOfSurroundingWords) {
	serializedFeatures[i] = "F" + i + "=" + feature;
	i++;
	}

	for (String feature : localCollocations) {
	serializedFeatures[i] = "F" + i + "=" + feature;
	i++;
	}

	for (String feature : listSurrWords) {
	serializedFeatures[i] = "F" + i + "=0";
	if (surroundingWords.contains(feature)) {
	serializedFeatures[i] = "F" + i + "=1";
	}
	i++;

	}

	word.setFeatures(serializedFeatures);

	}

	// SST approach

	}