opennlp-wsd/src/main/java/opennlp/tools/disambiguator/FeaturesExtractor.java - opennlp-sandbox - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */

 package opennlp.tools.disambiguator;

 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.HashMap;

 import opennlp.tools.disambiguator.ims.WTDIMS;

 /**
  * Extracts the features used by the different supervised disambiguation
  * approaches.<br>
  * Each set of methods refers to one approach:
  * <ul>
  * <li>IMS (It Makes Sense): see <a
  * href="https://www.comp.nus.edu.sg/~nght/pubs/ims.pdf">the IMS paper</a> for
  * details about this approach</li>
  * <li>SST (SuperSense Tagging): see <a
  * href="http://ttic.uchicago.edu/~altun/pubs/CiaAlt_EMNLP06.pdf">the SST
  * paper</a> for details about this approach</li>
  * </ul>
  *
  * The first methods extract the features for the IMS algorithm. Three
  * families of features are extracted:
  * <ul>
  * <li>PoS of Surrounding Words: requires one parameter, the window size</li>
  * <li>Surrounding Words: requires no parameters</li>
  * <li>Local Collocations: requires one parameter, the n-gram</li>
  * </ul>
  */
 public class FeaturesExtractor {

   public FeaturesExtractor() {
     super();
   }

   // IMS

   /**
    * Collects the lower-cased PoS tags of the words located in the window
    * {@code [wordIndex - windowSize, wordIndex + windowSize]} around the word
    * to disambiguate (the target word itself included).
    *
    * @param wordToDisambiguate
    *          the word to disambiguate; its PoS tags, sentence and word index
    *          are read
    * @param windowSize
    *          the number of positions considered on each side of the word
    * @return an array of {@code 2 * windowSize + 1} tags; positions that fall
    *         outside the sentence hold the string {@code "null"}
    */
   private String[] extractPosOfSurroundingWords(WTDIMS wordToDisambiguate,
       int windowSize) {

     String[] taggedSentence = wordToDisambiguate.getPosTags();
     int sentenceLength = wordToDisambiguate.getSentence().length;
     int firstIndex = wordToDisambiguate.getWordIndex() - windowSize;

     String[] tags = new String[2 * windowSize + 1];

     // Iterate over every slot of the result so that the last position of the
     // window (wordIndex + windowSize) is filled as well.
     for (int j = 0; j < tags.length; j++) {
       int i = firstIndex + j;
       if (i < 0 || i >= sentenceLength) {
         tags[j] = "null";
       } else {
         tags[j] = taggedSentence[i].toLowerCase();
       }
     }

     return tags;
   }

   /**
    * Collects the normalized lemmas of the words of the sentence, skipping the
    * word to disambiguate itself and every token contained in
    * {@code WSDHelper.stopWords}.
    *
    * @param wordToDisambiguate
    *          the word to disambiguate; its sentence, lemmas and word index are
    *          read
    * @return the retained lemmas, in sentence order; empty when no lemmas are
    *         available
    */
   private String[] extractSurroundingWords(WTDIMS wordToDisambiguate) {

     String[] lemmas = wordToDisambiguate.getLemmas();
     if (lemmas == null) {
       // Nothing can be extracted without lemmas.
       return new String[0];
     }

     String[] sentence = wordToDisambiguate.getSentence();
     int wordIndex = wordToDisambiguate.getWordIndex();

     ArrayList<String> contextWords = new ArrayList<String>();

     for (int i = 0; i < sentence.length; i++) {
       if (i == wordIndex
           || WSDHelper.stopWords.contains(sentence[i].toLowerCase())) {
         continue;
       }

       // Keep only the characters a-z and '_' of the lower-cased lemma.
       String lemma = lemmas[i].toLowerCase().replaceAll("[^a-z_]", "").trim();

       // Lemmas of at most one character are dropped.
       if (lemma.length() > 1) {
         contextWords.add(lemma);
       }
     }

     return contextWords.toArray(new String[contextWords.size()]);
   }

   /**
    * Collects the local collocations around the word to disambiguate: pairs
    * made of a token and the token one or two positions after it, none of
    * them being the word to disambiguate.
    *
    * @param wordToDisambiguate
    *          the word to disambiguate; its sentence and word index are read
    * @param ngram
    *          the range, around the word index, in which pairs are collected
    * @return the lower-cased collocations, each formatted as
    *         {@code "first second"}
    */
   private String[] extractLocalCollocations(WTDIMS wordToDisambiguate, int ngram) {
     // Here the author used only 11 features of this type. The range was set
     // to 3 (bigrams extracted in a way that they are at max separated by 1
     // word).

     String[] sentence = wordToDisambiguate.getSentence();
     int wordIndex = wordToDisambiguate.getWordIndex();
     int upperBound = wordIndex + ngram;

     ArrayList<String> localCollocations = new ArrayList<String>();

     for (int i = wordIndex - ngram; i <= upperBound; i++) {

       // Start positions outside [0, length - 3] are skipped, so i + 2 is
       // always a valid index below.
       if (i < 0 || i > sentence.length - 3) {
         continue;
       }
       // A pair never starts at the word to disambiguate.
       if (i == wordIndex) {
         continue;
       }

       // Adjacent pair (i, i + 1).
       if ((i + 1 != wordIndex) && (i + 1 < upperBound)) {
         localCollocations.add((sentence[i] + " " + sentence[i + 1])
             .toLowerCase());
       }
       // Pair separated by one word (i, i + 2).
       if ((i + 2 != wordIndex) && (i + 2 < upperBound)) {
         localCollocations.add((sentence[i] + " " + sentence[i + 2])
             .toLowerCase());
       }
     }

     return localCollocations.toArray(new String[localCollocations.size()]);
   }

   /**
    * Generates the full list of surrounding words from the training data.
    * These data are later used for the generation of the features qualified
    * as "Surrounding words".
    *
    * @param trainingData
    *          list of the training samples (type {@link WTDIMS})
    * @return the list of all the distinct, lower-cased surrounding words from
    *         all the training data
    */
   public ArrayList<String> extractTrainingSurroundingWords(
       ArrayList<WTDIMS> trainingData) {

     // The map is only used as a set of keys: the values are never read.
     HashMap<String, Object> words = new HashMap<String, Object>();

     for (WTDIMS word : trainingData) {
       for (String sWord : word.getSurroundingWords()) {
         // put() on an existing key leaves the key set unchanged, so no
         // containsKey() check is needed.
         words.put(sWord.toLowerCase(), null);
       }
     }

     return new ArrayList<String>(words.keySet());
   }

   /**
    * Generates the different sets of features related to the IMS approach and
    * stores them on the given {@link WTDIMS} through its setters.
    *
    * @param wordToDisambiguate
    *          the word to disambiguate [object: WTDIMS]
    * @param windowSize
    *          the parameter required to generate the features qualified as
    *          "PoS of Surrounding Words"
    * @param ngram
    *          the parameter required to generate the features qualified as
    *          "Local Collocations"
    */
   public void extractIMSFeatures(WTDIMS wordToDisambiguate, int windowSize,
       int ngram) {

     wordToDisambiguate.setPosOfSurroundingWords(extractPosOfSurroundingWords(
         wordToDisambiguate, windowSize));
     wordToDisambiguate
         .setSurroundingWords(extractSurroundingWords(wordToDisambiguate));
     wordToDisambiguate.setLocalCollocations(extractLocalCollocations(
         wordToDisambiguate, ngram));

   }

   /**
    * Generates the context of IMS and stores it on the word through
    * {@code setFeatures}. It supposes that the features have already been
    * extracted and stored in the WTDIMS object.
    * <p>
    * Every entry has the form {@code "F" + index + "=" + value}. The PoS tags
    * come first, then the local collocations, then one entry per word of
    * {@code listSurrWords} whose value is {@code 1} when that word is one of
    * the surrounding words of {@code word} and {@code 0} otherwise.
    *
    * @param word
    *          the word to disambiguate
    * @param listSurrWords
    *          the full list of surrounding words of the training data
    */
   public void serializeIMSFeatures(WTDIMS word, ArrayList<String> listSurrWords) {

     String[] posOfSurroundingWords = word.getPosOfSurroundingWords();
     ArrayList<String> surroundingWords = new ArrayList<String>(
         Arrays.asList(word.getSurroundingWords()));
     String[] localCollocations = word.getLocalCollocations();

     String[] serializedFeatures = new String[posOfSurroundingWords.length
         + localCollocations.length + listSurrWords.size()];

     // Running index shared by the three feature families.
     int i = 0;

     for (String feature : posOfSurroundingWords) {
       serializedFeatures[i] = "F" + i + "=" + feature;
       i++;
     }

     for (String feature : localCollocations) {
       serializedFeatures[i] = "F" + i + "=" + feature;
       i++;
     }

     for (String feature : listSurrWords) {
       String value = surroundingWords.contains(feature) ? "1" : "0";
       serializedFeatures[i] = "F" + i + "=" + value;
       i++;
     }

     word.setFeatures(serializedFeatures);

   }

   // SST approach

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/

	package opennlp.tools.disambiguator;

	import java.util.ArrayList;
	import java.util.Arrays;
	import java.util.HashMap;

	import opennlp.tools.disambiguator.ims.WTDIMS;

	/**
	* Class for the extraction of features for the different Supervised
	* Disambiguation approaches.<br>
	* Each set of methods refer to one approach
	* <ul>
	* <li>IMS (It Makes Sense): check {@link https
	* ://www.comp.nus.edu.sg/~nght/pubs/ims.pdf} for details about this approach</li>
	* <li>SST (SuperSense Tagging): check {@link http
	* ://ttic.uchicago.edu/~altun/pubs/CiaAlt_EMNLP06.pdf} for details about this
	* approach</li>
	* </ul>
	*
	* The first methods serve to extract the features for the algorithm IMS. Three
	* families of features are to be extracted: - PoS of Surrounding Words: it
	* requires one parameter: "Window size" - Surrounding Words: no parameters are
	* required - Local Collocations: it requires one parameter: "the n-gram"
	*
	* check {@link https://www.comp.nus.edu.sg/~nght/pubs/ims.pdf} for details
	* about this approach
	*/
	public class FeaturesExtractor {

	public FeaturesExtractor() {
	super();
	}

	// IMS

	private String[] extractPosOfSurroundingWords(WTDIMS wordToDisambiguate,
	int windowSize) {

	String[] taggedSentence = wordToDisambiguate.getPosTags();

	String[] tags = new String[2 * windowSize + 1];

	int j = 0;

	for (int i = wordToDisambiguate.getWordIndex() - windowSize; i < wordToDisambiguate
	.getWordIndex() + windowSize; i++) {
	if (i < 0 \|\| i >= wordToDisambiguate.getSentence().length) {
	tags[j] = "null";
	} else {
	tags[j] = taggedSentence[i].toLowerCase();
	}
	j++;
	}

	return tags;
	}

	private String[] extractSurroundingWords(WTDIMS wordToDisambiguate) {

	ArrayList<String> contextWords = new ArrayList<String>();

	for (int i = 0; i < wordToDisambiguate.getSentence().length; i++) {
	if (wordToDisambiguate.getLemmas() != null) {
	if (!WSDHelper.stopWords.contains(wordToDisambiguate.getSentence()[i]
	.toLowerCase()) && (wordToDisambiguate.getWordIndex() != i)) {

	String lemma = wordToDisambiguate.getLemmas()[i].toLowerCase()
	.replaceAll("[^a-z_]", "").trim();

	if (lemma.length() > 1) {
	contextWords.add(lemma);
	}

	}
	}
	}

	return contextWords.toArray(new String[contextWords.size()]);
	}

	private String[] extractLocalCollocations(WTDIMS wordToDisambiguate, int ngram) {
	/**
	* Here the author used only 11 features of this type. the range was set to
	* 3 (bigrams extracted in a way that they are at max separated by 1 word).
	*/

	ArrayList<String> localCollocations = new ArrayList<String>();

	for (int i = wordToDisambiguate.getWordIndex() - ngram; i <= wordToDisambiguate
	.getWordIndex() + ngram; i++) {

	if (!(i < 0 \|\| i > wordToDisambiguate.getSentence().length - 3)) {
	if ((i != wordToDisambiguate.getWordIndex())
	&& (i + 1 != wordToDisambiguate.getWordIndex())
	&& (i + 1 < wordToDisambiguate.getWordIndex() + ngram)) {
	String lc = (wordToDisambiguate.getSentence()[i] + " " + wordToDisambiguate
	.getSentence()[i + 1]).toLowerCase();
	localCollocations.add(lc);
	}
	if ((i != wordToDisambiguate.getWordIndex())
	&& (i + 2 != wordToDisambiguate.getWordIndex())
	&& (i + 2 < wordToDisambiguate.getWordIndex() + ngram)) {
	String lc = (wordToDisambiguate.getSentence()[i] + " " + wordToDisambiguate
	.getSentence()[i + 2]).toLowerCase();
	localCollocations.add(lc);
	}
	}

	}

	String[] res = new String[localCollocations.size()];
	res = localCollocations.toArray(res);

	return res;
	}

	/**
	* This methods generates the full list of Surrounding words, from the
	* training data. These data will be later used for the generation of the
	* features qualified of "Surrounding words
	*
	* @param trainingData
	* list of the training samples (type {@link WTDIMS}
	* @return the list of all the surrounding words from all the training data
	*/
	public ArrayList<String> extractTrainingSurroundingWords(
	ArrayList<WTDIMS> trainingData) {

	HashMap<String, Object> words = new HashMap<String, Object>();

	for (WTDIMS word : trainingData) {
	for (String sWord : word.getSurroundingWords()) {
	if (!words.containsKey(sWord.toLowerCase()));
	words.put(sWord.toLowerCase(), null);
	}
	}

	ArrayList<String> list = new ArrayList<String>();

	for (String word : words.keySet()) {
	list.add(word);
	}

	return list;

	}

	/**
	* This method generates the different set of features related to the IMS
	* approach and store them in the corresponding attributes of the WTDIMS
	*
	* @param wordToDisambiguate
	* the word to disambiguate [object: WTDIMS]
	* @param windowSize
	* the parameter required to generate the features qualified of
	* "PoS of Surrounding Words"
	* @param ngram
	* the parameter required to generate the features qualified of
	* "Local Collocations"
	*/
	public void extractIMSFeatures(WTDIMS wordToDisambiguate, int windowSize,
	int ngram) {

	wordToDisambiguate.setPosOfSurroundingWords(extractPosOfSurroundingWords(
	wordToDisambiguate, windowSize));
	wordToDisambiguate
	.setSurroundingWords(extractSurroundingWords(wordToDisambiguate));
	wordToDisambiguate.setLocalCollocations(extractLocalCollocations(
	wordToDisambiguate, ngram));

	}

	/**
	* This generates the context of IMS. It supposes that the features have
	* already been extracted and stored in the WTDIMS object, therefore it
	* doesn't require any parameters.
	*
	* @param word
	* the word to disambiguate
	* @param listSurrWords
	* the full list of surrounding words of the training data
	* @return the Context of the wordToDisambiguate
	*/
	public void serializeIMSFeatures(WTDIMS word, ArrayList<String> listSurrWords) {

	String[] posOfSurroundingWords = word.getPosOfSurroundingWords();
	ArrayList<String> surroundingWords = new ArrayList<String>(
	Arrays.asList((word.getSurroundingWords())));
	String[] localCollocations = word.getLocalCollocations();

	String[] serializedFeatures = new String[posOfSurroundingWords.length
	+ localCollocations.length + listSurrWords.size()];

	int i = 0;

	for (String feature : posOfSurroundingWords) {
	serializedFeatures[i] = "F" + i + "=" + feature;
	i++;
	}

	for (String feature : localCollocations) {
	serializedFeatures[i] = "F" + i + "=" + feature;
	i++;
	}

	for (String feature : listSurrWords) {
	serializedFeatures[i] = "F" + i + "=0";
	if (surroundingWords.contains(feature)) {
	serializedFeatures[i] = "F" + i + "=1";
	}
	i++;

	}

	word.setFeatures(serializedFeatures);

	}

	// SST approach

	}