opennlp-wsd/src/main/java/opennlp/tools/disambiguator/IMSWSDContextGenerator.java - opennlp-sandbox - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License. You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package opennlp.tools.disambiguator;

 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.HashSet;
 import java.util.List;
 import java.util.regex.Pattern;

 /**
  * Context generator that serializes three groups of features for a target
  * word: the POS tags found in a window around it, local collocations (pairs
  * of tokens) located near it, and one binary indicator per entry of a unigram
  * model stating whether that entry occurs among the cleaned lemmas of the
  * sentence.
  */
 public class IMSWSDContextGenerator implements WSDContextGenerator {

   /** Placeholder emitted for window positions that have no POS tag. */
   private static final String NO_TAG = "null";

   /**
    * Matches every character that is neither a lower-case ASCII letter nor an
    * underscore. Compiled once because the expression never changes.
    */
   private static final Pattern NON_WORD_CHARS = Pattern.compile("[^a-z_]");

   /**
    * Collects the lower-cased POS tags in the window
    * {@code [index - windowSize, index + windowSize]}, target position included.
    *
    * @param index      The index of the word to disambiguate
    * @param tags       The POS-tags of the sentence, may be {@code null}
    * @param windowSize The number of positions taken on each side of the target
    * @return An array of {@code 2 * windowSize + 1} entries; positions outside
    *         the sentence (or all positions when {@code tags} is {@code null})
    *         hold the string {@code "null"}
    */
   private String[] extractPosOfSurroundingWords(int index, String[] tags,
     int windowSize) {

     String[] windowTags = new String[2 * windowSize + 1];

     // Iterate over the slots of the result so that every one of them is
     // filled, including the last one (position index + windowSize).
     for (int slot = 0; slot < windowTags.length; slot++) {
       int position = index - windowSize + slot;
       if (tags == null || position < 0 || position >= tags.length) {
         windowTags[slot] = NO_TAG;
       } else {
         windowTags[slot] = tags[position].toLowerCase();
       }
     }

     return windowTags;
   }

   /**
    * Collects the cleaned lemmas of all words of the sentence except the target
    * word and the stop words.
    * <p>
    * A lemma is lower-cased and stripped of every character other than
    * {@code a-z} and {@code _}; it is kept only if more than one character
    * remains.
    *
    * @param index      The index of the word to disambiguate
    * @param toks       The tokens of the sentence / context
    * @param lemmas     The lemmas of the sentence / context, may be {@code null}
    * @param windowSize The context window (currently not taken into account)
    * @return The surrounding context words, empty if {@code lemmas} is
    *         {@code null}
    */
   public String[] extractSurroundingContext(int index, String[] toks, String[] lemmas, int windowSize) {

     // TODO consider the windowSize
     if (lemmas == null) {
       // Without lemmas there is nothing to collect.
       return new String[0];
     }

     List<String> contextWords = new ArrayList<>();

     for (int i = 0; i < toks.length; i++) {
       if (i == index || WSDHelper.STOP_WORDS.contains(toks[i].toLowerCase())) {
         continue;
       }

       String lemma = NON_WORD_CHARS.matcher(lemmas[i].toLowerCase()).replaceAll("").trim();

       if (lemma.length() > 1) {
         contextWords.add(lemma);
       }
     }

     return contextWords.toArray(new String[0]);
   }

   /**
    * Builds the local collocations around the target word: pairs of tokens that
    * are either adjacent or separated by exactly one token. A pair never starts
    * at the target position and its second token is never the target itself.
    *
    * @param index    The index of the word to disambiguate
    * @param sentence The tokens of the sentence / context
    * @param ngram    The range around the target in which pairs are collected
    * @return The collocations, each one as {@code "first second"}
    */
   private String[] extractLocalCollocations(int index, String[] sentence, int ngram) {
     /*
      * Here the author used only 11 features of this type. The range was set to
      * 3 (bigrams extracted in a way that they are at max separated by 1 word).
      */

     List<String> localCollocations = new ArrayList<>();

     final int upperBound = index + ngram;

     for (int i = index - ngram; i <= upperBound; i++) {

       // The first token of a pair must exist, must have a right neighbour and
       // must not be the target word.
       if (i < 0 || i > sentence.length - 2 || i == index) {
         continue;
       }

       if ((i + 1 != index) && (i + 1 < upperBound)) {
         localCollocations.add(sentence[i] + " " + sentence[i + 1]);
       }

       // The token two positions ahead is not covered by the guard above, so it
       // needs its own bounds check (relevant when the target is close to the
       // end of the sentence).
       if ((i + 2 != index) && (i + 2 < upperBound) && (i + 2 < sentence.length)) {
         localCollocations.add(sentence[i] + " " + sentence[i + 2]);
       }
     }

     return localCollocations.toArray(new String[0]);
   }

   /**
    * Get Context of a word To disambiguate
    * <p>
    * The result lists, in this order, the POS tags of the surrounding words,
    * the local collocations and, for every entry of {@code model}, {@code 1} or
    * {@code 0} depending on whether the entry is one of the surrounding context
    * words. Every feature is serialized as {@code F<position>=<value>}.
    *
    * @param index      The index of the word to disambiguate
    * @param tokens     The tokens of the sentence / context
    * @param tags       The POS-tags of the sentence / context; if {@code null},
    *                   every POS feature has the value {@code "null"}
    * @param lemmas     The lemmas of the sentence / context
    * @param ngram      The ngram to consider for context
    * @param windowSize The context window
    * @param model      The list of unigrams
    * @return The IMS context of the word to disambiguate
    */
   @Override
   public String[] getContext(int index, String[] tokens,
     String[] tags, String[] lemmas, int ngram, int windowSize, List<String> model) {

     // POS features are derived from the tags, not from the tokens.
     String[] posOfSurroundingWords = extractPosOfSurroundingWords(index, tags,
       windowSize);

     // A set gives constant-time lookups while scanning the model below.
     HashSet<String> surroundingWords = new HashSet<>(Arrays
             .asList(extractSurroundingContext(index, tokens, lemmas, windowSize)));

     String[] localCollocations = extractLocalCollocations(index, tokens, ngram);

     String[] serializedFeatures = new String[posOfSurroundingWords.length
       + localCollocations.length + model.size()];

     int i = 0;

     for (String feature : posOfSurroundingWords) {
       serializedFeatures[i] = "F" + i + "=" + feature;
       i++;
     }

     for (String feature : localCollocations) {
       serializedFeatures[i] = "F" + i + "=" + feature;
       i++;
     }

     for (String word : model) {
       serializedFeatures[i] = "F" + i + (surroundingWords.contains(word) ? "=1" : "=0");
       i++;
     }

     return serializedFeatures;
   }

   /**
    * Get Context of a word To disambiguate
    * <p>
    * Reads the target position, sentence, tags and lemmas from the sample and
    * delegates to
    * {@link #getContext(int, String[], String[], String[], int, int, List)}.
    *
    * @param sample     The sample of the word to disambiguate
    * @param ngram      The ngram to consider for context
    * @param windowSize The context window
    * @param model      The list of unigrams
    * @return The IMS context of the word to disambiguate
    */
   @Override
   public String[] getContext(WSDSample sample, int ngram,
     int windowSize, List<String> model) {
     return getContext(sample.getTargetPosition(), sample.getSentence(),
       sample.getTags(), sample.getLemmas(), ngram, windowSize, model);
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package opennlp.tools.disambiguator;

	import java.util.ArrayList;
	import java.util.Arrays;
	import java.util.HashSet;
	import java.util.List;
	import java.util.regex.Pattern;

	public class IMSWSDContextGenerator implements WSDContextGenerator {

	private String[] extractPosOfSurroundingWords(int index, String[] tags,
	int windowSize) {

	String[] windowTags = new String[2 * windowSize + 1];

	int j = 0;

	for (int i = index - windowSize; i < index + windowSize; i++) {
	if (i < 0 \|\| i >= tags.length) {
	windowTags[j] = "null";
	} else {
	windowTags[j] = tags[i].toLowerCase();
	}
	j++;
	}

	return windowTags;
	}

	public String[] extractSurroundingContext(int index, String[] toks, String[] lemmas, int windowSize) {

	// TODO consider the windowSize
	List<String> contextWords = new ArrayList<>();

	final Pattern pattern = Pattern.compile("[^a-z_]");

	for (int i = 0; i < toks.length; i++) {
	if (lemmas != null) {
	if (!WSDHelper.STOP_WORDS.contains(toks[i].toLowerCase()) && (index != i)) {

	String lemma = lemmas[i].toLowerCase();
	lemma = pattern.matcher(lemma).replaceAll("").trim();

	if (lemma.length() > 1) {
	contextWords.add(lemma);
	}

	}
	}
	}

	return contextWords.toArray(new String[0]);
	}

	private String[] extractLocalCollocations(int index, String[] sentence, int ngram) {
	/*
	* Here the author used only 11 features of this type. the range was set to
	* 3 (bigrams extracted in a way that they are at max separated by 1 word).
	*/

	ArrayList<String> localCollocations = new ArrayList<>();

	for (int i = index - ngram; i <= index + ngram; i++) {

	if (!(i < 0 \|\| i > sentence.length - 2)) {
	if ((i != index) && (i + 1 != index) && (i + 1 < index + ngram)) {
	String lc = sentence[i] + " " + sentence[i + 1];
	localCollocations.add(lc);
	}
	if ((i != index) && (i + 2 != index) && (i + 2 < index + ngram)) {
	String lc = sentence[i] + " " + sentence[i + 2];
	localCollocations.add(lc);
	}
	}

	}
	String[] res;
	res = localCollocations.toArray(new String[0]);

	return res;
	}

	/**
	* Get Context of a word To disambiguate
	*
	* @param index The index of the word to disambiguate
	* @param tokens The tokens of the sentence / context
	* @param tags The POS-tags of the sentence / context
	* @param lemmas The lemmas of the sentence / context
	* @param ngram The ngram to consider for context
	* @param windowSize The context window
	* @param model The list of unigrams
	* @return The IMS context of the word to disambiguate
	*/
	@Override
	public String[] getContext(int index, String[] tokens,
	String[] tags, String[] lemmas, int ngram, int windowSize, List<String> model) {

	String[] posOfSurroundingWords = extractPosOfSurroundingWords(index, tokens,
	windowSize);

	HashSet<String> surroundingWords = new HashSet<>(Arrays
	.asList(extractSurroundingContext(index, tokens, lemmas, windowSize)));

	String[] localCollocations = extractLocalCollocations(index, tokens, ngram);

	String[] serializedFeatures = new String[posOfSurroundingWords.length
	+ localCollocations.length + model.size()];

	int i = 0;

	for (String feature : posOfSurroundingWords) {
	serializedFeatures[i] = "F" + i + "=" + feature;
	i++;
	}

	for (String feature : localCollocations) {
	serializedFeatures[i] = "F" + i + "=" + feature;
	i++;
	}
	for (String word : model) {
	if (surroundingWords.contains(word.toString())) {
	serializedFeatures[i] = "F" + i + "=1";
	} else {
	serializedFeatures[i] = "F" + i + "=0";
	}
	i++;
	}
	return serializedFeatures;
	}

	/**
	* Get Context of a word To disambiguate
	*
	* @param sample The sample of the word to disambiguate
	* @param ngram The ngram to consider for context
	* @param windowSize The context window
	* @param model The list of unigrams
	* @return The IMS context of the word to disambiguate
	*/
	@Override
	public String[] getContext(WSDSample sample, int ngram,
	int windowSize, List<String> model) {
	return getContext(sample.getTargetPosition(), sample.getSentence(),
	sample.getTags(), sample.getLemmas(), ngram, windowSize, model);
	}
	}