opennlp-wsd/src/main/java/opennlp/tools/disambiguator/DataExtractor.java - opennlp-sandbox - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */

 package opennlp.tools.disambiguator;

 import java.io.BufferedReader;
 import java.io.File;
 import java.io.FileNotFoundException;
 import java.io.FileReader;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.HashMap;

 import javax.xml.parsers.DocumentBuilder;
 import javax.xml.parsers.DocumentBuilderFactory;

 import org.w3c.dom.Document;
 import org.w3c.dom.Element;
 import org.w3c.dom.Node;
 import org.w3c.dom.NodeList;

 import opennlp.tools.disambiguator.DictionaryInstance;
 import opennlp.tools.disambiguator.ims.WTDIMS;

 /**
  * Extracts data from the different resource files used by the disambiguator
  * (sense dictionaries, sense maps, training/test sets and the lemmatizer word
  * list).
  */
 public class DataExtractor {

   /**
    * Fallback location of the lemmatizer dictionary, used by
    * {@link #getEnglishWords(String)} when the caller supplies no usable path.
    * Written with forward slashes so that it resolves on every platform.
    */
   private static final String englishDict = "src/test/resources/models/en-lemmatizer.dict";

   /**
    * Constructor
    */
   public DataExtractor() {
     super();
   }

   /**
    * Reads every sense found below the {@code lexelt} elements of a dictionary
    * file.
    * <p>
    * Extraction is best effort: when the file cannot be read or parsed, the
    * stack trace is printed and the senses collected so far are returned.
    *
    * @param xmlLocation
    *          location of the XML file containing the dictionary instances
    * @return the senses in document order, each created with a running index
    *         that starts at 0
    */
   private ArrayList<DictionaryInstance> extractDictionary(String xmlLocation) {

     ArrayList<DictionaryInstance> dictionary = new ArrayList<>();

     try {
       DocumentBuilder dBuilder = DocumentBuilderFactory.newInstance()
           .newDocumentBuilder();
       Document doc = dBuilder.parse(new File(xmlLocation));
       doc.getDocumentElement().normalize();

       NodeList nLexelts = doc.getElementsByTagName("lexelt");

       // running index over the senses of all words; used instead of the ID
       // found in the file
       int index = 0;

       for (int i = 0; i < nLexelts.getLength(); i++) {
         Node nLexelt = nLexelts.item(i);

         // check the node type before casting, not after
         if (nLexelt.getNodeType() != Node.ELEMENT_NODE) {
           continue;
         }

         Element eLexelt = (Element) nLexelt;
         String word = eLexelt.getAttribute("item");
         NodeList nSenses = eLexelt.getChildNodes();

         for (int j = 0; j < nSenses.getLength(); j++) {
           Node nSense = nSenses.item(j);

           // skip whitespace text nodes, comments, etc.
           if (nSense.getNodeType() != Node.ELEMENT_NODE) {
             continue;
           }

           Element eSense = (Element) nSense;
           String id = eSense.getAttribute("id");
           String source = eSense.getAttribute("source");
           String[] synset = eSense.getAttribute("synset").split("\\s");
           String gloss = eSense.getAttribute("gloss");

           dictionary.add(new DictionaryInstance(index, word, id, source,
               synset, gloss));
           index++;
         }
       }
     } catch (Exception e) {
       // deliberately broad: a broken file must not crash the caller
       e.printStackTrace();
     }

     return dictionary;
   }

   /**
    * Reads the sense map file. Every line of the file becomes one entry that is
    * keyed by its zero-based line number and holds the whitespace-separated
    * tokens of that line which are longer than one character.
    * <p>
    * When the file cannot be read, the stack trace is printed and the entries
    * collected so far are returned.
    *
    * @param sensemapFile
    *          location of the file containing the equivalent senses
    * @return the groups of equivalent sense IDs, keyed by line number
    */
   private HashMap<Integer, ArrayList<String>> getEquivalentSense(
       String sensemapFile) {

     HashMap<Integer, ArrayList<String>> mappedSenses = new HashMap<>();

     try (BufferedReader wordsList = new BufferedReader(new FileReader(
         sensemapFile))) {

       int index = 0;
       String line;

       while ((line = wordsList.readLine()) != null) {

         ArrayList<String> tempSenses = new ArrayList<>();

         for (String sense : line.split("\\s")) {
           // tokens of zero or one character are not kept
           if (sense.length() > 1) {
             tempSenses.add(sense);
           }
         }

         mappedSenses.put(index, tempSenses);
         index++;
       }

     } catch (IOException e) {
       e.printStackTrace();
     }

     return mappedSenses;
   }

   /**
    * Groups the dictionary senses according to the sense map: the senses whose
    * IDs appear on the same line of the sense map end up in the same list.
    * <p>
    * The key of a group is {@code <word>_<line number>}, where the word is taken
    * from the last sense of the line that was found in the dictionary. Lines for
    * which no sense is found in the dictionary produce no entry.
    *
    * @param xmlLocation
    *          location of the file containing the dictionary instances
    * @param sensemapFile
    *          location of the file containing the equivalent senses
    * @return the groups of equivalent senses
    */
   private HashMap<String, ArrayList<DictionaryInstance>> extractCoarseGrainedDictionary(
       String xmlLocation, String sensemapFile) {

     HashMap<String, ArrayList<DictionaryInstance>> optimizedDictionary = new HashMap<>();

     HashMap<Integer, ArrayList<String>> equivalentSenses = getEquivalentSense(sensemapFile);

     ArrayList<DictionaryInstance> dictionary = extractDictionary(xmlLocation);

     // Index the senses by ID once instead of scanning the whole dictionary for
     // every sense ID. The first instance with a given ID wins, exactly as a
     // linear scan that stops at the first match would behave.
     HashMap<String, DictionaryInstance> sensesById = new HashMap<>();
     for (DictionaryInstance instance : dictionary) {
       if (!sensesById.containsKey(instance.getId())) {
         sensesById.put(instance.getId(), instance);
       }
     }

     for (Integer mapKey : equivalentSenses.keySet()) {
       ArrayList<DictionaryInstance> group = new ArrayList<>();
       String word = "";

       for (String senseId : equivalentSenses.get(mapKey)) {
         DictionaryInstance instance = sensesById.get(senseId);
         if (instance != null) {
           group.add(instance);
           word = instance.getWord() + "_" + mapKey;
         }
       }

       // a line without any known sense would otherwise be stored as an empty
       // list under the meaningless key ""
       if (!group.isEmpty()) {
         optimizedDictionary.put(word, group);
       }
     }

     return optimizedDictionary;
   }

   /**
    * Extract the different senses (those which are equivalent are put together)
    * of a word.
    * <p>
    * The returned keys have the form {@code <wordTag>_<n>}, where {@code n}
    * counts the matching groups from 0 in the iteration order of the
    * intermediate map.
    *
    * @param xmlLocation
    *          : location of the file containing the dictionary instances
    * @param sensemapFile
    *          : location of the file containing the equivalent senses in the
    *          case of Coarse-grained disambiguation
    * @param wordTag
    *          : the word to disambiguate. It should be written in the format
    *          "word.p" (e.g., "write.v", "well.r", "smart.a", "go.v")
    * @return a {@link HashMap} of {@link DictionaryInstance} with their IDs
    */
   public HashMap<String, ArrayList<DictionaryInstance>> extractWordSenses(
       String xmlLocation, String sensemapFile, String wordTag) {

     HashMap<String, ArrayList<DictionaryInstance>> wordSenses = new HashMap<>();

     HashMap<String, ArrayList<DictionaryInstance>> optimalDictionary = extractCoarseGrainedDictionary(
         xmlLocation, sensemapFile);

     int i = 0;
     for (String key : optimalDictionary.keySet()) {
       if (key.startsWith(wordTag)) {
         wordSenses.put(wordTag + "_" + i, optimalDictionary.get(key));
         i++;
       }
     }

     return wordSenses;
   }

   /**
    * Extract the different senses. This method returns only the ID of the sense
    * and the gloss. The synsets and other information are omitted.
    * <p>
    * The gloss of a group is the gloss of its first sense; groups without any
    * sense are left out.
    *
    * @param xmlLocation
    *          : location of the file containing the dictionary instances
    * @param sensemapFile
    *          : location of the file containing the equivalent senses in the
    *          case of Coarse-grained disambiguation
    * @param wordTag
    *          the word to disambiguate. It should be written in the format
    *          "word.p" (e.g., "write.v", "well.r", "smart.a", "go.v")
    * @return a {@link HashMap} of word senses with their IDs
    */
   public HashMap<String, String> getDictionaryInstance(String xmlLocation,
       String sensemapFile, String wordTag) {

     HashMap<String, ArrayList<DictionaryInstance>> dict = extractWordSenses(
         xmlLocation, sensemapFile, wordTag);

     HashMap<String, String> senses = new HashMap<>();

     for (String key : dict.keySet()) {
       ArrayList<DictionaryInstance> group = dict.get(key);
       // get(0) on an empty group would throw
       if (group != null && !group.isEmpty()) {
         senses.put(key, group.get(0).getGloss());
       }
     }

     return senses;
   }

   /**
    * Extract the training instances from the training/test set file.
    * <p>
    * Extraction is best effort: when the file cannot be read or parsed, the
    * stack trace is printed and the instances collected so far are returned.
    *
    * @param xmlDataSet
    *          : the file from which the data are to be extracted
    * @return {@link ArrayList} of Word To Disambiguate (WTDIMS) instances
    */
   public ArrayList<WTDIMS> extractWSDInstances(String xmlDataSet) {

     ArrayList<WTDIMS> setInstances = new ArrayList<>();

     try {
       DocumentBuilder dBuilder = DocumentBuilderFactory.newInstance()
           .newDocumentBuilder();
       Document doc = dBuilder.parse(new File(xmlDataSet));
       doc.getDocumentElement().normalize();

       NodeList lexelts = doc.getElementsByTagName("lexelt");

       for (int i = 0; i < lexelts.getLength(); i++) {
         Node nLexelt = lexelts.item(i);

         if (nLexelt.getNodeType() != Node.ELEMENT_NODE) {
           continue;
         }

         // the item has the form "word.p"; only the word is needed here
         String item = ((Element) nLexelt).getAttribute("item");
         int dot = item.indexOf('.');
         String word = dot < 0 ? item : item.substring(0, dot);

         NodeList nInstances = nLexelt.getChildNodes();

         // start at 0: the first child is only a whitespace text node when the
         // file happens to be pretty-printed, and the type check skips it anyway
         for (int j = 0; j < nInstances.getLength(); j++) {
           Node nInstance = nInstances.item(j);

           if (nInstance.getNodeType() == Node.ELEMENT_NODE) {
             setInstances.add(readInstance((Element) nInstance, word));
           }
         }
       }

     } catch (Exception e) {
       // deliberately broad: a broken file must not crash the caller
       e.printStackTrace();
     }

     return setInstances;
   }

   /**
    * Builds one instance from the "answer" and "context" children of an
    * instance element.
    */
   private WTDIMS readInstance(Element eInstance, String word) {

     ArrayList<String> answers = new ArrayList<>();
     String sentence = "";
     String rawWord = "";

     NodeList nChildren = eInstance.getChildNodes();

     for (int k = 0; k < nChildren.getLength(); k++) {
       Node nChild = nChildren.item(k);
       String name = nChild.getNodeName();

       if (name.equals("answer")) {
         String senseId = readSenseId(nChild);
         if (senseId != null) {
           answers.add(senseId);
         }
       } else if (name.equals("context")) {
         sentence = nChild.getTextContent();

         String found = readRawWord(nChild);
         if (found != null) {
           rawWord = found;
         }
       }
     }

     return new WTDIMS(word, answers, sentence, rawWord);
   }

   /**
    * Returns the sense ID of an answer node, or {@code null} when it has none.
    * The attribute is looked up by name because the DOM does not guarantee any
    * order of attributes; the positional lookup is only kept as a fallback.
    */
   private String readSenseId(Node nAnswer) {

     if (nAnswer.getNodeType() == Node.ELEMENT_NODE
         && ((Element) nAnswer).hasAttribute("senseid")) {
       return ((Element) nAnswer).getAttribute("senseid");
     }

     if (nAnswer.getAttributes() != null
         && nAnswer.getAttributes().getLength() > 1) {
       return nAnswer.getAttributes().item(1).getTextContent();
     }

     return null;
   }

   /**
    * Returns the text of the first element nested in a context node, i.e. the
    * marked-up word between the text before and the text after it. Falls back
    * to the second child when the context holds no element, and to {@code null}
    * when there is no such child either.
    */
   private String readRawWord(Node nContext) {

     NodeList parts = nContext.getChildNodes();

     for (int i = 0; i < parts.getLength(); i++) {
       if (parts.item(i).getNodeType() == Node.ELEMENT_NODE) {
         return parts.item(i).getTextContent();
       }
     }

     Node second = parts.item(1);
     return second == null ? null : second.getTextContent();
   }

   /**
    * Extract the list of ALL English words.
    * <p>
    * The word of a line is everything before its first tab. All lines are read,
    * including the first one; empty words are skipped. When the file cannot be
    * read, the stack trace is printed and the words collected so far are
    * returned.
    *
    * @param dict
    *          : this file is the same that is used in the simple lemmatizer
    *          (i.e.,"en-lemmatizer.dict"). When it is {@code null}, empty or
    *          does not point to an existing file, the built-in default location
    *          is used instead
    *
    * @return a map whose keys are all the English words (every value is
    *         {@code null}); empty when no dictionary file is available
    */
   public HashMap<String, Object> getEnglishWords(String dict) {

     HashMap<String, Object> words = new HashMap<>();

     // honour the requested file, fall back to the default location otherwise
     File file = null;
     if (dict != null && !dict.isEmpty()) {
       File requested = new File(dict);
       if (requested.isFile()) {
         file = requested;
       }
     }
     if (file == null) {
       file = new File(englishDict);
     }

     if (!file.isFile()) {
       return words;
     }

     try (BufferedReader br = new BufferedReader(new FileReader(file))) {
       String line;
       while ((line = br.readLine()) != null) {
         // indexOf instead of split: a line made of tabs only splits into an
         // empty array
         int tab = line.indexOf('\t');
         String word = tab < 0 ? line : line.substring(0, tab);
         if (!word.isEmpty()) {
           words.put(word, null);
         }
       }
     } catch (IOException e) {
       e.printStackTrace();
     }

     return words;
   }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/

	package opennlp.tools.disambiguator;

	import java.io.BufferedReader;
	import java.io.File;
	import java.io.FileNotFoundException;
	import java.io.FileReader;
	import java.io.IOException;
	import java.util.ArrayList;
	import java.util.HashMap;

	import javax.xml.parsers.DocumentBuilder;
	import javax.xml.parsers.DocumentBuilderFactory;

	import org.w3c.dom.Document;
	import org.w3c.dom.Element;
	import org.w3c.dom.Node;
	import org.w3c.dom.NodeList;

	import opennlp.tools.disambiguator.DictionaryInstance;
	import opennlp.tools.disambiguator.ims.WTDIMS;

	/**
	 * Extracts data from the different resource files used by the disambiguator
	 * (sense dictionaries, sense maps, training/test sets and the lemmatizer word
	 * list).
	 */
	public class DataExtractor {

	  /**
	   * Fallback location of the lemmatizer dictionary, used by
	   * {@link #getEnglishWords(String)} when the caller supplies no usable path.
	   * Written with forward slashes so that it resolves on every platform.
	   */
	  private static final String englishDict = "src/test/resources/models/en-lemmatizer.dict";

	  /**
	   * Constructor
	   */
	  public DataExtractor() {
	    super();
	  }

	  /**
	   * Reads every sense found below the {@code lexelt} elements of a dictionary
	   * file.
	   * <p>
	   * Extraction is best effort: when the file cannot be read or parsed, the
	   * stack trace is printed and the senses collected so far are returned.
	   *
	   * @param xmlLocation
	   *          location of the XML file containing the dictionary instances
	   * @return the senses in document order, each created with a running index
	   *         that starts at 0
	   */
	  private ArrayList<DictionaryInstance> extractDictionary(String xmlLocation) {

	    ArrayList<DictionaryInstance> dictionary = new ArrayList<>();

	    try {
	      DocumentBuilder dBuilder = DocumentBuilderFactory.newInstance()
	          .newDocumentBuilder();
	      Document doc = dBuilder.parse(new File(xmlLocation));
	      doc.getDocumentElement().normalize();

	      NodeList nLexelts = doc.getElementsByTagName("lexelt");

	      // running index over the senses of all words; used instead of the ID
	      // found in the file
	      int index = 0;

	      for (int i = 0; i < nLexelts.getLength(); i++) {
	        Node nLexelt = nLexelts.item(i);

	        // check the node type before casting, not after
	        if (nLexelt.getNodeType() != Node.ELEMENT_NODE) {
	          continue;
	        }

	        Element eLexelt = (Element) nLexelt;
	        String word = eLexelt.getAttribute("item");
	        NodeList nSenses = eLexelt.getChildNodes();

	        for (int j = 0; j < nSenses.getLength(); j++) {
	          Node nSense = nSenses.item(j);

	          // skip whitespace text nodes, comments, etc.
	          if (nSense.getNodeType() != Node.ELEMENT_NODE) {
	            continue;
	          }

	          Element eSense = (Element) nSense;
	          String id = eSense.getAttribute("id");
	          String source = eSense.getAttribute("source");
	          String[] synset = eSense.getAttribute("synset").split("\\s");
	          String gloss = eSense.getAttribute("gloss");

	          dictionary.add(new DictionaryInstance(index, word, id, source,
	              synset, gloss));
	          index++;
	        }
	      }
	    } catch (Exception e) {
	      // deliberately broad: a broken file must not crash the caller
	      e.printStackTrace();
	    }

	    return dictionary;
	  }

	  /**
	   * Reads the sense map file. Every line of the file becomes one entry that is
	   * keyed by its zero-based line number and holds the whitespace-separated
	   * tokens of that line which are longer than one character.
	   * <p>
	   * When the file cannot be read, the stack trace is printed and the entries
	   * collected so far are returned.
	   *
	   * @param sensemapFile
	   *          location of the file containing the equivalent senses
	   * @return the groups of equivalent sense IDs, keyed by line number
	   */
	  private HashMap<Integer, ArrayList<String>> getEquivalentSense(
	      String sensemapFile) {

	    HashMap<Integer, ArrayList<String>> mappedSenses = new HashMap<>();

	    try (BufferedReader wordsList = new BufferedReader(new FileReader(
	        sensemapFile))) {

	      int index = 0;
	      String line;

	      while ((line = wordsList.readLine()) != null) {

	        ArrayList<String> tempSenses = new ArrayList<>();

	        for (String sense : line.split("\\s")) {
	          // tokens of zero or one character are not kept
	          if (sense.length() > 1) {
	            tempSenses.add(sense);
	          }
	        }

	        mappedSenses.put(index, tempSenses);
	        index++;
	      }

	    } catch (IOException e) {
	      e.printStackTrace();
	    }

	    return mappedSenses;
	  }

	  /**
	   * Groups the dictionary senses according to the sense map: the senses whose
	   * IDs appear on the same line of the sense map end up in the same list.
	   * <p>
	   * The key of a group is {@code <word>_<line number>}, where the word is taken
	   * from the last sense of the line that was found in the dictionary. Lines for
	   * which no sense is found in the dictionary produce no entry.
	   *
	   * @param xmlLocation
	   *          location of the file containing the dictionary instances
	   * @param sensemapFile
	   *          location of the file containing the equivalent senses
	   * @return the groups of equivalent senses
	   */
	  private HashMap<String, ArrayList<DictionaryInstance>> extractCoarseGrainedDictionary(
	      String xmlLocation, String sensemapFile) {

	    HashMap<String, ArrayList<DictionaryInstance>> optimizedDictionary = new HashMap<>();

	    HashMap<Integer, ArrayList<String>> equivalentSenses = getEquivalentSense(sensemapFile);

	    ArrayList<DictionaryInstance> dictionary = extractDictionary(xmlLocation);

	    // Index the senses by ID once instead of scanning the whole dictionary for
	    // every sense ID. The first instance with a given ID wins, exactly as a
	    // linear scan that stops at the first match would behave.
	    HashMap<String, DictionaryInstance> sensesById = new HashMap<>();
	    for (DictionaryInstance instance : dictionary) {
	      if (!sensesById.containsKey(instance.getId())) {
	        sensesById.put(instance.getId(), instance);
	      }
	    }

	    for (Integer mapKey : equivalentSenses.keySet()) {
	      ArrayList<DictionaryInstance> group = new ArrayList<>();
	      String word = "";

	      for (String senseId : equivalentSenses.get(mapKey)) {
	        DictionaryInstance instance = sensesById.get(senseId);
	        if (instance != null) {
	          group.add(instance);
	          word = instance.getWord() + "_" + mapKey;
	        }
	      }

	      // a line without any known sense would otherwise be stored as an empty
	      // list under the meaningless key ""
	      if (!group.isEmpty()) {
	        optimizedDictionary.put(word, group);
	      }
	    }

	    return optimizedDictionary;
	  }

	  /**
	   * Extract the different senses (those which are equivalent are put together)
	   * of a word.
	   * <p>
	   * The returned keys have the form {@code <wordTag>_<n>}, where {@code n}
	   * counts the matching groups from 0 in the iteration order of the
	   * intermediate map.
	   *
	   * @param xmlLocation
	   *          : location of the file containing the dictionary instances
	   * @param sensemapFile
	   *          : location of the file containing the equivalent senses in the
	   *          case of Coarse-grained disambiguation
	   * @param wordTag
	   *          : the word to disambiguate. It should be written in the format
	   *          "word.p" (e.g., "write.v", "well.r", "smart.a", "go.v")
	   * @return a {@link HashMap} of {@link DictionaryInstance} with their IDs
	   */
	  public HashMap<String, ArrayList<DictionaryInstance>> extractWordSenses(
	      String xmlLocation, String sensemapFile, String wordTag) {

	    HashMap<String, ArrayList<DictionaryInstance>> wordSenses = new HashMap<>();

	    HashMap<String, ArrayList<DictionaryInstance>> optimalDictionary = extractCoarseGrainedDictionary(
	        xmlLocation, sensemapFile);

	    int i = 0;
	    for (String key : optimalDictionary.keySet()) {
	      if (key.startsWith(wordTag)) {
	        wordSenses.put(wordTag + "_" + i, optimalDictionary.get(key));
	        i++;
	      }
	    }

	    return wordSenses;
	  }

	  /**
	   * Extract the different senses. This method returns only the ID of the sense
	   * and the gloss. The synsets and other information are omitted.
	   * <p>
	   * The gloss of a group is the gloss of its first sense; groups without any
	   * sense are left out.
	   *
	   * @param xmlLocation
	   *          : location of the file containing the dictionary instances
	   * @param sensemapFile
	   *          : location of the file containing the equivalent senses in the
	   *          case of Coarse-grained disambiguation
	   * @param wordTag
	   *          the word to disambiguate. It should be written in the format
	   *          "word.p" (e.g., "write.v", "well.r", "smart.a", "go.v")
	   * @return a {@link HashMap} of word senses with their IDs
	   */
	  public HashMap<String, String> getDictionaryInstance(String xmlLocation,
	      String sensemapFile, String wordTag) {

	    HashMap<String, ArrayList<DictionaryInstance>> dict = extractWordSenses(
	        xmlLocation, sensemapFile, wordTag);

	    HashMap<String, String> senses = new HashMap<>();

	    for (String key : dict.keySet()) {
	      ArrayList<DictionaryInstance> group = dict.get(key);
	      // get(0) on an empty group would throw
	      if (group != null && !group.isEmpty()) {
	        senses.put(key, group.get(0).getGloss());
	      }
	    }

	    return senses;
	  }

	  /**
	   * Extract the training instances from the training/test set file.
	   * <p>
	   * Extraction is best effort: when the file cannot be read or parsed, the
	   * stack trace is printed and the instances collected so far are returned.
	   *
	   * @param xmlDataSet
	   *          : the file from which the data are to be extracted
	   * @return {@link ArrayList} of Word To Disambiguate (WTDIMS) instances
	   */
	  public ArrayList<WTDIMS> extractWSDInstances(String xmlDataSet) {

	    ArrayList<WTDIMS> setInstances = new ArrayList<>();

	    try {
	      DocumentBuilder dBuilder = DocumentBuilderFactory.newInstance()
	          .newDocumentBuilder();
	      Document doc = dBuilder.parse(new File(xmlDataSet));
	      doc.getDocumentElement().normalize();

	      NodeList lexelts = doc.getElementsByTagName("lexelt");

	      for (int i = 0; i < lexelts.getLength(); i++) {
	        Node nLexelt = lexelts.item(i);

	        if (nLexelt.getNodeType() != Node.ELEMENT_NODE) {
	          continue;
	        }

	        // the item has the form "word.p"; only the word is needed here
	        String item = ((Element) nLexelt).getAttribute("item");
	        int dot = item.indexOf('.');
	        String word = dot < 0 ? item : item.substring(0, dot);

	        NodeList nInstances = nLexelt.getChildNodes();

	        // start at 0: the first child is only a whitespace text node when the
	        // file happens to be pretty-printed, and the type check skips it anyway
	        for (int j = 0; j < nInstances.getLength(); j++) {
	          Node nInstance = nInstances.item(j);

	          if (nInstance.getNodeType() == Node.ELEMENT_NODE) {
	            setInstances.add(readInstance((Element) nInstance, word));
	          }
	        }
	      }

	    } catch (Exception e) {
	      // deliberately broad: a broken file must not crash the caller
	      e.printStackTrace();
	    }

	    return setInstances;
	  }

	  /**
	   * Builds one instance from the "answer" and "context" children of an
	   * instance element.
	   */
	  private WTDIMS readInstance(Element eInstance, String word) {

	    ArrayList<String> answers = new ArrayList<>();
	    String sentence = "";
	    String rawWord = "";

	    NodeList nChildren = eInstance.getChildNodes();

	    for (int k = 0; k < nChildren.getLength(); k++) {
	      Node nChild = nChildren.item(k);
	      String name = nChild.getNodeName();

	      if (name.equals("answer")) {
	        String senseId = readSenseId(nChild);
	        if (senseId != null) {
	          answers.add(senseId);
	        }
	      } else if (name.equals("context")) {
	        sentence = nChild.getTextContent();

	        String found = readRawWord(nChild);
	        if (found != null) {
	          rawWord = found;
	        }
	      }
	    }

	    return new WTDIMS(word, answers, sentence, rawWord);
	  }

	  /**
	   * Returns the sense ID of an answer node, or {@code null} when it has none.
	   * The attribute is looked up by name because the DOM does not guarantee any
	   * order of attributes; the positional lookup is only kept as a fallback.
	   */
	  private String readSenseId(Node nAnswer) {

	    if (nAnswer.getNodeType() == Node.ELEMENT_NODE
	        && ((Element) nAnswer).hasAttribute("senseid")) {
	      return ((Element) nAnswer).getAttribute("senseid");
	    }

	    if (nAnswer.getAttributes() != null
	        && nAnswer.getAttributes().getLength() > 1) {
	      return nAnswer.getAttributes().item(1).getTextContent();
	    }

	    return null;
	  }

	  /**
	   * Returns the text of the first element nested in a context node, i.e. the
	   * marked-up word between the text before and the text after it. Falls back
	   * to the second child when the context holds no element, and to {@code null}
	   * when there is no such child either.
	   */
	  private String readRawWord(Node nContext) {

	    NodeList parts = nContext.getChildNodes();

	    for (int i = 0; i < parts.getLength(); i++) {
	      if (parts.item(i).getNodeType() == Node.ELEMENT_NODE) {
	        return parts.item(i).getTextContent();
	      }
	    }

	    Node second = parts.item(1);
	    return second == null ? null : second.getTextContent();
	  }

	  /**
	   * Extract the list of ALL English words.
	   * <p>
	   * The word of a line is everything before its first tab. All lines are read,
	   * including the first one; empty words are skipped. When the file cannot be
	   * read, the stack trace is printed and the words collected so far are
	   * returned.
	   *
	   * @param dict
	   *          : this file is the same that is used in the simple lemmatizer
	   *          (i.e.,"en-lemmatizer.dict"). When it is {@code null}, empty or
	   *          does not point to an existing file, the built-in default location
	   *          is used instead
	   *
	   * @return a map whose keys are all the English words (every value is
	   *         {@code null}); empty when no dictionary file is available
	   */
	  public HashMap<String, Object> getEnglishWords(String dict) {

	    HashMap<String, Object> words = new HashMap<>();

	    // honour the requested file, fall back to the default location otherwise
	    File file = null;
	    if (dict != null && !dict.isEmpty()) {
	      File requested = new File(dict);
	      if (requested.isFile()) {
	        file = requested;
	      }
	    }
	    if (file == null) {
	      file = new File(englishDict);
	    }

	    if (!file.isFile()) {
	      return words;
	    }

	    try (BufferedReader br = new BufferedReader(new FileReader(file))) {
	      String line;
	      while ((line = br.readLine()) != null) {
	        // indexOf instead of split: a line made of tabs only splits into an
	        // empty array
	        int tab = line.indexOf('\t');
	        String word = tab < 0 ? line : line.substring(0, tab);
	        if (!word.isEmpty()) {
	          words.put(word, null);
	        }
	      }
	    } catch (IOException e) {
	      e.printStackTrace();
	    }

	    return words;
	  }

	}