opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/SensevalReader.java - opennlp-sandbox - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */

 package opennlp.tools.disambiguator.datareader;

 import java.io.BufferedReader;
 import java.io.File;
 import java.io.FileReader;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.Collections;
 import java.util.Arrays;

 import javax.xml.parsers.DocumentBuilder;
 import javax.xml.parsers.DocumentBuilderFactory;

 import org.w3c.dom.Document;
 import org.w3c.dom.Element;
 import org.w3c.dom.Node;
 import org.w3c.dom.NodeList;

 import opennlp.tools.disambiguator.WSDHelper;
 import opennlp.tools.disambiguator.WSDSample;
 import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.ObjectStreamUtils;

 /**
  * This class handles the extraction of Senseval-3 data from the different files
  * (training data, dictionary instances, etc.)
  */
 public class SensevalReader {

   protected String sensevalDirectory = "src\\test\\resources\\senseval3\\";

   protected String data = sensevalDirectory + "EnglishLS.train";
   protected String sensemapFile = sensevalDirectory + "EnglishLS.sensemap";
   protected String wordList = sensevalDirectory + "EnglishLS.train.key";

   public String getSensevalDirectory() {
     return sensevalDirectory;
   }

   public void setSensevalDirectory(String sensevalDirectory) {
     this.sensevalDirectory = sensevalDirectory;

     this.data = sensevalDirectory + "EnglishLS.train";
     this.sensemapFile = sensevalDirectory + "EnglishLS.sensemap";
     this.wordList = sensevalDirectory + "EnglishLS.train.key";
   }

   public SensevalReader() {
     super();
   }

   /**
    * This extracts the equivalent senses. This serves in the case of the
    * coarse-grained disambiguation
    *
    * @param sensemapFile
    *          the file containing the equivalent senses, each set of equivalent
    *          senses per line
    * @return a {@link HashMap} conaining the new sense ID ({@link Integer}) and
    *         an {@link ArrayList} of the equivalent senses original IDs
    */
   public HashMap<Integer, ArrayList<String>> getEquivalentSense() {

     HashMap<Integer, ArrayList<String>> mappedSenses = new HashMap<Integer, ArrayList<String>>();

     try (BufferedReader wordsList = new BufferedReader(new FileReader(
         sensemapFile))) {

       int index = 0;

       String line;

       while ((line = wordsList.readLine()) != null) {

         String[] temp = line.split("\\s");

         ArrayList<String> tempSenses = new ArrayList<String>();

         for (String sense : temp) {
           if (sense.length() > 1) {
             tempSenses.add(sense);
           }
         }

         mappedSenses.put(index, tempSenses);
         index++;

       }

     } catch (IOException e) {
       e.printStackTrace();
     }

     return mappedSenses;

   }

   /**
    * This returns the list of words available in the Senseval data
    *
    * @return {@link ArrayList} of the words available on the current Senseval
    *         set
    */
   public ArrayList<String> getSensevalWords() {

     ArrayList<String> wordTags = new ArrayList<String>();

     try (BufferedReader br = new BufferedReader(new FileReader(wordList))) {

       String line;

       while ((line = br.readLine()) != null) {

         String word = line.split("\\s")[0];

         if (!wordTags.contains(word)) {
           wordTags.add(word);
         }

       }

     } catch (IOException e) {
       e.printStackTrace();
     }

     return wordTags;

   }

   /**
    * Main Senseval Reader: This checks if the data corresponding to the words to
    * disambiguate exist in the folder, and extract the {@link WSDSample}
    * instances
    *
    * @param wordTag
    *          The word, of which we are looking for the instances
    * @return the list of the {@link WSDSample} instances of the word to
    *         disambiguate
    */
   public ArrayList<WSDSample> getSensevalData(String wordTag) {

     ArrayList<WSDSample> setInstances = new ArrayList<WSDSample>();

     try {

       File xmlFile = new File(data);
       DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
       DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
       Document doc = dBuilder.parse(xmlFile);

       doc.getDocumentElement().normalize();

       NodeList lexelts = doc.getElementsByTagName("lexelt");

       for (int i = 0; i < lexelts.getLength(); i++) {

         Node nLexelt = lexelts.item(i);

         if (nLexelt.getNodeType() == Node.ELEMENT_NODE) {
           Element eLexelt = (Element) nLexelt;

           if (eLexelt.getAttribute("item").equals(wordTag)) {

             NodeList nInstances = nLexelt.getChildNodes();

             for (int j = 1; j < nInstances.getLength(); j++) {

               Node nInstance = nInstances.item(j);

               if (nInstance.getNodeType() == Node.ELEMENT_NODE) {
                 ArrayList<String> senseIDs = new ArrayList<String>();
                 String rawWord = "";
                 String[] finalText = null;
                 int index = 0;

                 NodeList nChildren = nInstance.getChildNodes();

                 for (int k = 1; k < nChildren.getLength(); k++) {
                   Node nChild = nChildren.item(k);

                   if (nChild.getNodeName().equals("answer")) {
                     // String answer =
                     // nChild.getAttributes().item(0).getTextContent();
                     String senseid = nChild.getAttributes().item(1)
                         .getTextContent();

                     String temp = senseid;
                     // String[] temp = { answer, senseid };
                     senseIDs.add(temp);
                   }

                   if (nChild.getNodeName().equals("context")) {

                     if (nChild.hasChildNodes()) {
                       String textBefore = nChild.getChildNodes().item(0)
                           .getTextContent();
                       rawWord = nChild.getChildNodes().item(1).getTextContent();
                       String textAfter = nChild.getChildNodes().item(2)
                           .getTextContent();

                       ArrayList<String> textBeforeTokenzed = new ArrayList<String>(
                           Arrays.asList(textBefore.split("\\s")));
                       ArrayList<String> textAfterTokenzed = new ArrayList<String>(
                           Arrays.asList(textAfter.split("\\s")));

                       textBeforeTokenzed.removeAll(Collections.singleton(null));
                       textBeforeTokenzed.removeAll(Collections.singleton(""));

                       textAfterTokenzed.removeAll(Collections.singleton(null));
                       textAfterTokenzed.removeAll(Collections.singleton(""));

                       finalText = new String[textBeforeTokenzed.size() + 1
                           + textAfterTokenzed.size()];

                       int l = 0;
                       for (String tempWord : textBeforeTokenzed) {
                         finalText[l] = tempWord;
                         l++;
                       }
                       index = l;
                       finalText[l] = rawWord.toLowerCase();
                       l++;
                       for (String tempWord : textAfterTokenzed) {
                         finalText[l] = tempWord;
                         l++;
                       }

                     }
                   }

                 }

                 String[] words = finalText;
                 String[] tags = WSDHelper.getTagger().tag(words);
                 String[] lemmas = new String[words.length];

                 for (int k = 0; k < words.length; k++) {
                   lemmas[k] = WSDHelper.getLemmatizer().lemmatize(words[k],
                       tags[k]);
                 }

                 WSDSample wtd = new WSDSample(words, tags, lemmas, index,
                     senseIDs);
                 setInstances.add(wtd);

               }
             }

           }

         }

       }

     } catch (Exception e) {
       e.printStackTrace();
     }

     return setInstances;

   }

   /**
    * Main Senseval Reader: This checks if the data corresponding to the words to
    * disambiguate exist in the folder, and extract the
    *
    * @param wordTag
    *          The word, of which we are looking for the instances
    * @return the stream of {@link WSDSample} of the word to disambiguate
    */
   public ObjectStream<WSDSample> getSemcorDataStream(String wordTag) {
     return ObjectStreamUtils.createObjectStream(getSensevalData(wordTag));
   }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/

	package opennlp.tools.disambiguator.datareader;

	import java.io.BufferedReader;
	import java.io.File;
	import java.io.FileReader;
	import java.io.IOException;
	import java.util.ArrayList;
	import java.util.HashMap;
	import java.util.Collections;
	import java.util.Arrays;

	import javax.xml.parsers.DocumentBuilder;
	import javax.xml.parsers.DocumentBuilderFactory;

	import org.w3c.dom.Document;
	import org.w3c.dom.Element;
	import org.w3c.dom.Node;
	import org.w3c.dom.NodeList;

	import opennlp.tools.disambiguator.WSDHelper;
	import opennlp.tools.disambiguator.WSDSample;
	import opennlp.tools.util.ObjectStream;
	import opennlp.tools.util.ObjectStreamUtils;

	/**
	* This class handles the extraction of Senseval-3 data from the different files
	* (training data, dictionary instances, etc.)
	*/
	public class SensevalReader {

	protected String sensevalDirectory = "src\\test\\resources\\senseval3\\";

	protected String data = sensevalDirectory + "EnglishLS.train";
	protected String sensemapFile = sensevalDirectory + "EnglishLS.sensemap";
	protected String wordList = sensevalDirectory + "EnglishLS.train.key";

	public String getSensevalDirectory() {
	return sensevalDirectory;
	}

	public void setSensevalDirectory(String sensevalDirectory) {
	this.sensevalDirectory = sensevalDirectory;

	this.data = sensevalDirectory + "EnglishLS.train";
	this.sensemapFile = sensevalDirectory + "EnglishLS.sensemap";
	this.wordList = sensevalDirectory + "EnglishLS.train.key";
	}

	public SensevalReader() {
	super();
	}

	/**
	* This extracts the equivalent senses. This serves in the case of the
	* coarse-grained disambiguation
	*
	* @param sensemapFile
	* the file containing the equivalent senses, each set of equivalent
	* senses per line
	* @return a {@link HashMap} conaining the new sense ID ({@link Integer}) and
	* an {@link ArrayList} of the equivalent senses original IDs
	*/
	public HashMap<Integer, ArrayList<String>> getEquivalentSense() {

	HashMap<Integer, ArrayList<String>> mappedSenses = new HashMap<Integer, ArrayList<String>>();

	try (BufferedReader wordsList = new BufferedReader(new FileReader(
	sensemapFile))) {

	int index = 0;

	String line;

	while ((line = wordsList.readLine()) != null) {

	String[] temp = line.split("\\s");

	ArrayList<String> tempSenses = new ArrayList<String>();

	for (String sense : temp) {
	if (sense.length() > 1) {
	tempSenses.add(sense);
	}
	}

	mappedSenses.put(index, tempSenses);
	index++;

	}

	} catch (IOException e) {
	e.printStackTrace();
	}

	return mappedSenses;

	}

	/**
	* This returns the list of words available in the Senseval data
	*
	* @return {@link ArrayList} of the words available on the current Senseval
	* set
	*/
	public ArrayList<String> getSensevalWords() {

	ArrayList<String> wordTags = new ArrayList<String>();

	try (BufferedReader br = new BufferedReader(new FileReader(wordList))) {

	String line;

	while ((line = br.readLine()) != null) {

	String word = line.split("\\s")[0];

	if (!wordTags.contains(word)) {
	wordTags.add(word);
	}

	}

	} catch (IOException e) {
	e.printStackTrace();
	}

	return wordTags;

	}

	/**
	* Main Senseval Reader: This checks if the data corresponding to the words to
	* disambiguate exist in the folder, and extract the {@link WSDSample}
	* instances
	*
	* @param wordTag
	* The word, of which we are looking for the instances
	* @return the list of the {@link WSDSample} instances of the word to
	* disambiguate
	*/
	public ArrayList<WSDSample> getSensevalData(String wordTag) {

	ArrayList<WSDSample> setInstances = new ArrayList<WSDSample>();

	try {

	File xmlFile = new File(data);
	DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
	DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
	Document doc = dBuilder.parse(xmlFile);

	doc.getDocumentElement().normalize();

	NodeList lexelts = doc.getElementsByTagName("lexelt");

	for (int i = 0; i < lexelts.getLength(); i++) {

	Node nLexelt = lexelts.item(i);

	if (nLexelt.getNodeType() == Node.ELEMENT_NODE) {
	Element eLexelt = (Element) nLexelt;

	if (eLexelt.getAttribute("item").equals(wordTag)) {

	NodeList nInstances = nLexelt.getChildNodes();

	for (int j = 1; j < nInstances.getLength(); j++) {

	Node nInstance = nInstances.item(j);

	if (nInstance.getNodeType() == Node.ELEMENT_NODE) {
	ArrayList<String> senseIDs = new ArrayList<String>();
	String rawWord = "";
	String[] finalText = null;
	int index = 0;

	NodeList nChildren = nInstance.getChildNodes();

	for (int k = 1; k < nChildren.getLength(); k++) {
	Node nChild = nChildren.item(k);

	if (nChild.getNodeName().equals("answer")) {
	// String answer =
	// nChild.getAttributes().item(0).getTextContent();
	String senseid = nChild.getAttributes().item(1)
	.getTextContent();

	String temp = senseid;
	// String[] temp = { answer, senseid };
	senseIDs.add(temp);
	}

	if (nChild.getNodeName().equals("context")) {

	if (nChild.hasChildNodes()) {
	String textBefore = nChild.getChildNodes().item(0)
	.getTextContent();
	rawWord = nChild.getChildNodes().item(1).getTextContent();
	String textAfter = nChild.getChildNodes().item(2)
	.getTextContent();

	ArrayList<String> textBeforeTokenzed = new ArrayList<String>(
	Arrays.asList(textBefore.split("\\s")));
	ArrayList<String> textAfterTokenzed = new ArrayList<String>(
	Arrays.asList(textAfter.split("\\s")));

	textBeforeTokenzed.removeAll(Collections.singleton(null));
	textBeforeTokenzed.removeAll(Collections.singleton(""));

	textAfterTokenzed.removeAll(Collections.singleton(null));
	textAfterTokenzed.removeAll(Collections.singleton(""));

	finalText = new String[textBeforeTokenzed.size() + 1
	+ textAfterTokenzed.size()];

	int l = 0;
	for (String tempWord : textBeforeTokenzed) {
	finalText[l] = tempWord;
	l++;
	}
	index = l;
	finalText[l] = rawWord.toLowerCase();
	l++;
	for (String tempWord : textAfterTokenzed) {
	finalText[l] = tempWord;
	l++;
	}

	}
	}

	}

	String[] words = finalText;
	String[] tags = WSDHelper.getTagger().tag(words);
	String[] lemmas = new String[words.length];

	for (int k = 0; k < words.length; k++) {
	lemmas[k] = WSDHelper.getLemmatizer().lemmatize(words[k],
	tags[k]);
	}

	WSDSample wtd = new WSDSample(words, tags, lemmas, index,
	senseIDs);
	setInstances.add(wtd);

	}
	}

	}

	}

	}

	} catch (Exception e) {
	e.printStackTrace();
	}

	return setInstances;

	}

	/**
	* Main Senseval Reader: This checks if the data corresponding to the words to
	* disambiguate exist in the folder, and extract the
	*
	* @param wordTag
	* The word, of which we are looking for the instances
	* @return the stream of {@link WSDSample} of the word to disambiguate
	*/
	public ObjectStream<WSDSample> getSemcorDataStream(String wordTag) {
	return ObjectStreamUtils.createObjectStream(getSensevalData(wordTag));
	}

	}