nlp-utils/src/test/java/org/apache/opennlp/utils/cfg/ProbabilisticContextFreeGrammarTest.java - opennlp-sandbox - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */
 package org.apache.opennlp.utils.cfg;

 import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.HashMap;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
 import org.junit.BeforeClass;
 import org.junit.Ignore;
 import org.junit.Test;

 import static org.junit.Assert.*;

 /**
  * Testcase for {@link org.apache.opennlp.utils.cfg.ProbabilisticContextFreeGrammar}
  */
 public class ProbabilisticContextFreeGrammarTest {

   private static LinkedList<String> nonTerminals;
   private static String startSymbol;
   private static LinkedList<String> terminals;
   private static Map<Rule, Double> rules;

   @BeforeClass
   public static void setUp() throws Exception {
     nonTerminals = new LinkedList<String>();
     nonTerminals.add("S");
     nonTerminals.add("NP");
     nonTerminals.add("VP");
     nonTerminals.add("DT");
     nonTerminals.add("Vi");
     nonTerminals.add("Vt");
     nonTerminals.add("NN");
     nonTerminals.add("IN");
     nonTerminals.add("NNP");
     nonTerminals.add("Adv");

     startSymbol = "S";

     terminals = new LinkedList<String>();
     terminals.add("works");
     terminals.add("saw");
     terminals.add("man");
     terminals.add("woman");
     terminals.add("dog");
     terminals.add("the");
     terminals.add("with");
     terminals.add("in");
     terminals.add("joe");
     terminals.add("john");
     terminals.add("sam");
     terminals.add("michael");
     terminals.add("michelle");
     terminals.add("scarlett");
     terminals.add("and");
     terminals.add("but");
     terminals.add("while");
     terminals.add("of");
     terminals.add("for");
     terminals.add("badly");
     terminals.add("nicely");

     rules = new HashMap<Rule, Double>();
     rules.put(new Rule("S", "NP", "VP"), 1d);
     rules.put(new Rule("VP", "Vi", "Adv"), 0.3);
     rules.put(new Rule("VP", "Vt", "NP"), 0.7);
     rules.put(new Rule("NP", "DT", "NN"), 1d);
     rules.put(new Rule("Vi", "works"), 1d);
     rules.put(new Rule("Vt", "saw"), 1d);
     rules.put(new Rule("NN", "man"), 0.5);
     rules.put(new Rule("NN", "woman"), 0.2);
     rules.put(new Rule("NN", "dog"), 0.3);
     rules.put(new Rule("DT", "the"), 1d);
     rules.put(new Rule("IN", "with"), 0.2);
     rules.put(new Rule("IN", "in"), 0.1);
     rules.put(new Rule("IN", "for"), 0.4);
     rules.put(new Rule("IN", "of"), 0.4);
     rules.put(new Rule("NNP", "joe"), 0.1);
     rules.put(new Rule("NNP", "john"), 0.1);
     rules.put(new Rule("NNP", "sam"), 0.1);
     rules.put(new Rule("NNP", "michael"), 0.1);
     rules.put(new Rule("NNP", "michelle"), 0.1);
     rules.put(new Rule("NNP", "scarlett"), 0.5);
     rules.put(new Rule("Adv", "badly"), 0.3);
     rules.put(new Rule("Adv", "nicely"), 0.7);
   }

   @Test
   public void testIntermediateProbability() throws Exception {
     ArrayList<String> sentence = new ArrayList<String>();
     sentence.add("the");
     sentence.add("dog");
     sentence.add("saw");
     sentence.add("the");
     sentence.add("man");
     sentence.add("with");
     sentence.add("the");
     sentence.add("woman");

     ProbabilisticContextFreeGrammar pcfg = new ProbabilisticContextFreeGrammar(nonTerminals, terminals, rules, startSymbol);

     double pi = pcfg.pi(sentence, 0, 1, pcfg.getStartSymbol()).getProbability();
     assertEquals(0.3d, pi, 0d);

     pi = pcfg.pi(sentence, 2, 4, "VP").getProbability();
     assertEquals(0.35d, pi, 0d);
   }

   @Test
   public void testFullSentenceCKY() throws Exception {
     ProbabilisticContextFreeGrammar pcfg = new ProbabilisticContextFreeGrammar(nonTerminals, terminals, rules, startSymbol, true);

     // fixed sentence one
     List<String> sentence = new ArrayList<String>();
     sentence.add("the");
     sentence.add("dog");
     sentence.add("saw");
     sentence.add("the");
     sentence.add("man");

     ProbabilisticContextFreeGrammar.ParseTree parseTree = pcfg.cky(sentence);
     check(pcfg, parseTree, sentence);

     // fixed sentence two
     sentence = new ArrayList<String>();
     sentence.add("the");
     sentence.add("man");
     sentence.add("works");
     sentence.add("nicely");

     parseTree = pcfg.cky(sentence);
     check(pcfg, parseTree, sentence);

     // random sentence generated by the grammar
     String[] expansion = pcfg.leftMostDerivation("S");
     sentence = Arrays.asList(expansion);

     parseTree = pcfg.cky(sentence);
     check(pcfg, parseTree, sentence);
   }

   private void check(ProbabilisticContextFreeGrammar pcfg, ProbabilisticContextFreeGrammar.ParseTree parseTree, List<String> sentence) {
     Rule rule = parseTree.getRule();
     assertNotNull(rule);
     assertEquals(pcfg.getStartSymbol(), rule.getEntry());
     int s = parseTree.getSplitPoint();
     assertTrue(s >= 0);
     double pi = parseTree.getProbability();
     assertTrue(pi <= 1 && pi >= 0);
     List<String> expandedTerminals = getTerminals(parseTree);
     for (int i = 0; i < sentence.size(); i++) {
       assertEquals(sentence.get(i), expandedTerminals.get(i));
     }

   }

   private List<String> getTerminals(ProbabilisticContextFreeGrammar.ParseTree parseTree) {
     if (parseTree.getLeftTree() == null && parseTree.getRightTree() == null) {
       return Arrays.asList(parseTree.getRule().getExpansion());
     }

     ArrayList<String> list = new ArrayList<String>();
     list.addAll(getTerminals(parseTree.getLeftTree()));
     list.addAll(getTerminals(parseTree.getRightTree()));
     return list;
   }

   @Test
   public void testParseString() throws Exception {
     String string = "(S (VP (Adv last) (Vb tidy)) (NP (Adj biogenic) (NN Gainesville)))";
     Map<Rule, Double> rules = ProbabilisticContextFreeGrammar.parseRules(string);
     assertNotNull(rules);
     assertEquals(7, rules.size());
   }

   @Test
   public void testReadingItalianPennTreebankParseTreeSamples() throws Exception {
     String newsSample = "( (S \n" +
             "    (VP (VMA~RE Slitta) \n" +
             "        (PP-LOC (PREP a) \n" +
             "            (NP (NOU~PR Tirana))) \n" +
             "         (NP-EXTPSBJ-433 \n" +
             "             (NP (ART~DE la) (NOU~CS decisione)) \n" +
             "             (PP (PREP sullo) \n" +
             "                 (NP \n" +
             "                     (NP (ART~DE sullo) (NOU~CS stato)) \n" +
             "                     (PP (PREP di) \n" +
             "                         (NP (NOU~CS emergenza))))))) \n" +
             "          (NP-SBJ (-NONE- *-433)) \n" +
             "          (. .)) ) ";
     Map<Rule, Double> rules = new HashMap<>();
     ProbabilisticContextFreeGrammar.parseRules(rules, true, newsSample);
     assertNotNull(rules);

     String newsSample2 = "( (S \n" +
             "    (NP-SBJ (ART~DE La) (NOU~CS mafia) (ADJ~QU italiana)) \n" +
             "    (VP (VMA~RE opera) \n" +
             "        (PP-LOC (PREP in) \n" +
             "            (NP (NOU~PR Albania)))) \n" +
             "      (. .)) ) ";
     Map<Rule, Double> rules2 = new HashMap<>();
     ProbabilisticContextFreeGrammar.parseRules(rules2, true, newsSample2);
     assertNotNull(rules2);

     // aggregated
     Map<Rule, Double> rules3 = new HashMap<>();
     ProbabilisticContextFreeGrammar.parseRules(rules3, true, newsSample, newsSample2);
     assertNotNull(rules3);

   }

   @Ignore
   @Test
   public void testReadingItalianPennTreebankParseTree() throws Exception {
     InputStream resourceAsStream = getClass().getResourceAsStream("/it-tb-news.txt");
     BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(resourceAsStream));
     Collection<String> sentences = parseSentences(bufferedReader);
     ProbabilisticContextFreeGrammar cfg = ProbabilisticContextFreeGrammar.parseGrammar(sentences.toArray(new String[sentences.size()]));
     assertNotNull(cfg);
     String[] derivation = cfg.leftMostDerivation("S");
     assertNotNull(derivation);
     System.err.println(Arrays.toString(derivation));
     String sentence = "Il governo di Berisha pare in difficolta'";
     ProbabilisticContextFreeGrammar.ParseTree parseTree = cfg.cky(Arrays.asList(sentence.split(" ")));
     assertNotNull(parseTree);
   }

   private Collection<String> parseSentences(BufferedReader bufferedReader) throws IOException {

     Collection<String> sentences = new LinkedList<>();
     String line;
     StringBuilder sentence = new StringBuilder();
     while ((line = bufferedReader.readLine()) != null) {
       if (line.contains("(") || line.contains(")")) {
         sentence.append(line);
       } else if (line.contains("*****")){
         // only use single sentences
         String s = sentence.toString();
         if (s.trim().split("\\(S ").length == 2 && s.trim().startsWith("( (S")) {
           sentences.add(s);
         }
         sentence = new StringBuilder();
       }
     }
     if (sentence.length() > 0) {
       sentences.add(sentence.toString());
     }

     return sentences;
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/
	package org.apache.opennlp.utils.cfg;

	import java.io.BufferedReader;
	import java.io.IOException;
	import java.io.InputStream;
	import java.io.InputStreamReader;
	import java.util.ArrayList;
	import java.util.Arrays;
	import java.util.Collection;
	import java.util.HashMap;
	import java.util.LinkedList;
	import java.util.List;
	import java.util.Map;
	import org.junit.BeforeClass;
	import org.junit.Ignore;
	import org.junit.Test;

	import static org.junit.Assert.*;

	/**
	* Testcase for {@link org.apache.opennlp.utils.cfg.ProbabilisticContextFreeGrammar}
	*/
	public class ProbabilisticContextFreeGrammarTest {

	private static LinkedList<String> nonTerminals;
	private static String startSymbol;
	private static LinkedList<String> terminals;
	private static Map<Rule, Double> rules;

	@BeforeClass
	public static void setUp() throws Exception {
	nonTerminals = new LinkedList<String>();
	nonTerminals.add("S");
	nonTerminals.add("NP");
	nonTerminals.add("VP");
	nonTerminals.add("DT");
	nonTerminals.add("Vi");
	nonTerminals.add("Vt");
	nonTerminals.add("NN");
	nonTerminals.add("IN");
	nonTerminals.add("NNP");
	nonTerminals.add("Adv");

	startSymbol = "S";

	terminals = new LinkedList<String>();
	terminals.add("works");
	terminals.add("saw");
	terminals.add("man");
	terminals.add("woman");
	terminals.add("dog");
	terminals.add("the");
	terminals.add("with");
	terminals.add("in");
	terminals.add("joe");
	terminals.add("john");
	terminals.add("sam");
	terminals.add("michael");
	terminals.add("michelle");
	terminals.add("scarlett");
	terminals.add("and");
	terminals.add("but");
	terminals.add("while");
	terminals.add("of");
	terminals.add("for");
	terminals.add("badly");
	terminals.add("nicely");

	rules = new HashMap<Rule, Double>();
	rules.put(new Rule("S", "NP", "VP"), 1d);
	rules.put(new Rule("VP", "Vi", "Adv"), 0.3);
	rules.put(new Rule("VP", "Vt", "NP"), 0.7);
	rules.put(new Rule("NP", "DT", "NN"), 1d);
	rules.put(new Rule("Vi", "works"), 1d);
	rules.put(new Rule("Vt", "saw"), 1d);
	rules.put(new Rule("NN", "man"), 0.5);
	rules.put(new Rule("NN", "woman"), 0.2);
	rules.put(new Rule("NN", "dog"), 0.3);
	rules.put(new Rule("DT", "the"), 1d);
	rules.put(new Rule("IN", "with"), 0.2);
	rules.put(new Rule("IN", "in"), 0.1);
	rules.put(new Rule("IN", "for"), 0.4);
	rules.put(new Rule("IN", "of"), 0.4);
	rules.put(new Rule("NNP", "joe"), 0.1);
	rules.put(new Rule("NNP", "john"), 0.1);
	rules.put(new Rule("NNP", "sam"), 0.1);
	rules.put(new Rule("NNP", "michael"), 0.1);
	rules.put(new Rule("NNP", "michelle"), 0.1);
	rules.put(new Rule("NNP", "scarlett"), 0.5);
	rules.put(new Rule("Adv", "badly"), 0.3);
	rules.put(new Rule("Adv", "nicely"), 0.7);
	}

	@Test
	public void testIntermediateProbability() throws Exception {
	ArrayList<String> sentence = new ArrayList<String>();
	sentence.add("the");
	sentence.add("dog");
	sentence.add("saw");
	sentence.add("the");
	sentence.add("man");
	sentence.add("with");
	sentence.add("the");
	sentence.add("woman");

	ProbabilisticContextFreeGrammar pcfg = new ProbabilisticContextFreeGrammar(nonTerminals, terminals, rules, startSymbol);

	double pi = pcfg.pi(sentence, 0, 1, pcfg.getStartSymbol()).getProbability();
	assertEquals(0.3d, pi, 0d);

	pi = pcfg.pi(sentence, 2, 4, "VP").getProbability();
	assertEquals(0.35d, pi, 0d);
	}

	@Test
	public void testFullSentenceCKY() throws Exception {
	ProbabilisticContextFreeGrammar pcfg = new ProbabilisticContextFreeGrammar(nonTerminals, terminals, rules, startSymbol, true);

	// fixed sentence one
	List<String> sentence = new ArrayList<String>();
	sentence.add("the");
	sentence.add("dog");
	sentence.add("saw");
	sentence.add("the");
	sentence.add("man");

	ProbabilisticContextFreeGrammar.ParseTree parseTree = pcfg.cky(sentence);
	check(pcfg, parseTree, sentence);

	// fixed sentence two
	sentence = new ArrayList<String>();
	sentence.add("the");
	sentence.add("man");
	sentence.add("works");
	sentence.add("nicely");

	parseTree = pcfg.cky(sentence);
	check(pcfg, parseTree, sentence);

	// random sentence generated by the grammar
	String[] expansion = pcfg.leftMostDerivation("S");
	sentence = Arrays.asList(expansion);

	parseTree = pcfg.cky(sentence);
	check(pcfg, parseTree, sentence);
	}

	private void check(ProbabilisticContextFreeGrammar pcfg, ProbabilisticContextFreeGrammar.ParseTree parseTree, List<String> sentence) {
	Rule rule = parseTree.getRule();
	assertNotNull(rule);
	assertEquals(pcfg.getStartSymbol(), rule.getEntry());
	int s = parseTree.getSplitPoint();
	assertTrue(s >= 0);
	double pi = parseTree.getProbability();
	assertTrue(pi <= 1 && pi >= 0);
	List<String> expandedTerminals = getTerminals(parseTree);
	for (int i = 0; i < sentence.size(); i++) {
	assertEquals(sentence.get(i), expandedTerminals.get(i));
	}

	}

	private List<String> getTerminals(ProbabilisticContextFreeGrammar.ParseTree parseTree) {
	if (parseTree.getLeftTree() == null && parseTree.getRightTree() == null) {
	return Arrays.asList(parseTree.getRule().getExpansion());
	}

	ArrayList<String> list = new ArrayList<String>();
	list.addAll(getTerminals(parseTree.getLeftTree()));
	list.addAll(getTerminals(parseTree.getRightTree()));
	return list;
	}

	@Test
	public void testParseString() throws Exception {
	String string = "(S (VP (Adv last) (Vb tidy)) (NP (Adj biogenic) (NN Gainesville)))";
	Map<Rule, Double> rules = ProbabilisticContextFreeGrammar.parseRules(string);
	assertNotNull(rules);
	assertEquals(7, rules.size());
	}

	@Test
	public void testReadingItalianPennTreebankParseTreeSamples() throws Exception {
	String newsSample = "( (S \n" +
	" (VP (VMA~RE Slitta) \n" +
	" (PP-LOC (PREP a) \n" +
	" (NP (NOU~PR Tirana))) \n" +
	" (NP-EXTPSBJ-433 \n" +
	" (NP (ART~DE la) (NOU~CS decisione)) \n" +
	" (PP (PREP sullo) \n" +
	" (NP \n" +
	" (NP (ART~DE sullo) (NOU~CS stato)) \n" +
	" (PP (PREP di) \n" +
	" (NP (NOU~CS emergenza))))))) \n" +
	" (NP-SBJ (-NONE- *-433)) \n" +
	" (. .)) ) ";
	Map<Rule, Double> rules = new HashMap<>();
	ProbabilisticContextFreeGrammar.parseRules(rules, true, newsSample);
	assertNotNull(rules);

	String newsSample2 = "( (S \n" +
	" (NP-SBJ (ART~DE La) (NOU~CS mafia) (ADJ~QU italiana)) \n" +
	" (VP (VMA~RE opera) \n" +
	" (PP-LOC (PREP in) \n" +
	" (NP (NOU~PR Albania)))) \n" +
	" (. .)) ) ";
	Map<Rule, Double> rules2 = new HashMap<>();
	ProbabilisticContextFreeGrammar.parseRules(rules2, true, newsSample2);
	assertNotNull(rules2);

	// aggregated
	Map<Rule, Double> rules3 = new HashMap<>();
	ProbabilisticContextFreeGrammar.parseRules(rules3, true, newsSample, newsSample2);
	assertNotNull(rules3);

	}

	@Ignore
	@Test
	public void testReadingItalianPennTreebankParseTree() throws Exception {
	InputStream resourceAsStream = getClass().getResourceAsStream("/it-tb-news.txt");
	BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(resourceAsStream));
	Collection<String> sentences = parseSentences(bufferedReader);
	ProbabilisticContextFreeGrammar cfg = ProbabilisticContextFreeGrammar.parseGrammar(sentences.toArray(new String[sentences.size()]));
	assertNotNull(cfg);
	String[] derivation = cfg.leftMostDerivation("S");
	assertNotNull(derivation);
	System.err.println(Arrays.toString(derivation));
	String sentence = "Il governo di Berisha pare in difficolta'";
	ProbabilisticContextFreeGrammar.ParseTree parseTree = cfg.cky(Arrays.asList(sentence.split(" ")));
	assertNotNull(parseTree);
	}

	private Collection<String> parseSentences(BufferedReader bufferedReader) throws IOException {

	Collection<String> sentences = new LinkedList<>();
	String line;
	StringBuilder sentence = new StringBuilder();
	while ((line = bufferedReader.readLine()) != null) {
	if (line.contains("(") \|\| line.contains(")")) {
	sentence.append(line);
	} else if (line.contains("*****")){
	// only use single sentences
	String s = sentence.toString();
	if (s.trim().split("\\(S ").length == 2 && s.trim().startsWith("( (S")) {
	sentences.add(s);
	}
	sentence = new StringBuilder();
	}
	}
	if (sentence.length() > 0) {
	sentences.add(sentence.toString());
	}

	return sentences;
	}
	}