OPENNLP-817 - switched to Java 7, added missing Apache License (AL) header, added CFGRunner test, tweaked the parse rules method to adjust rule probabilities

commit: 8faad08031b24de16eaa6f65e68d94546e2e7690 [log] [tgz]
author: Tommaso Teofili <tommaso@apache.org> Fri Sep 18 08:02:12 2015 +0000
committer: Tommaso Teofili <tommaso@apache.org> Fri Sep 18 08:02:12 2015 +0000
tree: 71a947e6353671cab7fd340e1d8b705c32b7750c
parent: fbbf80357b8793886c6721ca85ef0be9653fe53d [diff]
diff --git a/nlp-utils/pom.xml b/nlp-utils/pom.xml
index d32a6ae..70d0df9 100644
--- a/nlp-utils/pom.xml
+++ b/nlp-utils/pom.xml

@@ -43,9 +43,8 @@
         <artifactId>maven-compiler-plugin</artifactId>
         <version>2.0.2</version>
         <configuration>
-          <compilerVersion>1.6</compilerVersion>
-          <source>1.6</source>
-          <target>1.6</target>
+          <source>1.7</source>
+          <target>1.7</target>
           <encoding>UTF-8</encoding>
         </configuration>
       </plugin>

diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/CFGRunner.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/CFGRunner.java
index e135c7f..e3bb59b 100644
--- a/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/CFGRunner.java
+++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/CFGRunner.java

@@ -1,3 +1,21 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
 package org.apache.opennlp.utils.cfg;
 
 import java.io.BufferedReader;

diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/ProbabilisticContextFreeGrammar.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/ProbabilisticContextFreeGrammar.java
index 26c2abd..f5d936c 100644
--- a/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/ProbabilisticContextFreeGrammar.java
+++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/ProbabilisticContextFreeGrammar.java

@@ -41,15 +41,15 @@
   private final String startSymbol;
   private boolean randomExpansion;
 
-  private static final Rule emptyRule = new Rule("E", "");
+  private static final Rule emptyRule = new Rule("EMPTY~", "");
 
   private static final String nonTerminalMatcher = "[\\w\\~\\*\\-\\.\\,\\'\\:\\_\\\"]";
-  private static final String terminalMatcher = "[òàùìèé\\|\\w\\'\\.\\,\\:\\_Ù\\?È\\%\\;À\\-\\\"]";
+  private static final String terminalMatcher = "[\\*òàùìèé\\|\\w\\'\\.\\,\\:\\_Ù\\?È\\%\\;À\\-\\\"]";
 
   private static final Pattern terminalPattern = Pattern.compile("\\(("+nonTerminalMatcher+"+)\\s("+terminalMatcher+"+)\\)");
   private static final Pattern nonTerminalPattern = Pattern.compile(
           "\\(("+nonTerminalMatcher+"+)" + // source NT
-                  "\\s("+nonTerminalMatcher+"+)(\\s("+nonTerminalMatcher+"+))*\\)" // expansion NTs
+                  "\\s("+nonTerminalMatcher+"+)((\\s"+nonTerminalMatcher+"+)*)\\)" // expansion NTs
   );
 
   public ProbabilisticContextFreeGrammar(Collection<String> nonTerminalSymbols, Collection<String> terminalSymbols,
@@ -94,7 +94,6 @@
       expansion.addAll(getTerminals(word));
     }
     return expansion.toArray(new String[expansion.size()]);
-
   }
 
   private Collection<String> getTerminals(String word) {
@@ -258,10 +257,10 @@
     public String toString() {
       if (getRule() != emptyRule) {
         return "(" +
-                rule.getEntry() + " " +
+                (rule != null ? rule.getEntry() : null) + " " +
                 (leftTree != null && rightTree != null ?
                         leftTree.toString() + " " + rightTree.toString() :
-                        rule.getExpansion()[0]
+                        (rule != null ? rule.getExpansion()[0] : null)
                 ) +
                 ')';
       } else {
@@ -296,6 +295,11 @@
     Collection<String> nonTerminals = new HashSet<>();
     Collection<String> terminals = new HashSet<>();
 
+    rules.put(emptyRule, 1d);
+    rulesMap.put(emptyRule, 1d);
+    nonTerminals.add(emptyRule.getEntry());
+    terminals.add(emptyRule.getExpansion()[0]);
+
     for (String parseTreeString : parseStrings) {
 
       if (trim) {
@@ -312,7 +316,6 @@
         if (!rules.containsKey(key)) {
           rules.put(key, 1d);
           terminals.add(t);
-//          System.err.println(key);
         }
         toConsume = toConsume.replace(m.group(), nt);
       }
@@ -340,16 +343,12 @@
 
           if (!rules.containsKey(key)) {
             rules.put(key, 1d);
-//            startSymbol = key.getEntry();
-//            System.err.println(key);
           }
           toConsume = toConsume.replace(m2.group(), nt);
         }
       }
     }
 
-    // TODO : check/adjust rules to make them respect CNF
-    // TODO : adjust probabilities based on term frequencies
     for (Map.Entry<Rule, Double> entry : rules.entrySet()) {
       normalize(entry.getKey(), nonTerminals, terminals, rulesMap);
     }
@@ -357,35 +356,55 @@
     return new ProbabilisticContextFreeGrammar(nonTerminals, terminals, rulesMap, startSymbol, true);
   }
 
+  /**
+   * Normalize (check and, if necessary, adjust) rules to make them respect CNF
+   * @param rule
+   * @param nonTerminals
+   * @param terminals
+   * @param rulesMap
+   */
   private static void normalize(Rule rule, Collection<String> nonTerminals, Collection<String> terminals, Map<Rule, Double> rulesMap) {
     String[] expansion = rule.getExpansion();
+    String firstExpansion = expansion[0];
     if (expansion.length == 1) {
-      if (!terminals.contains(expansion[0])) {
-        if (nonTerminals.contains(expansion[0])) {
+      if (!terminals.contains(firstExpansion)) {
+        if (nonTerminals.contains(firstExpansion)) {
           // nt1 -> nt2 should be expanded in nt1 -> nt2,E
-          rulesMap.put(new Rule(rule.getEntry(), expansion[0], emptyRule.getEntry()), 1d);
-          if (rulesMap.containsKey(emptyRule)) {
-            rulesMap.put(emptyRule, 1d);
-          }
+          Rule newRule = new Rule(rule.getEntry(), firstExpansion, emptyRule.getEntry());
+          addRule(newRule, rulesMap);
         } else {
           throw new RuntimeException("rule "+rule+" expands to neither a terminal or non terminal");
         }
       } else {
-        rulesMap.put(rule, 1d);
+        addRule(rule, rulesMap);
       }
     } else if (expansion.length > 2){
       // nt1 -> nt2,nt3,...,ntn should be collapsed to a hierarchy of ntX -> ntY,ntZ rules
-      String nt2 = expansion[0];
       int seed = nonTerminals.size();
       String generatedNT = "GEN~" + seed;
       nonTerminals.add(generatedNT);
-      Rule newRule = new Rule(rule.getEntry(), nt2, generatedNT);
+      Rule newRule = new Rule(rule.getEntry(), firstExpansion, generatedNT);
       rulesMap.put(newRule, 1d);
-      Rule chainedRule = new Rule(generatedNT, Arrays.copyOfRange(expansion, 1, expansion.length - 1));
+      Rule chainedRule = new Rule(generatedNT, Arrays.copyOfRange(expansion, 1, expansion.length));
       rulesMap.put(chainedRule, 1d);
       normalize(chainedRule, nonTerminals, terminals, rulesMap);
     } else {
-      rulesMap.put(rule, 1d);
+      addRule(rule, rulesMap);
     }
   }
+
+  private static void addRule(Rule rule, Map<Rule, Double> rulesMap) {
+    Double prob = rulesMap.get(rule);
+    if (prob != null && prob > 0d) {
+      if (prob > 0.9d) {
+        prob += 1d - prob - 0.01d;
+      } else {
+        prob += 0.01;
+      }
+    } else {
+      prob = 0.3d;
+    }
+
+    rulesMap.put(rule, prob);
+  }
 }

diff --git a/nlp-utils/src/test/java/org/apache/opennlp/utils/cfg/CFGRunnerTest.java b/nlp-utils/src/test/java/org/apache/opennlp/utils/cfg/CFGRunnerTest.java
new file mode 100644
index 0000000..2d98104
--- /dev/null
+++ b/nlp-utils/src/test/java/org/apache/opennlp/utils/cfg/CFGRunnerTest.java

@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.opennlp.utils.cfg;
+
+import org.junit.Test;
+
+/**
+ * Tests for {@link CFGRunner}
+ */
+public class CFGRunnerTest {
+
+  @Test
+  public void testDefaultMain() throws Exception {
+    CFGRunner.main(new String[0]);
+  }
+
+  @Test
+  public void testMainWithWD() throws Exception {
+    CFGRunner.main(new String[]{"-wn"});
+  }
+
+  @Test
+  public void testMainWithPT() throws Exception {
+    CFGRunner.main(new String[]{"-pt"});
+  }
+
+  @Test
+  public void testMainWithWNAndPT() throws Exception {
+    CFGRunner.main(new String[]{"-wn", "-pt"});
+  }
+}
\ No newline at end of file

diff --git a/nlp-utils/src/test/java/org/apache/opennlp/utils/cfg/ProbabilisticContextFreeGrammarTest.java b/nlp-utils/src/test/java/org/apache/opennlp/utils/cfg/ProbabilisticContextFreeGrammarTest.java
index 292032c..f820612 100644
--- a/nlp-utils/src/test/java/org/apache/opennlp/utils/cfg/ProbabilisticContextFreeGrammarTest.java
+++ b/nlp-utils/src/test/java/org/apache/opennlp/utils/cfg/ProbabilisticContextFreeGrammarTest.java

@@ -194,7 +194,7 @@
     String string = "(S (VP (Adv last) (Vb tidy)) (NP (Adj biogenic) (NN Gainesville)))";
     Map<Rule, Double> rules = ProbabilisticContextFreeGrammar.parseRules(string);
     assertNotNull(rules);
-    assertEquals(7, rules.size());
+    assertEquals(8, rules.size());
   }
 
   @Test
@@ -231,6 +231,11 @@
     ProbabilisticContextFreeGrammar.parseRules(rules3, true, newsSample, newsSample2);
     assertNotNull(rules3);
 
+    ProbabilisticContextFreeGrammar contextFreeGrammar = ProbabilisticContextFreeGrammar.parseGrammar(newsSample, newsSample2);
+    assertNotNull(contextFreeGrammar);
+    String[] derivation = contextFreeGrammar.leftMostDerivation("S");
+    assertNotNull(derivation);
+    assertTrue(derivation.length > 1);
   }
 
   @Ignore
@@ -244,9 +249,14 @@
     String[] derivation = cfg.leftMostDerivation("S");
     assertNotNull(derivation);
     System.err.println(Arrays.toString(derivation));
-    String sentence = "Il governo di Berisha pare in difficolta'";
-    ProbabilisticContextFreeGrammar.ParseTree parseTree = cfg.cky(Arrays.asList(sentence.split(" ")));
-    assertNotNull(parseTree);
+    ProbabilisticContextFreeGrammar.ParseTree parseTree1 = cfg.cky(Arrays.asList(derivation));
+    assertNotNull(parseTree1);
+    System.err.println(parseTree1);
+
+    String sentence = "Il Governo di Berisha appare in difficolta'";
+    List<String> fixedSentence = Arrays.asList(sentence.split(" "));
+    ProbabilisticContextFreeGrammar.ParseTree parseTree2 = cfg.cky(fixedSentence);
+    assertNotNull(parseTree2);
   }
 
   private Collection<String> parseSentences(BufferedReader bufferedReader) throws IOException {
commit	8faad08031b24de16eaa6f65e68d94546e2e7690	[log] [tgz]
author	Tommaso Teofili <tommaso@apache.org>	Fri Sep 18 08:02:12 2015 +0000
committer	Tommaso Teofili <tommaso@apache.org>	Fri Sep 18 08:02:12 2015 +0000
tree	71a947e6353671cab7fd340e1d8b705c32b7750c
parent	fbbf80357b8793886c6721ca85ef0be9653fe53d [diff]