OPENNLP-817 - switched to Java 7, added missing Apache License header, added CFGRunner test, tweaked rule parsing to adjust probabilities
diff --git a/nlp-utils/pom.xml b/nlp-utils/pom.xml
index d32a6ae..70d0df9 100644
--- a/nlp-utils/pom.xml
+++ b/nlp-utils/pom.xml
@@ -43,9 +43,8 @@
<artifactId>maven-compiler-plugin</artifactId>
<version>2.0.2</version>
<configuration>
- <compilerVersion>1.6</compilerVersion>
- <source>1.6</source>
- <target>1.6</target>
+ <source>1.7</source>
+ <target>1.7</target>
<encoding>UTF-8</encoding>
</configuration>
</plugin>
diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/CFGRunner.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/CFGRunner.java
index e135c7f..e3bb59b 100644
--- a/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/CFGRunner.java
+++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/CFGRunner.java
@@ -1,3 +1,21 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
package org.apache.opennlp.utils.cfg;
import java.io.BufferedReader;
diff --git a/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/ProbabilisticContextFreeGrammar.java b/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/ProbabilisticContextFreeGrammar.java
index 26c2abd..f5d936c 100644
--- a/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/ProbabilisticContextFreeGrammar.java
+++ b/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/ProbabilisticContextFreeGrammar.java
@@ -41,15 +41,15 @@
private final String startSymbol;
private boolean randomExpansion;
- private static final Rule emptyRule = new Rule("E", "");
+ private static final Rule emptyRule = new Rule("EMPTY~", "");
private static final String nonTerminalMatcher = "[\\w\\~\\*\\-\\.\\,\\'\\:\\_\\\"]";
- private static final String terminalMatcher = "[òàùìèé\\|\\w\\'\\.\\,\\:\\_Ù\\?È\\%\\;À\\-\\\"]";
+ private static final String terminalMatcher = "[\\*òàùìèé\\|\\w\\'\\.\\,\\:\\_Ù\\?È\\%\\;À\\-\\\"]";
private static final Pattern terminalPattern = Pattern.compile("\\(("+nonTerminalMatcher+"+)\\s("+terminalMatcher+"+)\\)");
private static final Pattern nonTerminalPattern = Pattern.compile(
"\\(("+nonTerminalMatcher+"+)" + // source NT
- "\\s("+nonTerminalMatcher+"+)(\\s("+nonTerminalMatcher+"+))*\\)" // expansion NTs
+ "\\s("+nonTerminalMatcher+"+)((\\s"+nonTerminalMatcher+"+)*)\\)" // expansion NTs
);
public ProbabilisticContextFreeGrammar(Collection<String> nonTerminalSymbols, Collection<String> terminalSymbols,
@@ -94,7 +94,6 @@
expansion.addAll(getTerminals(word));
}
return expansion.toArray(new String[expansion.size()]);
-
}
private Collection<String> getTerminals(String word) {
@@ -258,10 +257,10 @@
public String toString() {
if (getRule() != emptyRule) {
return "(" +
- rule.getEntry() + " " +
+ (rule != null ? rule.getEntry() : null) + " " +
(leftTree != null && rightTree != null ?
leftTree.toString() + " " + rightTree.toString() :
- rule.getExpansion()[0]
+ (rule != null ? rule.getExpansion()[0] : null)
) +
')';
} else {
@@ -296,6 +295,11 @@
Collection<String> nonTerminals = new HashSet<>();
Collection<String> terminals = new HashSet<>();
+ rules.put(emptyRule, 1d);
+ rulesMap.put(emptyRule, 1d);
+ nonTerminals.add(emptyRule.getEntry());
+ terminals.add(emptyRule.getExpansion()[0]);
+
for (String parseTreeString : parseStrings) {
if (trim) {
@@ -312,7 +316,6 @@
if (!rules.containsKey(key)) {
rules.put(key, 1d);
terminals.add(t);
-// System.err.println(key);
}
toConsume = toConsume.replace(m.group(), nt);
}
@@ -340,16 +343,12 @@
if (!rules.containsKey(key)) {
rules.put(key, 1d);
-// startSymbol = key.getEntry();
-// System.err.println(key);
}
toConsume = toConsume.replace(m2.group(), nt);
}
}
}
- // TODO : check/adjust rules to make them respect CNF
- // TODO : adjust probabilities based on term frequencies
for (Map.Entry<Rule, Double> entry : rules.entrySet()) {
normalize(entry.getKey(), nonTerminals, terminals, rulesMap);
}
@@ -357,35 +356,55 @@
return new ProbabilisticContextFreeGrammar(nonTerminals, terminals, rulesMap, startSymbol, true);
}
+ /**
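+ * Normalize (check and, where needed, adjust) a rule so that it respects Chomsky Normal Form (CNF)
+ * @param rule the rule to normalize
+ * @param nonTerminals the known non terminal symbols; generated "GEN~n" symbols are added to it
+ * @param terminals the known terminal symbols
+ * @param rulesMap the rule to probability map that receives the normalized rules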
+ */
private static void normalize(Rule rule, Collection<String> nonTerminals, Collection<String> terminals, Map<Rule, Double> rulesMap) {
String[] expansion = rule.getExpansion();
+ String firstExpansion = expansion[0]; // NOTE(review): read before any length check - throws if a rule ever has an empty expansion
if (expansion.length == 1) {
- if (!terminals.contains(expansion[0])) {
- if (nonTerminals.contains(expansion[0])) {
+ if (!terminals.contains(firstExpansion)) {
+ if (nonTerminals.contains(firstExpansion)) {
// nt1 -> nt2 should be expanded in nt1 -> nt2,E
- rulesMap.put(new Rule(rule.getEntry(), expansion[0], emptyRule.getEntry()), 1d);
- if (rulesMap.containsKey(emptyRule)) {
- rulesMap.put(emptyRule, 1d);
- }
+ Rule newRule = new Rule(rule.getEntry(), firstExpansion, emptyRule.getEntry()); // pad the unary rule with the empty rule's entry as second symbol
+ addRule(newRule, rulesMap); // addRule seeds or adjusts the probability instead of always storing 1d
} else {
throw new RuntimeException("rule "+rule+" expands to neither a terminal or non terminal");
}
} else {
- rulesMap.put(rule, 1d);
+ addRule(rule, rulesMap);
}
} else if (expansion.length > 2){
// nt1 -> nt2,nt3,...,ntn should be collapsed to a hierarchy of ntX -> ntY,ntZ rules
- String nt2 = expansion[0];
int seed = nonTerminals.size();
String generatedNT = "GEN~" + seed;
nonTerminals.add(generatedNT);
- Rule newRule = new Rule(rule.getEntry(), nt2, generatedNT);
+ Rule newRule = new Rule(rule.getEntry(), firstExpansion, generatedNT);
rulesMap.put(newRule, 1d);
- Rule chainedRule = new Rule(generatedNT, Arrays.copyOfRange(expansion, 1, expansion.length - 1));
+ Rule chainedRule = new Rule(generatedNT, Arrays.copyOfRange(expansion, 1, expansion.length)); // end index is exclusive: keeps every symbol after the first
rulesMap.put(chainedRule, 1d);
normalize(chainedRule, nonTerminals, terminals, rulesMap);
} else {
- rulesMap.put(rule, 1d);
+ addRule(rule, rulesMap);
}
}
+
+ private static void addRule(Rule rule, Map<Rule, Double> rulesMap) { // stores the rule with a seeded or adjusted probability
+ Double prob = rulesMap.get(rule); // null when the rule is not in the map yet
+ if (prob != null && prob > 0d) {
+ if (prob > 0.9d) {
+ prob += 1d - prob - 0.01d; // algebraically 1 - 0.01: a value above 0.9 lands at about 0.99
+ } else {
+ prob += 0.01; // small bump each time an already known rule is added again
+ }
+ } else {
+ prob = 0.3d; // rule missing (or stored value not positive): start from 0.3
+ }
+
+ rulesMap.put(rule, prob);
+ }
}
diff --git a/nlp-utils/src/test/java/org/apache/opennlp/utils/cfg/CFGRunnerTest.java b/nlp-utils/src/test/java/org/apache/opennlp/utils/cfg/CFGRunnerTest.java
new file mode 100644
index 0000000..2d98104
--- /dev/null
+++ b/nlp-utils/src/test/java/org/apache/opennlp/utils/cfg/CFGRunnerTest.java
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.opennlp.utils.cfg;
+
+import org.junit.Test;
+
+/**
+ * Tests for {@link CFGRunner}
+ */
+public class CFGRunnerTest { // smoke tests: each one passes as long as CFGRunner.main does not throw
+
+  @Test
+  public void testDefaultMain() throws Exception { // no command line flags
+    CFGRunner.main(new String[0]);
+  }
+
+  @Test
+  public void testMainWithWN() throws Exception { // "-wn" flag only (was misnamed testMainWithWD)
+    CFGRunner.main(new String[]{"-wn"});
+  }
+
+  @Test
+  public void testMainWithPT() throws Exception { // "-pt" flag only
+    CFGRunner.main(new String[]{"-pt"});
+  }
+
+  @Test
+  public void testMainWithWNAndPT() throws Exception { // both flags together
+    CFGRunner.main(new String[]{"-wn", "-pt"});
+  }
+}
\ No newline at end of file
diff --git a/nlp-utils/src/test/java/org/apache/opennlp/utils/cfg/ProbabilisticContextFreeGrammarTest.java b/nlp-utils/src/test/java/org/apache/opennlp/utils/cfg/ProbabilisticContextFreeGrammarTest.java
index 292032c..f820612 100644
--- a/nlp-utils/src/test/java/org/apache/opennlp/utils/cfg/ProbabilisticContextFreeGrammarTest.java
+++ b/nlp-utils/src/test/java/org/apache/opennlp/utils/cfg/ProbabilisticContextFreeGrammarTest.java
@@ -194,7 +194,7 @@
String string = "(S (VP (Adv last) (Vb tidy)) (NP (Adj biogenic) (NN Gainesville)))";
Map<Rule, Double> rules = ProbabilisticContextFreeGrammar.parseRules(string);
assertNotNull(rules);
- assertEquals(7, rules.size());
+ assertEquals(8, rules.size());
}
@Test
@@ -231,6 +231,11 @@
ProbabilisticContextFreeGrammar.parseRules(rules3, true, newsSample, newsSample2);
assertNotNull(rules3);
+ ProbabilisticContextFreeGrammar contextFreeGrammar = ProbabilisticContextFreeGrammar.parseGrammar(newsSample, newsSample2);
+ assertNotNull(contextFreeGrammar);
+ String[] derivation = contextFreeGrammar.leftMostDerivation("S");
+ assertNotNull(derivation);
+ assertTrue(derivation.length > 1);
}
@Ignore
@@ -244,9 +249,14 @@
String[] derivation = cfg.leftMostDerivation("S");
assertNotNull(derivation);
System.err.println(Arrays.toString(derivation));
- String sentence = "Il governo di Berisha pare in difficolta'";
- ProbabilisticContextFreeGrammar.ParseTree parseTree = cfg.cky(Arrays.asList(sentence.split(" ")));
- assertNotNull(parseTree);
+ ProbabilisticContextFreeGrammar.ParseTree parseTree1 = cfg.cky(Arrays.asList(derivation));
+ assertNotNull(parseTree1);
+ System.err.println(parseTree1);
+
+ String sentence = "Il Governo di Berisha appare in difficolta'";
+ List<String> fixedSentence = Arrays.asList(sentence.split(" "));
+ ProbabilisticContextFreeGrammar.ParseTree parseTree2 = cfg.cky(fixedSentence);
+ assertNotNull(parseTree2);
}
private Collection<String> parseSentences(BufferedReader bufferedReader) throws IOException {