blob: 59e534c843a8f61d99d075aa5860e247f7fa1d7a [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.uima.ruta.textruler.learner.lp2;
import java.io.File;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Set;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.TypeSystem;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.ruta.engine.RutaEngine;
import org.apache.uima.ruta.textruler.TextRulerPlugin;
import org.apache.uima.ruta.textruler.core.TextRulerAnnotation;
import org.apache.uima.ruta.textruler.core.TextRulerExample;
import org.apache.uima.ruta.textruler.core.TextRulerRule;
import org.apache.uima.ruta.textruler.core.TextRulerRuleItem;
import org.apache.uima.ruta.textruler.core.TextRulerRulePattern;
import org.apache.uima.ruta.textruler.core.TextRulerTarget;
import org.apache.uima.ruta.textruler.core.TextRulerTarget.MLTargetType;
import org.apache.uima.ruta.textruler.core.TextRulerToolkit;
import org.apache.uima.ruta.textruler.extension.TextRulerLearnerDelegate;
import org.apache.uima.ruta.textruler.learner.lp2.LP2RuleItem.MLLP2ContextConstraint;
import org.apache.uima.ruta.textruler.learner.lp2.LP2RuleItem.MLLP2OtherConstraint;
import org.apache.uima.util.FileUtils;
public class NaiveLP2 extends BasicLP2 {
/**
 * Creates a naive LP2 learner. This constructor adds no behavior of its own: every argument is
 * forwarded, unchanged and in the same order, to the {@code BasicLP2} constructor.
 */
public NaiveLP2(String inputDir, String prePropTMFile, String tmpDir, String[] slotNames,
Set<String> filterSet, boolean skip, TextRulerLearnerDelegate delegate) {
super(inputDir, prePropTMFile, tmpDir, slotNames, filterSet, skip, delegate);
}
/**
 * Gates the extra debug files this class writes below {@code tempDirectory()}. The flag is
 * always checked together with {@code TextRulerToolkit.DEBUG}, so both must be {@code true} for
 * any file to be written.
 */
public static final boolean SAVE_DEBUG_INFO_IN_TEMPFOLDER = false;
/**
 * Induces rules from one positive example: builds the initial rule for the example, generates
 * its generalizations, tests all of them on the example documents and hands every tested rule
 * to {@link #checkAndHandleNewRule(LP2Rule)}.
 *
 * @param e
 *          the positive example to learn from
 * @param roundNumber
 *          number of the current round; used here only for the status message and for the name
 *          of the optional debug file
 */
@Override
protected void induceRulesFromExample(TextRulerExample e, int roundNumber) {
  LP2Rule baseRule = createInitialRuleForPositiveExample(e);
  List<LP2Rule> genRules = generalizeRule(baseRule);
  if (shouldAbort()) {
    return;
  }

  // All generalizations are tested in one batch (cache and test-CAS optimized rule testing)
  // instead of testing one rule at a time.
  sendStatusUpdateToDelegate(
      "Round " + roundNumber + " - Testing " + genRules.size()
          + " generalizations... - uncovered examples: "
          + (examples.size() - coveredExamples.size() + " / " + examples.size()),
      TextRulerLearnerState.ML_RUNNING, false);
  testRulesOnDocumentSet(new ArrayList<TextRulerRule>(genRules), exampleDocuments);

  // Tested rules are only remembered for the optional debug dump below.
  List<LP2Rule> test = new ArrayList<LP2Rule>();
  for (LP2Rule newRule : genRules) {
    checkAndHandleNewRule(newRule);
    if (TextRulerToolkit.DEBUG) {
      test.add(newRule);
    }
  }

  if (TextRulerToolkit.DEBUG && SAVE_DEBUG_INFO_IN_TEMPFOLDER) {
    // Dump the tested generalizations, sorted by rule string, into a per-round script file.
    Collections.sort(test, new Comparator<LP2Rule>() {
      public int compare(LP2Rule o1, LP2Rule o2) {
        return o1.getRuleString().compareTo(o2.getRuleString());
      }
    });
    String startend = e.getTarget().type == MLTargetType.SINGLE_LEFT_BOUNDARY ? "left_"
        : "right_";
    File file = new File(tempDirectory() + startend + "generalizations" + roundNumber
        + RutaEngine.SCRIPT_FILE_EXTENSION);
    StringBuilder str = new StringBuilder();
    for (TextRulerRule rule : test) {
      str.append(rule.getCoveringStatistics()).append("\t\t").append(rule.getRuleString())
          .append("\n");
    }
    try {
      FileUtils.saveString2File(str.toString(), file);
    } catch (Exception ex) {
      // Debug output is best effort: report the failure, but do not abort learning.
      TextRulerPlugin.error(ex);
    }
  }
}
/**
 * Decides what to do with a rule that has already been tested on the example documents.
 * <ul>
 * <li>If it covers at least {@code minCoveredPositives} positives and its error rate does not
 * exceed {@code maxErrorThreshold}, it is added to {@code currentBestRules}.</li>
 * <li>If it covers enough positives but its error rate is too high, a copy of the rule gets a
 * context constraint on its marking item, is marked as contextual and is retested; the copy is
 * added to {@code currentContextualRules} if it then satisfies both thresholds.</li>
 * <li>Otherwise the rule is dropped.</li>
 * </ul>
 *
 * @param rule
 *          the tested rule to evaluate; it is never modified, the contextual variant is built on
 *          a copy
 */
protected void checkAndHandleNewRule(LP2Rule rule) {
  boolean tooFewPositives = rule.getCoveringStatistics().getCoveredPositivesCount() < minCoveredPositives;
  boolean tooManyErrors = rule.getErrorRate() > maxErrorThreshold;
  boolean isBestRule = !(tooFewPositives || tooManyErrors);
  if (TextRulerToolkit.DEBUG && SAVE_DEBUG_INFO_IN_TEMPFOLDER) {
    TextRulerToolkit.appendStringToFile(tempDirectory() + "bestcandidates"
        + RutaEngine.SCRIPT_FILE_EXTENSION, rule.getRuleString() + "\n");
  }
  if (isBestRule) {
    currentBestRules.add(rule);
    currentBestRules.removeSubsumedRules();
    currentBestRules.cutToMaxSize();
  } else if (!tooFewPositives) {
    // Test in context: in our TM representation we can simply add a context condition to the
    // MARKing rule item and retest the rule on the corpus. This should be done for all kinds of
    // tags, but currently only the corresponding opening/closing tag is used, since no
    // information about other slots is available yet.
    // TODO use all other slot tags! (see optimized version as well)
    LP2Rule contextRule = rule.copy();
    LP2RuleItem item = contextRule.getMarkingRuleItem();
    item.setContextConstraint(new MLLP2ContextConstraint(slotMaximumTokenCountMap
        .get(contextRule.getTarget().getSingleSlotRawTypeName()), contextRule));
    contextRule.setIsContextualRule(true);
    contextRule.setNeedsCompile(true);
    if (TextRulerToolkit.DEBUG && SAVE_DEBUG_INFO_IN_TEMPFOLDER) {
      // One rule per line, matching the "bestcandidates" debug file written above.
      TextRulerToolkit.appendStringToFile(tempDirectory() + "ctxcandidates"
          + RutaEngine.SCRIPT_FILE_EXTENSION, contextRule.getRuleString() + "\n");
    }
    // Retesting a single rule is not very fast, but it works.
    testRuleOnDocumentSet(contextRule, exampleDocuments);
    boolean ctxTooFewPositives = contextRule.getCoveringStatistics().getCoveredPositivesCount() < minCoveredPositives;
    boolean ctxTooManyErrors = contextRule.getErrorRate() > maxErrorThreshold;
    boolean isGoodContextRule = !(ctxTooFewPositives || ctxTooManyErrors);
    if (isGoodContextRule) {
      currentContextualRules.add(contextRule);
      currentContextualRules.removeSubsumedRules();
      currentContextualRules.cutToMaxSize();
    }
  }
}
/**
 * Generates the generalizations of the given rule.
 * <p>
 * The rule's items are first laid out in a single pattern: the pre-filler items in the reverse
 * of their stored order, followed by the post-filler items. That pattern is expanded by
 * {@link #recursiveGeneralizeRule}, and afterwards every resulting rule is passed through
 * {@link #removeOutermostWildCardItemsFromRule(LP2Rule)}.
 *
 * @param baseRule
 *          the rule to generalize
 * @return the generalized rules
 */
protected List<LP2Rule> generalizeRule(LP2Rule baseRule) {
  List<LP2Rule> generalizations = new ArrayList<LP2Rule>();

  TextRulerRulePattern allItems = new TextRulerRulePattern();
  TextRulerRulePattern preFiller = baseRule.getPreFillerPattern();
  // The stored pre-filler order has to be reversed again before it can be concatenated.
  int remaining = preFiller.size();
  while (remaining > 0) {
    remaining--;
    allItems.add(preFiller.get(remaining));
  }
  allItems.addAll(baseRule.getPostFillerPattern());

  recursiveGeneralizeRule(baseRule, allItems, new TextRulerRulePattern(), generalizations);
  TextRulerToolkit.log("GENERALIZATIONS: " + generalizations.size());

  for (LP2Rule generalized : generalizations) {
    removeOutermostWildCardItemsFromRule(generalized);
  }
  return generalizations;
}
/**
 * Builds the initial rule for a positive example.
 * <p>
 * The boundary position is the begin of the example annotation for a left-boundary target and
 * its end otherwise. The annotations returned by
 * {@code TextRulerToolkit.getAnnotationsBeforePosition} and
 * {@code TextRulerToolkit.getAnnotationsAfterPosition} for that position (using
 * {@code windowSize}) become the pre-filler and post-filler items of the rule. Each item gets
 * the annotation as word constraint and, when that word constraint is a regular-expression
 * constraint, an additional other-constraint built from the same annotation.
 *
 * @param example
 *          the positive example to build the rule from
 * @return the newly created rule
 */
protected LP2Rule createInitialRuleForPositiveExample(TextRulerExample example) {
  TextRulerTarget target = example.getTarget();
  LP2Rule rule = new LP2Rule(this, example.getTarget());
  CAS docCas = example.getDocumentCAS();
  TextRulerAnnotation exampleAnnotation = example.getAnnotation();
  TypeSystem typeSystem = docCas.getTypeSystem();
  Type tokensRootType = typeSystem.getType(TextRulerToolkit.RUTA_ANY_TYPE_NAME);

  int boundaryPosition;
  if (target.type == MLTargetType.SINGLE_LEFT_BOUNDARY) {
    boundaryPosition = exampleAnnotation.getBegin();
  } else {
    boundaryPosition = exampleAnnotation.getEnd();
  }

  List<AnnotationFS> leftContext = TextRulerToolkit.getAnnotationsBeforePosition(docCas,
      boundaryPosition, windowSize,
      TextRulerToolkit.getFilterSetWithSlotNames(slotNames, filterSet), tokensRootType);
  List<AnnotationFS> rightContext = TextRulerToolkit.getAnnotationsAfterPosition(docCas,
      boundaryPosition, windowSize,
      TextRulerToolkit.getFilterSetWithSlotNames(slotNames, filterSet), tokensRootType);

  // The left context is delivered from the slot's point of view, so it is walked back to front.
  for (int remaining = leftContext.size(); remaining > 0; remaining--) {
    AnnotationFS afs = leftContext.get(remaining - 1);
    TextRulerAnnotation annot = new TextRulerAnnotation(afs, example.getDocument());
    LP2RuleItem preItem = new LP2RuleItem();
    preItem.setWordConstraint(annot);
    if (preItem.getWordConstraint().isRegExpConstraint()) {
      preItem.addOtherConstraint(new MLLP2OtherConstraint(annot, annot));
    }
    rule.addPreFillerItem(preItem);
  }

  // The right context is used in the order in which it was returned.
  for (AnnotationFS afs : rightContext) {
    TextRulerAnnotation annot = new TextRulerAnnotation(afs, example.getDocument());
    LP2RuleItem postItem = new LP2RuleItem();
    postItem.setWordConstraint(annot);
    if (postItem.getWordConstraint().isRegExpConstraint()) {
      postItem.addOtherConstraint(new MLLP2OtherConstraint(annot, annot));
    }
    rule.addPostFillerItem(postItem);
  }

  TextRulerToolkit.log("INITIAL RULE: " + rule.getRuleString());
  return rule;
}
/**
 * Recursively enumerates the combinations of item generalizations for a rule.
 * <p>
 * While {@code currentPattern} does not yet have as many items as {@code allItems}, the next
 * item of {@code allItems} is generalized and the recursion continues once per generalization.
 * Once the sizes match, a new rule is assembled from {@code currentPattern}: the first
 * {@code baseRule.getPreFillerPattern().size()} items become pre-filler items, the remaining
 * ones post-filler items. Rules without any inner constraint are not added to the result.
 *
 * @param baseRule
 *          the rule being generalized; supplies the target and the pre-filler item count
 * @param allItems
 *          all items of the base rule in one pattern
 * @param currentPattern
 *          the partial combination built so far; modified during the recursion and restored
 *          before returning
 * @param resultList
 *          receives the generated rules
 */
protected void recursiveGeneralizeRule(LP2Rule baseRule, TextRulerRulePattern allItems,
    TextRulerRulePattern currentPattern, List<LP2Rule> resultList) {
  if (currentPattern.size() != allItems.size()) {
    // Pattern still incomplete: try every generalization of the next base item.
    int nextIndex = currentPattern.size();
    TextRulerRuleItem nextBaseItem = allItems.get(nextIndex);
    List<TextRulerRuleItem> alternatives = generalizeRuleItem((LP2RuleItem) nextBaseItem);
    for (TextRulerRuleItem alternative : alternatives) {
      currentPattern.add(alternative);
      recursiveGeneralizeRule(baseRule, allItems, currentPattern, resultList);
      currentPattern.remove(currentPattern.size() - 1);
    }
    return;
  }

  // Pattern complete: turn it into a rule, split into pre-filler and post-filler items.
  LP2Rule newRule = new LP2Rule(this, baseRule.getTarget());
  int total = currentPattern.size();
  int preCount = Math.min(baseRule.getPreFillerPattern().size(), total);
  for (int i = 0; i < preCount; i++) {
    newRule.addPreFillerItem(currentPattern.get(i));
  }
  for (int i = preCount; i < total; i++) {
    newRule.addPostFillerItem(currentPattern.get(i));
  }

  // Skip the "ANY ANY ANY ..." rule; a rule without any constraint is useless.
  if (newRule.totalInnerConstraintCount() > 0) {
    resultList.add(newRule);
  }
}
/**
 * Recursively enumerates every subset of {@code otherConstraints} and adds one new rule item
 * per subset to {@code result}.
 * <p>
 * For each constraint the recursion first continues without it and then with it. When all
 * constraints have been decided, a fresh {@code LP2RuleItem} is created that holds copies of the
 * constraints chosen in {@code currentConstraintTuple}.
 *
 * @param baseItem
 *          the item being generalized; only passed along to the recursive calls
 * @param otherConstraints
 *          the constraints to choose from
 * @param currentConstraintIndex
 *          index of the constraint to decide next
 * @param currentConstraintTuple
 *          the constraints chosen so far; modified during the recursion and restored before
 *          returning
 * @param result
 *          receives the generated items
 */
protected void recursiveGeneralizeRuleItem(LP2RuleItem baseItem,
    List<MLLP2OtherConstraint> otherConstraints, int currentConstraintIndex,
    List<MLLP2OtherConstraint> currentConstraintTuple, List<TextRulerRuleItem> result) {
  if (currentConstraintIndex < otherConstraints.size()) {
    MLLP2OtherConstraint candidate = otherConstraints.get(currentConstraintIndex);
    int nextIndex = currentConstraintIndex + 1;

    // Branch 1: leave this constraint out.
    recursiveGeneralizeRuleItem(baseItem, otherConstraints, nextIndex, currentConstraintTuple,
        result);

    // Branch 2: include this constraint, then undo the choice.
    currentConstraintTuple.add(candidate);
    recursiveGeneralizeRuleItem(baseItem, otherConstraints, nextIndex, currentConstraintTuple,
        result);
    currentConstraintTuple.remove(currentConstraintTuple.size() - 1);
    return;
  }

  // Every constraint has been decided: emit an item for the current selection.
  LP2RuleItem generalizedItem = new LP2RuleItem();
  for (MLLP2OtherConstraint chosen : currentConstraintTuple) {
    generalizedItem.addOtherConstraint(chosen.copy());
  }
  result.add(generalizedItem);
}
/**
 * Generates the generalizations of a single rule item.
 * <p>
 * If the item has a word constraint, the first generalization is a new item carrying only a copy
 * of that word constraint. All further generalizations carry no word constraint and are
 * produced by {@link #recursiveGeneralizeRuleItem} from the item's other constraints.
 *
 * @param baseItem
 *          the item to generalize
 * @return the generalized items
 */
protected List<TextRulerRuleItem> generalizeRuleItem(LP2RuleItem baseItem) {
  List<TextRulerRuleItem> variants = new ArrayList<TextRulerRuleItem>();

  // Variant that keeps only the word constraint.
  if (baseItem.getWordConstraint() != null) {
    LP2RuleItem wordOnlyItem = new LP2RuleItem();
    wordOnlyItem.setWordConstraint(baseItem.getWordConstraint().copy());
    variants.add(wordOnlyItem);
  }

  // Variants without the word constraint, built from the other constraints.
  recursiveGeneralizeRuleItem(baseItem, baseItem.getOtherConstraints(), 0,
      new ArrayList<MLLP2OtherConstraint>(), variants);
  return variants;
}
/**
 * Strips constraint-free items from both outer ends of a rule.
 * <p>
 * Outermost pre-filler items are removed as long as they have no constraints, and likewise for
 * the outermost post-filler items. One item is always kept on the side that marks the boundary:
 * the last remaining pre-filler item of a right-boundary rule and the last remaining post-filler
 * item of a left-boundary rule are never removed.
 *
 * @param rule
 *          the rule to trim; modified in place
 */
protected void removeOutermostWildCardItemsFromRule(LP2Rule rule) {
  // Pre-filler side.
  LP2RuleItem outerPre = (LP2RuleItem) rule.getOutermostPreFillerItem();
  while (outerPre != null) {
    // A RIGHT BOUNDARY rule needs its last pre-filler item for marking the slot end boundary.
    boolean isEndMarker = (rule.getTarget().type == MLTargetType.SINGLE_RIGHT_BOUNDARY)
        && (rule.getPreFillerPattern().size() == 1);
    if (isEndMarker || outerPre.totalConstraintCount() != 0) {
      break;
    }
    rule.removeOutermostPreFillerItem();
    outerPre = (LP2RuleItem) rule.getOutermostPreFillerItem();
  }

  // Post-filler side.
  LP2RuleItem outerPost = (LP2RuleItem) rule.getOutermostPostFillerItem();
  while (outerPost != null) {
    // A LEFT BOUNDARY rule needs its last post-filler item for marking the slot start boundary.
    boolean isStartMarker = (rule.getTarget().type == MLTargetType.SINGLE_LEFT_BOUNDARY)
        && (rule.getPostFillerPattern().size() == 1);
    if (isStartMarker || outerPost.totalConstraintCount() != 0) {
      break;
    }
    rule.removeOutermostPostFillerItem();
    outerPost = (LP2RuleItem) rule.getOutermostPostFillerItem();
  }
}
/**
 * Always returns {@code false} for this learner.
 * <p>
 * NOTE(review): judging by the name, this tells the inherited rule-testing code not to collect
 * the negative covered instances - confirm against the superclass.
 */
@Override
public boolean collectNegativeCoveredInstancesWhenTesting() {
return false;
}
}