// Source blob: d65f6acfc43692180df7aa547d94fd23d0d0911e
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.uima.ruta.textruler.learner.whisk.generic;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.commons.lang3.StringUtils;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.cas.FeatureStructure;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.TypeSystem;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.ruta.textruler.core.TextRulerAnnotation;
import org.apache.uima.ruta.textruler.core.TextRulerBasicLearner;
import org.apache.uima.ruta.textruler.core.TextRulerExample;
import org.apache.uima.ruta.textruler.core.TextRulerExampleDocument;
import org.apache.uima.ruta.textruler.core.TextRulerRule;
import org.apache.uima.ruta.textruler.core.TextRulerRuleItem;
import org.apache.uima.ruta.textruler.core.TextRulerRuleList;
import org.apache.uima.ruta.textruler.core.TextRulerRulePattern;
import org.apache.uima.ruta.textruler.core.TextRulerSlotPattern;
import org.apache.uima.ruta.textruler.core.TextRulerStatisticsCollector;
import org.apache.uima.ruta.textruler.core.TextRulerTarget;
import org.apache.uima.ruta.textruler.core.TextRulerToolkit;
import org.apache.uima.ruta.textruler.core.TextRulerWordConstraint;
import org.apache.uima.ruta.textruler.extension.TextRulerLearnerDelegate;
import org.apache.uima.ruta.textruler.learner.whisk.generic.WhiskRuleItem.MLWhiskOtherConstraint;
public class Whisk extends TextRulerBasicLearner {
// Parameter keys read by setParameters(Map).
public final static String WINDOWSIZE_KEY = "windowSize";
public final static String ERROR_THRESHOLD_KEY = "errorThreshold";
public final static String POSTAG_ROOTTYPE_KEY = "posTagRootType";
// Defaults used to initialize the corresponding fields below.
public final static int STANDARD_WINDOWSIZE = 5;
public final static float STANDARD_ERROR_THRESHOLD = 0.1f;
public final static String STANDARD_POSTAG_ROOTTYPE = "org.apache.uima.ml.ML.postag";
// Parameter key whose value is parsed by setParameters(Map) as a comma-separated list of
// feature names.
public static final String CONSIDERED_FEATURES = "consideredFeatures";
public static final String STANDARD_CONSIDERED_FEATURES = "";
// Rules accepted so far; (re)created at the start of doRun().
TextRulerRuleList ruleList;
// Positive examples covered by the rules accepted so far; (re)created at the start of doRun().
protected Set<TextRulerExample> coveredExamples;
// Number of expansion steps in getTermsWithinWindow(); also the filler-length limit in anchor().
protected int windowSize = STANDARD_WINDOWSIZE;
// Upper bound on a rule's Laplacian for it to be accepted even though it covers negatives.
protected double errorThreshold = STANDARD_ERROR_THRESHOLD;
// Name of the root type of POS tag annotations used by extendRule(); null or empty disables it.
protected String posTagRootTypeName = STANDARD_POSTAG_ROOTTYPE;
// Incremented in doRun() for every seed example a rule is grown from; used in status messages.
int roundNumber = 0;
// Number of positive examples of the target currently processed by doRun().
int allExamplesCount = 0;
// Feature names that may be turned into rule conditions; filled by setParameters(Map).
private List<String> consideredFeatures = new ArrayList<String>();
// Covering statistics of already tested rules, keyed by the rule string.
private Map<String, TextRulerStatisticsCollector> cachedTestedRuleStatistics = new HashMap<String, TextRulerStatisticsCollector>();
/**
 * Creates the learner. All arguments are passed unchanged to the
 * {@link TextRulerBasicLearner} constructor.
 */
public Whisk(String inputDir, String prePropTmFile, String tmpDir, String[] slotNames,
Set<String> filterSet, boolean skip, TextRulerLearnerDelegate delegate) {
super(inputDir, prePropTmFile, tmpDir, slotNames, filterSet, skip, delegate);
// useDynamicAnchoring = true;
}
/**
 * Always returns {@code false}.
 *
 * NOTE(review): presumably tells the base learner not to collect the covered negative instances
 * while testing rules - confirm against TextRulerBasicLearner.
 */
@Override
public boolean collectNegativeCoveredInstancesWhenTesting() {
return false;
}
/**
 * Runs the learner: for every slot name, examples are created for that target and a rule is
 * grown from each positive example that is not yet covered by an accepted rule.
 *
 * Unlike the original WHISK, training is not repeated for newly arriving documents at the
 * user's request; rules are learned from the whole training set at once, so the intermediate
 * rule base never has to be re-tested on an "incoming" document. The code is built to be
 * capable of multi-slot examples, but it has only been tested for the single-slot case.
 *
 * A grown rule is accepted if it covers no negative examples or if its Laplacian does not
 * exceed {@code errorThreshold}. The method returns early, without sending the final status
 * update, as soon as {@code shouldAbort()} reports true.
 */
@Override
protected void doRun() {
  cachedTestedRuleStatistics.clear();
  ruleList = new TextRulerRuleList();
  coveredExamples = new HashSet<TextRulerExample>();
  sendStatusUpdateToDelegate("Creating examples...", TextRulerLearnerState.ML_RUNNING, false);

  for (String slotName : slotNames) {
    TextRulerTarget target = new TextRulerTarget(slotName, this);
    exampleDocuments.createExamplesForTarget(target);
    TextRulerExampleDocument[] sortedDocuments = exampleDocuments
            .getSortedDocumentsInCacheOptimizedOrder();
    allExamplesCount = exampleDocuments.getAllPositiveExamples().size();

    for (TextRulerExampleDocument document : sortedDocuments) {
      // every still uncovered positive example is the seed of a new rule
      for (TextRulerExample seed : document.getPositiveExamples()) {
        if (coveredExamples.contains(seed)) {
          continue;
        }
        roundNumber++;
        WhiskRule candidate = growRule(document, seed);
        if (shouldAbort()) {
          break;
        }
        if (candidate == null) {
          continue;
        }
        boolean coversNoNegatives = candidate.getCoveringStatistics()
                .getCoveredNegativesCount() == 0;
        if (coversNoNegatives || candidate.getLaplacian() <= errorThreshold) {
          ruleList.addRule(candidate);
          coveredExamples.addAll(candidate.getCoveringStatistics().getCoveredPositiveExamples());
          sendStatusUpdateToDelegate("New Rule added...", TextRulerLearnerState.ML_RUNNING, true);
        }
      }
      if (shouldAbort()) {
        return;
      }
    }
  }

  sendStatusUpdateToDelegate("Done", TextRulerLearnerState.ML_DONE, true);
  cachedTestedRuleStatistics.clear();
}
/**
 * Grows a rule from a seed example: an empty rule with one slot pattern per example annotation
 * is anchored slot by slot and then extended while it still covers negative examples.
 *
 * Extension stops when {@code extendRule} returns {@code null} (the previous rule is kept) or
 * when the Laplacian of the extended rule is not smaller than that of its predecessor (the
 * extended rule is kept).
 *
 * @param doc
 *          the document the seed example belongs to
 * @param example
 *          the seed example
 * @return the grown rule, or {@code null} if anchoring produced no rule or the run was aborted
 *         during anchoring
 */
protected WhiskRule growRule(TextRulerExampleDocument doc, TextRulerExample example) {
  sendStatusUpdateToDelegate("Creating new rule from seed...", TextRulerLearnerState.ML_RUNNING,
          false);
  WhiskRule current = new WhiskRule(this, example.getTarget(), example);
  final int slotCount = example.getAnnotations().length;
  for (int slot = 0; slot < slotCount; slot++) {
    current.getPatterns().add(new TextRulerSlotPattern());
  }

  sendStatusUpdateToDelegate("Creating new rule: anchoring...", TextRulerLearnerState.ML_RUNNING,
          false);
  for (int slot = 0; slot < slotCount; slot++) {
    current = anchor(current, doc, example, slot);
    if (shouldAbort()) {
      return null;
    }
  }

  sendStatusUpdateToDelegate("Creating new rule: extending...", TextRulerLearnerState.ML_RUNNING,
          false);
  if (current == null) {
    return null;
  }

  double previousLaplacian = current.getLaplacian();
  int subRound = 0;
  // keep specializing as long as the rule still makes errors
  while (current.getCoveringStatistics().getCoveredNegativesCount() > 0) {
    WhiskRule extended = extendRule(current, doc, example, subRound);
    if (extended == null) {
      // no extension available: the rule found so far is the result
      break;
    }
    current = extended;
    TextRulerToolkit.log("----------------------------");
    TextRulerToolkit.log("BEST EXTENSION IS: " + current.getRuleString());
    TextRulerToolkit.log("Laplacian: " + current.getLaplacian() + " ; "
            + current.getCoveringStatistics());
    subRound++;
    double currentLaplacian = current.getLaplacian();
    if (currentLaplacian >= previousLaplacian) {
      break;
    }
    previousLaplacian = currentLaplacian;
  }
  TextRulerToolkit.log("----------------------------");
  TextRulerToolkit.log("FINAL RULE IS : " + current.getRuleString());
  return current;
}
/**
 * Proposes specializations of {@code rule}, tests them on the training documents and returns
 * the one with the lowest Laplacian.
 *
 * Candidates are created in two steps:
 * <ol>
 * <li>for items already in a filler pattern: activate one more considered feature, or reveal a
 * hidden regular expression;</li>
 * <li>for each term of the window around the first slot that is not yet part of the rule: add
 * the term, optionally with its regular expression revealed, with one considered feature
 * activated, or combined with a POS tag found at exactly the token's offsets.</li>
 * </ol>
 *
 * If {@code rule} itself has a Laplacian of at most {@code errorThreshold} it is the initial
 * best rule. A candidate replaces the best rule if its Laplacian is strictly smaller; among
 * candidates with equal Laplacian the one with fewer constraint points is preferred.
 *
 * @param rule
 *          the rule to extend
 * @param doc
 *          the document of the seed example
 * @param example
 *          the seed example
 * @param subRoundNumber
 *          number of the extension step, only used for the status message
 * @return the best rule, or {@code null} if the run was aborted or nothing qualified
 */
protected WhiskRule extendRule(WhiskRule rule, TextRulerExampleDocument doc,
        TextRulerExample example, int subRoundNumber) {
  WhiskRule bestRule = null;
  double bestL = 1.0;
  int bestRuleConstraintPoints = -1;
  if (rule.getLaplacian() <= errorThreshold) {
    bestRule = rule;
    bestL = rule.getLaplacian();
  }
  List<TextRulerRule> rulesToTest = new ArrayList<TextRulerRule>();

  // Step 1: only add conditions to items that are already part of a filler pattern.
  for (TextRulerSlotPattern eachPattern : rule.getPatterns()) {
    for (TextRulerRuleItem item : eachPattern.fillerPattern) {
      if (!(item instanceof WhiskRuleItem)) {
        continue;
      }
      WhiskRuleItem wri = (WhiskRuleItem) item;
      TextRulerWordConstraint wordConstraint = wri.getWordConstraint();
      if (wordConstraint == null) {
        continue;
      }
      for (String eachFeature : consideredFeatures) {
        Map<String, String> featureMap = wordConstraint.getTokenAnnotation().getFeatureMap();
        String stringValue = featureMap.get(eachFeature);
        if (stringValue != null && !wri.getActivatedFeatures().contains(eachFeature)) {
          // the item is changed only while the rule is copied, then restored
          wri.activateFeature(eachFeature);
          WhiskRule withFeature = rule.copy();
          wri.deactivateFeature(eachFeature);
          withFeature.setNeedsCompile(true);
          addRuleIfAbsent(rulesToTest, withFeature);
        }
      }
      if (wordConstraint.isRegExpConstraint() && wri.isHideRegExp()) {
        wri.setHideRegExp(false);
        WhiskRule withRegExp = rule.copy();
        wri.setHideRegExp(true);
        withRegExp.setNeedsCompile(true);
        addRuleIfAbsent(rulesToTest, withRegExp);
      }
    }
  }

  // Step 2: add terms of the window around the first slot.
  List<List<WhiskRuleItem>> slotTerms = getTermsWithinBounds(
          example.getAnnotations()[0].getBegin(), example.getAnnotations()[0].getEnd(), example);
  List<List<WhiskRuleItem>> windowTerms = getTermsWithinWindow(slotTerms, example, 0);
  for (List<WhiskRuleItem> eachList : windowTerms) {
    for (WhiskRuleItem term : eachList) {
      if (rule.containsTerm(term)) {
        continue;
      }
      WhiskRule proposedRule = createNewRuleByAddingTerm(rule, term);
      if (proposedRule == null) {
        continue;
      }
      addRuleIfAbsent(rulesToTest, proposedRule);

      // a second version with the exact token content if it is a regexp item
      if (term.getWordConstraint().isRegExpConstraint()) {
        term.setHideRegExp(false);
        WhiskRule withRegExp = proposedRule.copy();
        term.setHideRegExp(true);
        withRegExp.setNeedsCompile(true);
        addRuleIfAbsent(rulesToTest, withRegExp);
      }

      // versions with one feature condition each
      for (String eachFeature : consideredFeatures) {
        Map<String, String> featureMap = term.getWordConstraint().getTokenAnnotation()
                .getFeatureMap();
        String stringValue = featureMap.get(eachFeature);
        if (stringValue != null) {
          term.activateFeature(eachFeature);
          WhiskRule withFeature = proposedRule.copy();
          term.deactivateFeature(eachFeature);
          withFeature.setNeedsCompile(true);
          addRuleIfAbsent(rulesToTest, withFeature);
        }
      }

      // For WHISK performance testing purposes POS tags are added as well. This is not dynamic
      // feature capable.
      if (posTagRootTypeName != null && posTagRootTypeName.length() > 0) {
        TextRulerAnnotation tokenAnnotation = term.getWordConstraint().getTokenAnnotation();
        CAS cas = example.getDocumentCAS();
        TypeSystem ts = cas.getTypeSystem();
        Type posTagsRootType = ts.getType(posTagRootTypeName);
        // The configured type may be missing from the type system, in which case getType
        // yields null. The previous check tested ts, which had already been dereferenced.
        if (posTagsRootType != null) {
          List<AnnotationFS> posTagAnnotations = TextRulerToolkit.getAnnotationsWithinBounds(cas,
                  tokenAnnotation.getBegin(), tokenAnnotation.getEnd(), null, posTagsRootType);
          if (posTagAnnotations.size() > 0) {
            AnnotationFS posTag = posTagAnnotations.get(0);
            if (posTag.getBegin() == tokenAnnotation.getBegin()
                    && posTag.getEnd() == tokenAnnotation.getEnd()) {
              TextRulerAnnotation posTagAnnotation = new TextRulerAnnotation(posTag, doc,
                      consideredFeatures);
              // NOTE(review): the constraints below are added to `term`, not to the item inside
              // the copied rule, and `term` keeps them (and loses its word constraint)
              // afterwards. Kept as is; confirm whether the copies were meant to be modified.

              // 1. most specific term with all constraints we have
              WhiskRule proposedRule3 = proposedRule.copy();
              term.addOtherConstraint(new MLWhiskOtherConstraint(tokenAnnotation,
                      posTagAnnotation));
              proposedRule3.setNeedsCompile(true);
              addRuleIfAbsent(rulesToTest, proposedRule3);

              // 2. the same without the regexp
              WhiskRule proposedRule4 = proposedRule.copy();
              term.addOtherConstraint(new MLWhiskOtherConstraint(tokenAnnotation,
                      posTagAnnotation));
              proposedRule4.setNeedsCompile(true);
              addRuleIfAbsent(rulesToTest, proposedRule4);

              // 3. a rule with only the POS tag constraint
              WhiskRule proposedRule5 = proposedRule.copy();
              term.addOtherConstraint(new MLWhiskOtherConstraint(tokenAnnotation,
                      posTagAnnotation));
              term.setWordConstraint(null);
              proposedRule5.setNeedsCompile(true);
              addRuleIfAbsent(rulesToTest, proposedRule5);
            }
          }
        }
      }
    }
  }

  if (rulesToTest.size() == 0) {
    return bestRule;
  }
  sendStatusUpdateToDelegate(
          "Round "
                  + roundNumber
                  + "."
                  + subRoundNumber
                  + " - Testing "
                  + rulesToTest.size()
                  + " rules... "
                  + " - uncovered examples: "
                  + (allExamplesCount - coveredExamples.size() + " / " + allExamplesCount
                          + " ; cs=" + cachedTestedRuleStatistics.size()),
          TextRulerLearnerState.ML_RUNNING, false);
  TextRulerToolkit.log("Testing " + rulesToTest.size() + " rules on training set...");
  for (TextRulerRule r : rulesToTest) {
    TextRulerToolkit.log(r.getRuleString());
  }
  testRulesIfNotCached(rulesToTest);
  if (shouldAbort()) {
    return null;
  }

  for (TextRulerRule r : rulesToTest) {
    WhiskRule wr = (WhiskRule) r;
    if (wr.getLaplacian() < bestL) {
      bestL = wr.getLaplacian();
      bestRule = wr;
      bestRuleConstraintPoints = bestRule.totalConstraintPoints();
    } else if (wr.getLaplacian() == bestL && bestRuleConstraintPoints >= 0) {
      TextRulerToolkit.log("Same Laplacian! So prefer more general rule!");
      if (wr.totalConstraintPoints() < bestRuleConstraintPoints) {
        TextRulerToolkit.log("\tYes, prefered!");
        bestL = wr.getLaplacian();
        bestRule = wr;
        bestRuleConstraintPoints = bestRule.totalConstraintPoints();
      }
    }
  }
  return bestRule;
}

/** Appends {@code candidate} to {@code rules} unless the list already contains an equal rule. */
private static void addRuleIfAbsent(List<TextRulerRule> rules, TextRulerRule candidate) {
  if (!rules.contains(candidate)) {
    rules.add(candidate);
  }
}
/**
 * Widens every term sequence by one term on each side, {@code windowSize - steps} times.
 *
 * In each step a sequence is replaced by all combinations of a term directly before its first
 * item, the sequence itself and a term directly after its last item. If there is no term
 * before it, only the following terms are appended. A sequence for which no following term
 * exists is dropped.
 *
 * The expansion is a bounded loop: if {@code steps} is already at or beyond
 * {@code windowSize} (for example because a negative window size was configured) the input is
 * returned unchanged. The former recursion only stopped on equality and never terminated in
 * that case.
 *
 * @param slotTerms
 *          the term sequences to widen; each sequence must contain at least one item
 * @param example
 *          the example whose document is searched for neighboring terms
 * @param steps
 *          number of expansion steps already performed
 * @return the widened sequences, or {@code slotTerms} itself if no step is left
 */
private List<List<WhiskRuleItem>> getTermsWithinWindow(List<List<WhiskRuleItem>> slotTerms,
        TextRulerExample example, int steps) {
  List<List<WhiskRuleItem>> current = slotTerms;
  for (int step = steps; step < windowSize; step++) {
    List<List<WhiskRuleItem>> widened = new ArrayList<List<WhiskRuleItem>>();
    for (List<WhiskRuleItem> list : current) {
      List<WhiskRuleItem> termsBefore = getTermsBefore(list.get(0), example);
      List<WhiskRuleItem> termsAfter = getTermsAfter(list.get(list.size() - 1), example);
      if (!termsBefore.isEmpty()) {
        for (WhiskRuleItem before : termsBefore) {
          for (WhiskRuleItem after : termsAfter) {
            List<WhiskRuleItem> newList = new ArrayList<WhiskRuleItem>();
            newList.add(before);
            newList.addAll(list);
            newList.add(after);
            widened.add(newList);
          }
        }
      } else {
        for (WhiskRuleItem after : termsAfter) {
          List<WhiskRuleItem> newList = new ArrayList<WhiskRuleItem>();
          newList.addAll(list);
          newList.add(after);
          widened.add(newList);
        }
      }
    }
    current = widened;
  }
  return current;
}
/**
 * Creates a copy of {@code baseRule} that additionally contains a copy of {@code term}.
 *
 * The term is placed according to its offsets: first the pattern (pre-filler, filler or
 * post-filler of a slot) it belongs to is determined, then the term either replaces a star
 * wildcard whose neighbors enclose it or is inserted before the first item that starts at or
 * after the term's end. Finally a star wildcard is inserted between the new term and a
 * non-wildcard neighbor that does not directly follow or precede it in the seed example.
 *
 * @param baseRule
 *          the rule to extend; it is not modified
 * @param term
 *          the term to add
 * @return the new rule, or {@code null} if {@code term} is {@code null}, a star wildcard or has
 *         no word constraint, if the term could not be placed, or if the resulting rule string
 *         equals that of {@code baseRule}
 */
protected WhiskRule createNewRuleByAddingTerm(WhiskRule baseRule, WhiskRuleItem term) {
  if (term == null) {
    return null;
  }
  if (term.isStarWildCard() || term.getWordConstraint() == null) {
    return null;
  }
  WhiskRule newRule = baseRule.copy();
  int termBeginNumber = term.getWordConstraint().getTokenAnnotation().getBegin();
  int termEndNumber = term.getWordConstraint().getTokenAnnotation().getEnd();

  // 1. find the pattern the term belongs to
  TextRulerRulePattern targetPattern = null;
  TextRulerRulePattern previousSlotPostFillerPattern = null;
  for (int i = 0; i < newRule.getPatterns().size(); i++) {
    TextRulerSlotPattern slotPattern = newRule.getPatterns().get(i);
    // look at the pre-filler pattern
    if (startsAtOrAfter((WhiskRuleItem) slotPattern.preFillerPattern.lastItem(), termEndNumber)) {
      targetPattern = slotPattern.preFillerPattern;
    }
    // look at the filler pattern
    if (targetPattern == null && slotPattern.fillerPattern.size() > 0) {
      if (startsAtOrAfter((WhiskRuleItem) slotPattern.fillerPattern.firstItem(), termEndNumber)) {
        // still belongs to the pre-filler pattern, which is empty so it was not detected above
        targetPattern = slotPattern.preFillerPattern;
      } else if (startsAtOrAfter((WhiskRuleItem) slotPattern.fillerPattern.lastItem(),
              termEndNumber)) {
        targetPattern = slotPattern.fillerPattern;
      }
    }
    // look at the post-filler pattern
    if (targetPattern == null && slotPattern.postFillerPattern.size() > 0) {
      if (startsAtOrAfter((WhiskRuleItem) slotPattern.postFillerPattern.firstItem(),
              termEndNumber)) {
        // still belongs to the filler pattern, which is empty so it was not detected above
        targetPattern = slotPattern.fillerPattern;
      } else if (startsAtOrAfter((WhiskRuleItem) slotPattern.postFillerPattern.lastItem(),
              termEndNumber)) {
        targetPattern = slotPattern.postFillerPattern;
      }
    }
    if (targetPattern == null) {
      targetPattern = previousSlotPostFillerPattern;
    }
    previousSlotPostFillerPattern = slotPattern.postFillerPattern;
  }
  if (targetPattern == null) {
    targetPattern = previousSlotPostFillerPattern;
  }

  if (targetPattern == null) {
    TextRulerToolkit.log("ERROR, NO TARGET PATTERN FOR NEW RULE TERM FOUND !");
  } else {
    // 2. put the term into the pattern
    int indexInPattern = -1;
    if (targetPattern.size() == 0) {
      targetPattern.add(term.copy());
      indexInPattern = 0;
    } else {
      // 2a. does the term replace a wildcard? Only the first slot is searched.
      TextRulerSlotPattern firstSlot = newRule.getPatterns().get(0);
      WhiskRuleItem wildCard = findWildCardAround(newRule, firstSlot.preFillerPattern,
              termBeginNumber, termEndNumber);
      if (wildCard == null) {
        wildCard = findWildCardAround(newRule, firstSlot.fillerPattern, termBeginNumber,
                termEndNumber);
      }
      if (wildCard == null) {
        wildCard = findWildCardAround(newRule, firstSlot.postFillerPattern, termBeginNumber,
                termEndNumber);
      }
      if (wildCard != null) {
        if (!targetPattern.contains(wildCard)) {
          TextRulerToolkit.log("EVEN WORSE, THAT MUST NOT BE AT ALL!");
          return null;
        }
        indexInPattern = targetPattern.indexOf(wildCard);
        targetPattern.set(indexInPattern, term.copy());
      } else {
        // 2b. no wildcard: insert before the first item that starts at or after the term's end
        for (int i = 0; i < targetPattern.size(); i++) {
          if (startsAtOrAfter((WhiskRuleItem) targetPattern.get(i), termEndNumber)) {
            indexInPattern = i;
            break;
          }
        }
        if (indexInPattern < 0) {
          indexInPattern = targetPattern.size();
          targetPattern.add(term.copy());
        } else {
          targetPattern.add(indexInPattern, term.copy());
        }
      }
    }

    // 3. The term now replaced a wildcard or was added between two items. If a neighbor is
    // neither a wildcard nor directly adjacent in the seed example, a wildcard has to
    // separate the two.
    WhiskRuleItem newTerm = (WhiskRuleItem) targetPattern.get(indexInPattern);
    WhiskRuleItem left = newRule.searchNeighborOfItem(newTerm, true);
    if (left != null && left.getWordConstraint() != null && !left.isStarWildCard()) {
      if (!isNextValidNeighbor(left, newTerm, newRule.getSeedExample())) {
        targetPattern.add(indexInPattern, WhiskRuleItem.newWildCardItem());
        indexInPattern++;
      }
    }
    WhiskRuleItem right = newRule.searchNeighborOfItem(newTerm, false);
    if (right != null && right.getWordConstraint() != null && !right.isStarWildCard()) {
      if (!isNextValidNeighbor(newTerm, right, newRule.getSeedExample())) {
        WhiskRuleItem wc = WhiskRuleItem.newWildCardItem();
        if (indexInPattern + 1 < targetPattern.size()) {
          targetPattern.add(indexInPattern + 1, wc);
        } else {
          targetPattern.add(wc);
        }
      }
    }
    newRule.setNeedsCompile(true);
  }

  // a rule that reads exactly like the base rule is no extension
  if (newRule.getRuleString().equals(baseRule.getRuleString())) {
    return null;
  }
  return newRule;
}

/**
 * Tells whether {@code item} has a word constraint whose token begins at or after
 * {@code offset}; {@code false} for {@code null} and for items without word constraint.
 */
private static boolean startsAtOrAfter(WhiskRuleItem item, int offset) {
  return item != null && item.getWordConstraint() != null
          && offset <= item.getWordConstraint().getTokenAnnotation().getBegin();
}

/**
 * Returns the last star wildcard of {@code pattern} whose left neighbor ends at or before
 * {@code termBegin} and whose right neighbor begins at or after {@code termEnd}, or
 * {@code null} if there is none. Wildcards lacking a neighbor, or whose neighbor has no word
 * constraint, are skipped instead of causing a NullPointerException.
 */
private static WhiskRuleItem findWildCardAround(WhiskRule rule, TextRulerRulePattern pattern,
        int termBegin, int termEnd) {
  WhiskRuleItem found = null;
  for (TextRulerRuleItem each : pattern) {
    WhiskRuleItem candidate = (WhiskRuleItem) each;
    if (!candidate.isStarWildCard()) {
      continue;
    }
    WhiskRuleItem left = rule.searchNeighborOfItem(candidate, true);
    WhiskRuleItem right = rule.searchNeighborOfItem(candidate, false);
    if (left == null || right == null || left.getWordConstraint() == null
            || right.getWordConstraint() == null) {
      continue;
    }
    if (left.getWordConstraint().getTokenAnnotation().getEnd() <= termBegin
            && right.getWordConstraint().getTokenAnnotation().getBegin() >= termEnd) {
      found = candidate;
    }
  }
  return found;
}
/**
 * Anchors slot {@code slotIndex} of {@code rule}: for every term sequence found inside the slot
 * annotation two base rules are built and tested, and the better one is kept.
 * <ul>
 * <li>base1 describes the slot by its content: all terms of the sequence if there are at most
 * {@code windowSize} of them, otherwise copies of the first and the last term (with a wildcard
 * in between if there are more than two terms);</li>
 * <li>base2 is the candidate covering the most positive examples among rules that combine an
 * optional term before the slot, an optional term after the slot, and a filler consisting of
 * "first term + wildcard", "wildcard + last term" or just a wildcard.</li>
 * </ul>
 * base2 wins only if it covers strictly more positive examples than base1. Of the winners of
 * all term sequences the first one with the highest number of covered positives is returned.
 *
 * @param rule
 *          the rule to anchor; it is copied, not modified
 * @param doc
 *          the document of the example (not used by this implementation)
 * @param example
 *          the seed example
 * @param slotIndex
 *          index of the slot to anchor
 * @return the anchored rule, or {@code null} if {@code rule} is {@code null} or a term sequence
 *         is empty, if no term sequence was found, or if the run was aborted
 */
protected WhiskRule anchor(WhiskRule rule, TextRulerExampleDocument doc,
        TextRulerExample example, int slotIndex) {
  List<WhiskRule> winners = new ArrayList<WhiskRule>();
  TextRulerAnnotation slotAnnotation = example.getAnnotations()[slotIndex];
  List<List<WhiskRuleItem>> window = getTermsWithinBounds(slotAnnotation.getBegin(),
          slotAnnotation.getEnd(), example);

  for (List<WhiskRuleItem> inside : window) {
    if (rule == null || inside.isEmpty()) {
      return null;
    }
    final int termCount = inside.size();
    WhiskRuleItem firstTerm = inside.get(0);
    WhiskRuleItem lastTerm = inside.get(termCount - 1);

    // base 1: the slot filler rule
    WhiskRule base1 = rule.copy();
    TextRulerSlotPattern base1Slot = base1.getPatterns().get(slotIndex);
    // questionable restriction: TODO add parameter for this!
    if (termCount <= windowSize) {
      base1Slot.fillerPattern.addAll(inside);
    } else {
      base1Slot.fillerPattern.add(firstTerm.copy());
      if (termCount > 2) {
        base1Slot.fillerPattern.add(WhiskRuleItem.newWildCardItem());
      }
      if (termCount > 1) {
        base1Slot.fillerPattern.add(lastTerm.copy());
      }
    }

    // base 2: the context rule. The null entry stands for "no term on this side", so both
    // lists always hold at least one element.
    List<WhiskRuleItem> beforeList = getTermsBefore(firstTerm, example);
    List<WhiskRuleItem> afterList = getTermsAfter(lastTerm, example);
    beforeList.add(null);
    afterList.add(null);
    Collection<WhiskRule> candidates = new HashSet<WhiskRule>();
    // workaround for better rules: only inner begin, only inner end, then neither
    addAnchorCandidates(candidates, rule, slotIndex, beforeList, afterList, firstTerm, null);
    addAnchorCandidates(candidates, rule, slotIndex, beforeList, afterList, null, lastTerm);
    addAnchorCandidates(candidates, rule, slotIndex, beforeList, afterList, null, null);

    List<TextRulerRule> candidateList = new ArrayList<TextRulerRule>(candidates);
    testRulesIfNotCached(candidateList);
    WhiskRule base2 = (WhiskRule) selectMostCovering(candidateList);

    List<TextRulerRule> basesToTest = new ArrayList<TextRulerRule>();
    TextRulerToolkit.log("base1: " + base1.getRuleString());
    basesToTest.add(base1);
    if (base2 != null) {
      TextRulerToolkit.log("base2: " + base2.getRuleString());
      basesToTest.add(base2);
    }
    testRulesIfNotCached(basesToTest);
    if (shouldAbort()) {
      return null;
    }

    TextRulerToolkit.log("\tbase1: " + base1.getCoveringStatistics() + " --> laplacian = "
            + base1.getLaplacian());
    if (base2 == null) {
      winners.add(base1);
      continue;
    }
    TextRulerToolkit.log("\tbase2: " + base2.getCoveringStatistics() + " --> laplacian = "
            + base2.getLaplacian());
    int base1Positives = base1.getCoveringStatistics().getCoveredPositivesCount();
    int base2Positives = base2.getCoveringStatistics().getCoveredPositivesCount();
    winners.add(base2Positives > base1Positives ? base2 : base1);
  }

  return (WhiskRule) selectMostCovering(winners);
}

/**
 * Adds one copy of {@code rule} per combination of {@code befores} and {@code afters} to
 * {@code sink}. A non-null before/after item is appended to the pre-/post-filler pattern of
 * the slot. The filler gets a copy of {@code head} followed by a wildcard, or a wildcard
 * followed by a copy of {@code tail}, or only a wildcard if both are {@code null}.
 */
private static void addAnchorCandidates(Collection<WhiskRule> sink, WhiskRule rule,
        int slotIndex, List<WhiskRuleItem> befores, List<WhiskRuleItem> afters,
        WhiskRuleItem head, WhiskRuleItem tail) {
  for (WhiskRuleItem before : befores) {
    for (WhiskRuleItem after : afters) {
      WhiskRule candidate = rule.copy();
      TextRulerSlotPattern slot = candidate.getPatterns().get(slotIndex);
      if (before != null) {
        slot.preFillerPattern.add(before);
      }
      if (head != null) {
        slot.fillerPattern.add(head.copy());
      }
      slot.fillerPattern.add(WhiskRuleItem.newWildCardItem());
      if (tail != null) {
        slot.fillerPattern.add(tail.copy());
      }
      if (after != null) {
        slot.postFillerPattern.add(after);
      }
      sink.add(candidate);
    }
  }
}

/**
 * Returns the first rule with the highest number of covered positive examples, or
 * {@code null} for an empty list.
 */
private static TextRulerRule selectMostCovering(List<? extends TextRulerRule> rules) {
  TextRulerRule best = null;
  for (TextRulerRule each : rules) {
    if (best == null
            || each.getCoveringStatistics().getCoveredPositivesCount() > best
                    .getCoveringStatistics().getCoveredPositivesCount()) {
      best = each;
    }
  }
  return best;
}
/**
 * Collects the terms that follow the given item: starting at the item's end offset, the
 * annotation index is walked forward and every annotation whose type name is not in
 * {@code filterSetWithSlotNames} and that shares the begin offset of the first such annotation
 * becomes a term. The walk ends at the first unfiltered annotation with a different begin.
 *
 * @param whiskRuleItem
 *          the item after which to look; must have a word constraint
 * @param example
 *          the example providing CAS and document
 * @return the following terms, possibly empty
 */
private List<WhiskRuleItem> getTermsAfter(WhiskRuleItem whiskRuleItem, TextRulerExample example) {
  final int end = whiskRuleItem.getWordConstraint().getTokenAnnotation().getEnd();
  CAS cas = example.getDocumentCAS();
  // TODO: access type with string constant
  Type frameType = cas.getTypeSystem().getType("org.apache.uima.ruta.type.RutaFrame");
  // only used to position the iterator
  AnnotationFS pointer = cas.createAnnotation(frameType, end, Integer.MAX_VALUE);

  List<WhiskRuleItem> terms = new ArrayList<WhiskRuleItem>();
  int nextBegin = -1;
  for (FSIterator iterator = cas.getAnnotationIndex().iterator(pointer); iterator.isValid(); iterator
          .moveToNext()) {
    FeatureStructure fs = iterator.get();
    if (!(fs instanceof AnnotationFS)) {
      continue;
    }
    AnnotationFS a = (AnnotationFS) fs;
    if (filterSetWithSlotNames.contains(a.getType().getName())) {
      continue;
    }
    if (nextBegin == -1) {
      nextBegin = a.getBegin();
    } else if (nextBegin != a.getBegin()) {
      break;
    }
    if (a.getBegin() <= nextBegin && a.getBegin() >= end) {
      TextRulerAnnotation annotation = new TextRulerAnnotation(a, example.getDocument(),
              consideredFeatures);
      terms.add(new WhiskRuleItem(annotation));
    }
  }
  return terms;
}
/**
 * Collects the terms that precede the given item: the annotation index is walked backward from
 * the item's begin offset and every annotation whose type name is not in
 * {@code filterSetWithSlotNames} and that shares the end offset of the first accepted
 * annotation becomes a term. Unfiltered annotations ending after the example annotation are
 * skipped. The walk ends at the first remaining annotation with a different end offset.
 *
 * @param whiskRuleItem
 *          the item before which to look; must have a word constraint
 * @param example
 *          the example providing CAS and document
 * @return the preceding terms, possibly empty
 */
private List<WhiskRuleItem> getTermsBefore(WhiskRuleItem whiskRuleItem, TextRulerExample example) {
  final int begin = whiskRuleItem.getWordConstraint().getTokenAnnotation().getBegin();
  CAS cas = example.getDocumentCAS();
  // TODO: access type with string constant
  Type frameType = cas.getTypeSystem().getType("org.apache.uima.ruta.type.RutaFrame");
  // only used to position the iterator
  AnnotationFS pointer = cas.createAnnotation(frameType, begin, begin);
  FSIterator iterator = cas.getAnnotationIndex().iterator(pointer);

  List<WhiskRuleItem> terms = new ArrayList<WhiskRuleItem>();
  int nextEnd = -1;
  // NOTE(review): the iterator is moved back twice before the walk starts; the reason is not
  // evident from this code (the original comment was just "???").
  iterator.moveToPrevious();
  iterator.moveToPrevious();
  for (; iterator.isValid(); iterator.moveToPrevious()) {
    FeatureStructure fs = iterator.get();
    if (!(fs instanceof AnnotationFS)) {
      continue;
    }
    AnnotationFS a = (AnnotationFS) fs;
    if (filterSetWithSlotNames.contains(a.getType().getName())) {
      continue;
    }
    if (a.getEnd() > example.getAnnotation().getEnd()) {
      continue;
    }
    if (nextEnd == -1) {
      nextEnd = a.getEnd();
    } else if (nextEnd != a.getEnd()) {
      break;
    }
    if (a.getEnd() >= nextEnd && a.getEnd() <= begin) {
      TextRulerAnnotation annotation = new TextRulerAnnotation(a, example.getDocument(),
              consideredFeatures);
      terms.add(new WhiskRuleItem(annotation));
    }
  }
  return terms;
}
/**
 * Returns the learned rules as text: the file header followed by the rule strings of the
 * current rule list, or a placeholder message if no rule list exists yet.
 */
public String getResultString() {
  if (ruleList == null) {
    return "No results available yet!";
  }
  return getFileHeaderString(true) + ruleList.getRulesString("");
}
/**
 * Applies the learner parameters contained in {@code params}. Keys that are absent leave the
 * corresponding setting untouched.
 * <ul>
 * <li>{@link #WINDOWSIZE_KEY}: any {@link Number}, used as {@code int}</li>
 * <li>{@link #ERROR_THRESHOLD_KEY}: any {@link Number}, used as {@code double}</li>
 * <li>{@link #POSTAG_ROOTTYPE_KEY}: a {@link String}</li>
 * <li>{@link #CONSIDERED_FEATURES}: a {@link String} with comma-separated feature names; blank
 * entries and duplicates are ignored. The value replaces the previously configured features
 * instead of being appended to them, so calling this method repeatedly does not accumulate
 * entries.</li>
 * </ul>
 *
 * @param params
 *          the parameter values by key
 * @throws ClassCastException
 *           if a value does not have the type listed above
 * @throws NullPointerException
 *           if a numeric key is mapped to {@code null}
 */
public void setParameters(Map<String, Object> params) {
  if (TextRulerToolkit.DEBUG) {
    saveParametersToTempFolder(params);
  }
  // Numbers are read through Number so that e.g. a Double threshold or a Long window size is
  // accepted as well; Integer and Float values yield the same result as before.
  if (params.containsKey(WINDOWSIZE_KEY)) {
    windowSize = ((Number) params.get(WINDOWSIZE_KEY)).intValue();
  }
  if (params.containsKey(ERROR_THRESHOLD_KEY)) {
    errorThreshold = ((Number) params.get(ERROR_THRESHOLD_KEY)).doubleValue();
  }
  if (params.containsKey(POSTAG_ROOTTYPE_KEY)) {
    posTagRootTypeName = (String) params.get(POSTAG_ROOTTYPE_KEY);
  }
  if (params.containsKey(CONSIDERED_FEATURES)) {
    String list = (String) params.get(CONSIDERED_FEATURES);
    // cleared in place: the same list instance is handed to the annotations created elsewhere
    consideredFeatures.clear();
    if (!StringUtils.isBlank(list)) {
      for (String entry : list.split(",")) {
        String featureName = entry.trim();
        if (!StringUtils.isBlank(featureName) && !consideredFeatures.contains(featureName)) {
          consideredFeatures.add(featureName);
        }
      }
    }
  }
}
/**
 * Builds the term sequences that lie within {@code [startPos, endPos]}.
 *
 * The first unfiltered annotation inside the bounds determines the begin offset; every
 * unfiltered annotation inside the bounds with that begin offset starts a sequence of its own.
 * The sequences are then continued by {@code addFollowing} up to {@code endPos}.
 *
 * @param startPos
 *          begin offset of the region
 * @param endPos
 *          end offset of the region
 * @param example
 *          the example providing CAS and document
 * @return the term sequences, possibly empty
 */
public List<List<WhiskRuleItem>> getTermsWithinBounds(int startPos, int endPos,
        TextRulerExample example) {
  CAS cas = example.getDocumentCAS();
  // TODO: access type with string constant
  Type frameType = cas.getTypeSystem().getType("org.apache.uima.ruta.type.RutaFrame");
  // only used to position the iterator
  AnnotationFS pointer = cas.createAnnotation(frameType, startPos, endPos);

  List<AnnotationFS> startAs = new ArrayList<AnnotationFS>();
  int firstBegin = -1;
  for (FSIterator iterator = cas.getAnnotationIndex().iterator(pointer); iterator.isValid(); iterator
          .moveToNext()) {
    AnnotationFS a = (AnnotationFS) iterator.get();
    // TODO change for multislot rules!
    if (a.getBegin() < startPos || a.getEnd() > endPos) {
      continue;
    }
    if (filterSetWithSlotNames.contains(a.getType().getName())) {
      continue;
    }
    if (firstBegin == -1) {
      firstBegin = a.getBegin();
    } else if (firstBegin != a.getBegin()) {
      break;
    }
    if (a.getBegin() == firstBegin) {
      startAs.add(a);
    }
  }

  List<List<WhiskRuleItem>> sequences = new ArrayList<List<WhiskRuleItem>>();
  for (AnnotationFS annotation : startAs) {
    List<WhiskRuleItem> sequence = new ArrayList<WhiskRuleItem>();
    sequence.add(new WhiskRuleItem(new TextRulerAnnotation(annotation, example.getDocument(),
            consideredFeatures)));
    sequences.add(sequence);
  }
  return addFollowing(sequences, endPos, example);
}
/**
 * Continues the given term sequences with the terms that follow their last item, as long as
 * those terms end at or before {@code till}.
 *
 * NOTE(review): the result depends on the exact statement order. The recursive call is made
 * inside the loop on the sequences accumulated so far, and the unchanged input is returned as
 * soon as one sequence has no following term or a following term ends after {@code till} -
 * extensions already collected for other sequences are discarded in that case. Confirm that
 * this is intended before restructuring.
 *
 * @param lists
 *          the sequences to continue; each must contain at least one item
 * @param till
 *          maximum end offset of an appended term
 * @param example
 *          the example providing CAS and document
 * @return the continued sequences, or {@code lists} itself if an early exit was taken
 */
private List<List<WhiskRuleItem>> addFollowing(List<List<WhiskRuleItem>> lists, int till,
TextRulerExample example) {
List<List<WhiskRuleItem>> result = new ArrayList<List<WhiskRuleItem>>();
for (List<WhiskRuleItem> list : lists) {
WhiskRuleItem last = list.get(list.size() - 1);
List<WhiskRuleItem> termsAfter = getTermsAfter(last, example);
// nothing follows this sequence: give up and hand back the input unchanged
if (termsAfter.isEmpty()) {
return lists;
}
for (WhiskRuleItem eachAfter : termsAfter) {
if (eachAfter.getWordConstraint().getTokenAnnotation().getEnd() <= till) {
List<WhiskRuleItem> newList = new ArrayList<WhiskRuleItem>();
newList.addAll(list);
newList.add(eachAfter);
result.add(newList);
// recurse on everything collected so far, not only on newList
result = addFollowing(result, till, example);
} else {
// the following term leaves the bounds: hand back the input unchanged
return lists;
}
}
}
return result;
}
// TODO share this between algorithms (e.g. LP2 and RAPIER, WHISK ?) and
// make a maximum size of the cache, etc. like CasCache?
/**
 * Makes sure every given rule has covering statistics. Rules whose rule string is found in the
 * cache get a copy of the cached statistics; all others are tested on the example documents
 * and a copy of their statistics is cached afterwards. Nothing is cached if the run was
 * aborted during testing.
 *
 * @param rules
 *          the rules to provide with statistics
 */
protected void testRulesIfNotCached(List<TextRulerRule> rules) {
  List<TextRulerRule> uncached = new ArrayList<TextRulerRule>();
  for (TextRulerRule rule : rules) {
    String key = rule.getRuleString();
    if (!cachedTestedRuleStatistics.containsKey(key)) {
      uncached.add(rule);
      continue;
    }
    rule.setCoveringStatistics(cachedTestedRuleStatistics.get(key).copy());
  }
  if (uncached.isEmpty()) {
    return;
  }
  testRulesOnDocumentSet(uncached, exampleDocuments);
  if (shouldAbort()) {
    return;
  }
  for (TextRulerRule rule : uncached) {
    cachedTestedRuleStatistics.put(rule.getRuleString(), rule.getCoveringStatistics().copy());
  }
}
/**
 * Returns the position of {@code term} within the first slot of {@code proposedRule}, counting
 * the items of the pre-filler, filler and post-filler pattern consecutively. If several items
 * equal the term, the position of the last one is returned.
 *
 * @param proposedRule
 *          the rule to search
 * @param term
 *          the item to look for
 * @return the zero-based position, or -1 if {@code term} is {@code null} or not found
 */
private int getElementIndex(WhiskRule proposedRule, WhiskRuleItem term) {
  if (term == null) {
    return -1;
  }
  TextRulerSlotPattern firstSlot = proposedRule.getPatterns().get(0);
  TextRulerRulePattern[] sections = { firstSlot.preFillerPattern, firstSlot.fillerPattern,
      firstSlot.postFillerPattern };
  int position = 0;
  int lastMatch = -1;
  for (TextRulerRulePattern section : sections) {
    for (TextRulerRuleItem each : section) {
      if (((WhiskRuleItem) each).equals(term)) {
        lastMatch = position;
      }
      position++;
    }
  }
  return lastMatch;
}
/**
 * Tells whether {@code right} directly follows {@code left}: the annotation index is walked
 * from the end of {@code left}, and an annotation lying completely between the end of
 * {@code left} and the begin of {@code right} whose type name is not in
 * {@code filterSetWithSlotNames} makes the two items non-adjacent.
 *
 * @param left
 *          the left item; must have a word constraint
 * @param right
 *          the right item; must have a word constraint
 * @param example
 *          the example providing the CAS
 * @return {@code false} if an unfiltered annotation lies between the items, {@code true}
 *         otherwise
 */
private boolean isNextValidNeighbor(WhiskRuleItem left, WhiskRuleItem right,
        TextRulerExample example) {
  final int begin = left.getWordConstraint().getTokenAnnotation().getEnd();
  final int end = right.getWordConstraint().getTokenAnnotation().getBegin();
  CAS cas = example.getDocumentCAS();
  // TODO: access type with string constant
  Type frameType = cas.getTypeSystem().getType("org.apache.uima.ruta.type.RutaFrame");
  // only used to position the iterator
  AnnotationFS pointer = cas.createAnnotation(frameType, begin, end);
  for (FSIterator iterator = cas.getAnnotationIndex().iterator(pointer); iterator.isValid(); iterator
          .moveToNext()) {
    AnnotationFS a = (AnnotationFS) iterator.get();
    boolean liesBetween = a.getBegin() >= begin && a.getEnd() <= end;
    if (liesBetween && !filterSetWithSlotNames.contains(a.getType().getName())) {
      return false;
    }
  }
  return true;
}
}