blob: 6ce415318d49e19430d5d67cc3b66a27c1e7c7fe [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.uima.ruta.textruler.learner.whisk.token;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.TypeSystem;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.ruta.textruler.core.TextRulerAnnotation;
import org.apache.uima.ruta.textruler.core.TextRulerBasicLearner;
import org.apache.uima.ruta.textruler.core.TextRulerExample;
import org.apache.uima.ruta.textruler.core.TextRulerExampleDocument;
import org.apache.uima.ruta.textruler.core.TextRulerRule;
import org.apache.uima.ruta.textruler.core.TextRulerRuleList;
import org.apache.uima.ruta.textruler.core.TextRulerRulePattern;
import org.apache.uima.ruta.textruler.core.TextRulerSlotPattern;
import org.apache.uima.ruta.textruler.core.TextRulerStatisticsCollector;
import org.apache.uima.ruta.textruler.core.TextRulerTarget;
import org.apache.uima.ruta.textruler.core.TextRulerToolkit;
import org.apache.uima.ruta.textruler.extension.TextRulerLearnerDelegate;
import org.apache.uima.ruta.textruler.learner.whisk.token.WhiskRuleItem.MLWhiskOtherConstraint;
public class Whisk extends TextRulerBasicLearner {
/** Key under which {@link #setParameters(Map)} looks up the window size (spelling of the constant name is historical). */
public final static String WINDOSIZE_KEY = "windowSize";
/** Key under which {@link #setParameters(Map)} looks up the error threshold. */
public final static String ERROR_THRESHOLD_KEY = "errorThreshold";
/** Key under which {@link #setParameters(Map)} looks up the name of the POS tag root type. */
public final static String POSTAG_ROOTTYPE_KEY = "posTagRootType";
/** Default for {@link #windowSize}. */
public final static int STANDARD_WINDOWSIZE = 5;
/** Default for {@link #errorThreshold}. */
public final static float STANDARD_ERROR_THRESHOLD = 0.1f;
/** Default for {@link #posTagRootTypeName}. */
public final static String STANDARD_POSTAG_ROOTTYPE = "org.apache.uima.ml.ML.postag";
/** Rules accepted so far; created anew by {@code doRun()} and rendered by {@code getResultString()}. */
TextRulerRuleList ruleList;
/** Positive examples covered by the rules in {@link #ruleList}; such examples are not used as seeds. */
protected Set<TextRulerExample> coveredExamples;
/** Limits how far (in terms) from the slot a term may lie to be tried by {@code extendRule}. */
protected int windowSize = STANDARD_WINDOWSIZE;
/**
 * Laplacian bound: {@code doRun()} accepts a rule that still covers negatives only if its
 * Laplacian does not exceed this value.
 */
protected double errorThreshold = STANDARD_ERROR_THRESHOLD;
/**
 * Name of the type looked up in the CAS type system to find POS tag annotations; {@code null} or
 * an empty string disables the POS tag rule variants in {@code extendRule}.
 */
protected String posTagRootTypeName = STANDARD_POSTAG_ROOTTYPE;
/** Incremented by {@code doRun()} for every seed example; only used in status messages. */
int roundNumber = 0;
/** Number of positive examples of the current target, set by {@code doRun()}. */
int allExamplesCount = 0;
/** Test results of already evaluated rules, keyed by the rule string (see {@code testRulesIfNotCached}). */
private Map<String, TextRulerStatisticsCollector> cachedTestedRuleStatistics = new HashMap<String, TextRulerStatisticsCollector>();
/**
 * Creates the learner. All arguments are forwarded unchanged to the
 * {@link TextRulerBasicLearner} constructor; this class adds no initialization of its own.
 * NOTE(review): the meaning of the individual arguments is defined by the superclass, which is
 * not visible here - see its documentation.
 */
public Whisk(String inputDir, String prePropTmFile, String tmpDir, String[] slotNames,
Set<String> filterSet, boolean skip, TextRulerLearnerDelegate delegate) {
super(inputDir, prePropTmFile, tmpDir, slotNames, filterSet, skip, delegate);
}
/**
 * Always answers {@code false}.
 * NOTE(review): judging by the name, this tells the base learner not to collect the negatively
 * covered instances while testing rules - confirm against the superclass contract.
 */
@Override
public boolean collectNegativeCoveredInstancesWhenTesting() {
return false;
}
/**
 * Learns a rule set for the first slot name with a covering loop: every positive example that
 * is not yet covered by an accepted rule serves as seed for {@link #growRule}; the grown rule
 * is accepted if it covers no negatives or if its Laplacian does not exceed
 * {@link #errorThreshold}.
 * <p>
 * Differences to the original WHISK: the overall structure is not the same, because the process
 * is not repeated for newly arriving training documents at the user's request. Like the other
 * algorithms, learning uses the whole training set at once, so intermediate rule bases never
 * have to be re-tested on an "incoming" document - all rules were already tested on all
 * training documents.
 * <p>
 * This implementation is only tested for single-slot learning, even though it is built to be
 * capable of multi-slot examples: the seminar announcements are not well suited for the
 * multi-slot case (not all documents contain all four slots, some slots occur more than once
 * per document and their order varies).
 */
@Override
protected void doRun() {
  // reset all per-run state so that a repeated run does not continue with stale values
  cachedTestedRuleStatistics.clear();
  ruleList = new TextRulerRuleList();
  coveredExamples = new HashSet<TextRulerExample>();
  roundNumber = 0;
  try {
    sendStatusUpdateToDelegate("Creating examples...", TextRulerLearnerState.ML_RUNNING, false);
    // only a single-slot target for now
    TextRulerTarget target = new TextRulerTarget(slotNames[0], this);
    exampleDocuments.createExamplesForTarget(target);
    TextRulerExampleDocument[] docs = exampleDocuments.getSortedDocumentsInCacheOptimizedOrder();
    allExamplesCount = exampleDocuments.getAllPositiveExamples().size();

    // this is the inner loop of the WHISK pseudo-code:
    // for each inst in Training / for each tag
    for (TextRulerExampleDocument inst : docs) {
      List<TextRulerExample> tags = inst.getPositiveExamples();
      // for each uncovered example -> induce a new rule:
      for (TextRulerExample tag : tags) {
        if (coveredExamples.contains(tag)) {
          continue;
        }
        roundNumber++;
        WhiskRule newRule = growRule(inst, tag);
        if (shouldAbort()) {
          break;
        }
        if (newRule != null
                && (newRule.getCoveringStatistics().getCoveredNegativesCount() == 0
                        || newRule.getLaplacian() <= errorThreshold)) {
          ruleList.addRule(newRule);
          coveredExamples.addAll(newRule.getCoveringStatistics().getCoveredPositiveExamples());
          sendStatusUpdateToDelegate("New Rule added...", TextRulerLearnerState.ML_RUNNING, true);
        }
      }
      if (shouldAbort()) {
        return;
      }
    }
    sendStatusUpdateToDelegate("Done", TextRulerLearnerState.ML_DONE, true);
  } finally {
    // release the cached statistics on every exit path, including abort
    cachedTestedRuleStatistics.clear();
  }
}
/**
 * Grows one rule from the seed {@code example}: an empty rule with one slot pattern per
 * annotation of the example is anchored slot by slot via {@link #anchor} and then repeatedly
 * extended via {@link #extendRule} for as long as it still covers negative examples, an
 * extension exists and the Laplacian keeps decreasing.
 *
 * @param doc
 *          the document the seed example belongs to
 * @param example
 *          the seed example
 * @return the grown rule, or {@code null} if anchoring failed or the learner was aborted while
 *         anchoring
 */
protected WhiskRule growRule(TextRulerExampleDocument doc, TextRulerExample example) {
  final String separator = "----------------------------";

  sendStatusUpdateToDelegate("Creating new rule from seed...", TextRulerLearnerState.ML_RUNNING,
          false);
  WhiskRule current = new WhiskRule(this, example.getTarget(), example);
  final int slotCount = example.getAnnotations().length;
  for (int slot = 0; slot < slotCount; slot++) {
    current.getPatterns().add(new TextRulerSlotPattern());
  }
  final List<WhiskRuleItem> seedTerms = getAllTermsOfExample(example);

  sendStatusUpdateToDelegate("Creating new rule: anchoring...", TextRulerLearnerState.ML_RUNNING,
          false);
  for (int slot = 0; slot < slotCount; slot++) {
    current = anchor(current, doc, example, seedTerms, slot);
    if (shouldAbort()) {
      return null;
    }
  }

  sendStatusUpdateToDelegate("Creating new rule: extending...", TextRulerLearnerState.ML_RUNNING,
          false);
  if (current == null) {
    // anchoring produced nothing to extend
    return null;
  }

  double previousLaplacian = current.getLaplacian();
  // keep extending while the rule still makes errors
  for (int subRound = 0; current.getCoveringStatistics().getCoveredNegativesCount() > 0;) {
    final WhiskRule extended = extendRule(current, doc, example, seedTerms, subRound);
    if (extended == null) {
      // no extension found: the rule of the previous round stays the best one
      break;
    }
    current = extended;
    TextRulerToolkit.log(separator);
    TextRulerToolkit.log("BEST EXTENSION IS: " + current.getRuleString());
    TextRulerToolkit.log("Laplacian: " + current.getLaplacian() + " ; "
            + current.getCoveringStatistics());
    subRound++;
    final double extendedLaplacian = current.getLaplacian();
    if (extendedLaplacian >= previousLaplacian) {
      // no improvement any more -> stop
      break;
    }
    previousLaplacian = extendedLaplacian;
  }
  TextRulerToolkit.log(separator);
  TextRulerToolkit.log("FINAL RULE IS : " + current.getRuleString());
  return current;
}
/**
 * Tries to improve {@code rule} by adding one further term of the seed example.
 * <p>
 * Every term of {@code allTerms} that is not yet contained in the rule and lies at most
 * {@link #windowSize} terms before the first or after the last term of slot 0 yields up to
 * five candidate rules: the rule with the term added, a variant with the term's regular
 * expression hidden (only for regexp word constraints) and - if a POS tag root type is
 * configured, exists in the type system and a POS tag annotation with exactly the token's
 * bounds is found - both variants with an additional POS tag constraint plus one variant that
 * keeps only the POS tag constraint. All candidates are tested (using the statistics cache) and
 * the one with the lowest Laplacian wins; among tested candidates with equal Laplacian the one
 * with fewer total constraint points is preferred.
 * <p>
 * If {@code rule} already satisfies {@link #errorThreshold}, it is the initial best rule and is
 * only replaced by a candidate with a strictly lower Laplacian.
 *
 * @param rule
 *          the rule to extend, must not be {@code null}
 * @param doc
 *          the document of the seed example
 * @param example
 *          the seed example the rule was grown from
 * @param allTerms
 *          all terms of the seed example's document (see {@link #getAllTermsOfExample})
 * @param subRoundNumber
 *          number of the extension round, only used in the status message
 * @return the best rule found, or {@code null} if no rule qualified or the learner was aborted
 */
protected WhiskRule extendRule(WhiskRule rule, TextRulerExampleDocument doc,
        TextRulerExample example, List<WhiskRuleItem> allTerms, int subRoundNumber) {
  WhiskRule bestRule = null;
  double bestL = 1.0;
  int bestRuleConstraintPoints = -1;
  if (rule.getLaplacian() <= errorThreshold) {
    bestRule = rule;
    bestL = rule.getLaplacian();
  }

  // for now this works only for slot 0 (no multislot stuff here yet!)
  List<WhiskRuleItem> slotTerms = getTermsWithinBounds(allTerms,
          example.getAnnotations()[0].getBegin(), example.getAnnotations()[0].getEnd());
  if (slotTerms.isEmpty()) {
    // without slot terms there is no window to pick extension terms from
    return bestRule;
  }
  int firstSlotTermNumber = slotTerms.get(0).getTermNumberInExample();
  int lastSlotTermNumber = slotTerms.get(slotTerms.size() - 1).getTermNumberInExample();

  // The POS tag type is the same for every term, so resolve it once. getType() answers null if
  // the type system does not declare the configured type; POS variants are skipped then.
  CAS cas = null;
  Type posTagsRootType = null;
  if (posTagRootTypeName != null && posTagRootTypeName.length() > 0) {
    cas = example.getDocumentCAS();
    posTagsRootType = cas.getTypeSystem().getType(posTagRootTypeName);
  }

  List<TextRulerRule> rulesToTest = new ArrayList<TextRulerRule>();
  for (WhiskRuleItem term : allTerms) {
    if (rule.containsTerm(term)) {
      continue;
    }
    int termNumber = term.getTermNumberInExample();
    boolean rejectTerm = false;
    if (termNumber < firstSlotTermNumber) {
      rejectTerm = firstSlotTermNumber - termNumber > windowSize;
    } else if (termNumber > lastSlotTermNumber) {
      // distance to the END of the slot, so that the window is symmetric around the slot
      rejectTerm = termNumber - lastSlotTermNumber > windowSize;
    }
    if (rejectTerm) {
      // out of window scope -> skip to next...
      continue;
    }

    WhiskRule proposedRule = createNewRuleByAddingTerm(rule, term);
    if (proposedRule == null) {
      // the term could not be added or did not change the rule
      continue;
    }
    WhiskRuleItem t = proposedRule.searchItemWithTermNumber(termNumber);
    addCandidateIfAbsent(rulesToTest, proposedRule);

    // add a second version where we remove the exact token content if it is a regexp item:
    WhiskRule proposedRule2 = null;
    if (t.getWordConstraint().isRegExpConstraint()) {
      proposedRule2 = proposedRule.copy();
      WhiskRuleItem t2 = proposedRule2.searchItemWithTermNumber(termNumber);
      t2.setHideRegExp(true);
      proposedRule2.setNeedsCompile(true);
      addCandidateIfAbsent(rulesToTest, proposedRule2);
    }

    // and now, for WHISK performance testing purposes, we also add POS tags. This is not
    // dynamic feature capable, it only exists to test WHISK with POS tag terms.
    if (posTagsRootType != null) {
      addPosTagCandidates(rulesToTest, proposedRule, proposedRule2, term, doc, cas,
              posTagsRootType);
    }
  }
  if (rulesToTest.isEmpty()) {
    return bestRule;
  }

  int uncoveredCount = allExamplesCount - coveredExamples.size();
  sendStatusUpdateToDelegate("Round " + roundNumber + "." + subRoundNumber + " - Testing "
          + rulesToTest.size() + " rules... " + " - uncovered examples: " + uncoveredCount
          + " / " + allExamplesCount + " ; cs=" + cachedTestedRuleStatistics.size(),
          TextRulerLearnerState.ML_RUNNING, false);
  TextRulerToolkit.log("Testing " + rulesToTest.size() + " rules on training set...");
  for (TextRulerRule r : rulesToTest) {
    TextRulerToolkit.log(r.getRuleString());
  }
  testRulesIfNotCached(rulesToTest);
  if (shouldAbort()) {
    return null;
  }

  for (TextRulerRule r : rulesToTest) {
    WhiskRule wr = (WhiskRule) r;
    if (wr.getLaplacian() < bestL) {
      bestL = wr.getLaplacian();
      bestRule = wr;
      bestRuleConstraintPoints = bestRule.totalConstraintPoints();
    } else if (wr.getLaplacian() == bestL && bestRuleConstraintPoints >= 0) {
      // bestRuleConstraintPoints >= 0 means the current best is a tested candidate, not the
      // unextended input rule
      TextRulerToolkit.log("Same Laplacian! So prefer more general rule!");
      if (wr.totalConstraintPoints() < bestRuleConstraintPoints) {
        TextRulerToolkit.log("\tYes, prefered!");
        bestL = wr.getLaplacian();
        bestRule = wr;
        bestRuleConstraintPoints = bestRule.totalConstraintPoints();
      }
    }
  }
  return bestRule;
}

/** Appends {@code candidate} to {@code rulesToTest} unless an equal rule is already queued. */
private static void addCandidateIfAbsent(List<TextRulerRule> rulesToTest, WhiskRule candidate) {
  if (!rulesToTest.contains(candidate)) {
    rulesToTest.add(candidate);
  }
}

/**
 * Queues the POS tag variants of a proposed rule, provided the first annotation of
 * {@code posTagsRootType} found within the token bounds of {@code term} covers exactly that
 * token. {@code proposedRule2} is the variant with hidden regexp and may be {@code null}.
 */
private void addPosTagCandidates(List<TextRulerRule> rulesToTest, WhiskRule proposedRule,
        WhiskRule proposedRule2, WhiskRuleItem term, TextRulerExampleDocument doc, CAS cas,
        Type posTagsRootType) {
  TextRulerAnnotation tokenAnnotation = term.getWordConstraint().getTokenAnnotation();
  // POS tags created by our test hmm tagger.
  List<AnnotationFS> posTagAnnotations = TextRulerToolkit.getAnnotationsWithinBounds(cas,
          tokenAnnotation.getBegin(), tokenAnnotation.getEnd(), null, posTagsRootType);
  if (posTagAnnotations.isEmpty()) {
    return;
  }
  AnnotationFS posTag = posTagAnnotations.get(0);
  if (posTag.getBegin() != tokenAnnotation.getBegin()
          || posTag.getEnd() != tokenAnnotation.getEnd()) {
    return;
  }
  TextRulerAnnotation posTagAnnotation = new TextRulerAnnotation(posTag, doc);
  int termNumber = term.getTermNumberInExample();

  // 1. most specific term with all constraints we have:
  WhiskRule proposedRule3 = proposedRule.copy();
  WhiskRuleItem t3 = proposedRule3.searchItemWithTermNumber(termNumber);
  t3.addOtherConstraint(new MLWhiskOtherConstraint(tokenAnnotation, posTagAnnotation));
  proposedRule3.setNeedsCompile(true);
  addCandidateIfAbsent(rulesToTest, proposedRule3);

  // 2. the same without the regexp:
  if (proposedRule2 != null) {
    WhiskRule proposedRule4 = proposedRule2.copy();
    WhiskRuleItem t4 = proposedRule4.searchItemWithTermNumber(termNumber);
    t4.addOtherConstraint(new MLWhiskOtherConstraint(tokenAnnotation, posTagAnnotation));
    proposedRule4.setNeedsCompile(true);
    addCandidateIfAbsent(rulesToTest, proposedRule4);
  }

  // 3. last but not least: a rule with only the POS tag constraint:
  WhiskRule proposedRule5 = proposedRule.copy();
  WhiskRuleItem t5 = proposedRule5.searchItemWithTermNumber(termNumber);
  t5.addOtherConstraint(new MLWhiskOtherConstraint(tokenAnnotation, posTagAnnotation));
  t5.setWordConstraint(null);
  proposedRule5.setNeedsCompile(true);
  addCandidateIfAbsent(rulesToTest, proposedRule5);
}
/**
 * Builds a copy of {@code baseRule} that additionally contains {@code term}.
 * <p>
 * The method first picks the pattern (pre-filler, filler or post-filler of a slot) the term
 * belongs to by comparing its term number with the first/last items of the existing patterns.
 * The term (as a copy) then either replaces a wildcard item with the same term number or is
 * inserted in term-number order. Finally, a wildcard item is inserted between the new term and
 * a neighbor that is neither adjacent by term number nor a wildcard itself.
 * {@code baseRule} itself is not modified by this method; all changes go to the copy.
 *
 * @param baseRule
 *          the rule to start from
 * @param term
 *          the term to add
 * @return the extended copy, or {@code null} if the rule already contains a non-wildcard item
 *         with the same term number, if the wildcard to replace is not part of the chosen
 *         pattern, or if the rule string of the result equals that of {@code baseRule}
 */
protected WhiskRule createNewRuleByAddingTerm(WhiskRule baseRule, WhiskRuleItem term) {
WhiskRule newRule = baseRule.copy();
// foundSlotNumber / foundSlotPattern are debug info only; they are read solely by the
// commented-out log statements further down.
int foundSlotNumber = -1;
String foundSlotPattern = "";
int termNumber = term.getTermNumberInExample();
// determine where this term is located relative to the slots we have...
// NOTE(review): the loop does not stop once a target pattern is found and the pre-filler check
// below is not guarded by targetPattern == null, so with several slots a later slot may
// overwrite an earlier choice - confirm whether that is intended for multi-slot rules.
TextRulerRulePattern targetPattern = null;
TextRulerRulePattern previousSlotPostFillerPattern = null;
for (int i = 0; i < newRule.getPatterns().size(); i++) {
TextRulerSlotPattern slotPattern = newRule.getPatterns().get(i);
// look at the pre-filler pattern
WhiskRuleItem it = (WhiskRuleItem) slotPattern.preFillerPattern.lastItem();
if (it != null && termNumber <= it.getTermNumberInExample())
targetPattern = slotPattern.preFillerPattern;
// now look at the filler pattern
if (targetPattern == null && slotPattern.fillerPattern.size() > 0)
{
it = (WhiskRuleItem) slotPattern.fillerPattern.firstItem();
// a term before the first filler item still belongs to the pre-filler pattern, which seems
// to be empty so that we could not find that out above
if (termNumber < it.getTermNumberInExample())
targetPattern = slotPattern.preFillerPattern;
else {
it = (WhiskRuleItem) slotPattern.fillerPattern.lastItem();
if (termNumber <= it.getTermNumberInExample()) {
targetPattern = slotPattern.fillerPattern;
}
}
}
// now look at the post-filler pattern
if (targetPattern == null && slotPattern.postFillerPattern.size() > 0)
{
it = (WhiskRuleItem) slotPattern.postFillerPattern.firstItem();
// a term before the first post-filler item still belongs to the filler pattern, which seems
// to be empty so that we could not find that out above
if (termNumber < it.getTermNumberInExample())
targetPattern = slotPattern.fillerPattern;
else {
it = (WhiskRuleItem) slotPattern.postFillerPattern.lastItem();
if (termNumber <= it.getTermNumberInExample())
targetPattern = slotPattern.postFillerPattern;
}
}
if (targetPattern == null) {
// nothing matched in this slot: fall back to the post-filler pattern of the previous slot
// (still null while looking at the first slot)
targetPattern = previousSlotPostFillerPattern;
if (i > 0) {
TextRulerSlotPattern prevSlotPattern = newRule.getPatterns().get(i - 1);
foundSlotPattern = targetPattern == prevSlotPattern.preFillerPattern ? "PRE FILLER"
: (targetPattern == prevSlotPattern.fillerPattern ? "FILLER" : "POST FILLER");
foundSlotNumber = i - 1;
}
} else {
foundSlotPattern = targetPattern == slotPattern.preFillerPattern ? "PRE FILLER"
: (targetPattern == slotPattern.fillerPattern ? "FILLER" : "POST FILLER");
foundSlotNumber = i;
}
previousSlotPostFillerPattern = slotPattern.postFillerPattern;
}
// still nothing found: the term goes into the post-filler pattern of the last slot
if (targetPattern == null) {
targetPattern = previousSlotPostFillerPattern;
foundSlotNumber = newRule.getPatterns().size() - 1;
foundSlotPattern = "POST FILLER";
}
if (targetPattern == null) {
// only possible if the rule has no slot patterns at all; newRule is then an unchanged copy
// and the rule string comparison at the end makes this method return null
TextRulerToolkit.log("ERROR, NO TARGET PATTERN FOR NEW RULE TERM FOUND !");
} else {
// TextRulerToolkit.log("Ok, found for Rule: "+newRule.getRuleString());
// TextRulerToolkit.log("Term: "+term.getTermNumberInExample()+" ; "+term);
// TextRulerToolkit.log("Slot "+foundSlotNumber+" - Pattern: "+foundSlotPattern);
// now put that term into the rule:
int indexInPattern = -1;
if (targetPattern.size() == 0) {
targetPattern.add(term.copy());
indexInPattern = 0;
} else {
// 1. search if the term would replace a wildcard:
WhiskRuleItem wildCard = newRule.searchItemWithTermNumber(termNumber);
if (wildCard != null) {
if (!wildCard.isStarWildCard()) {
TextRulerToolkit
.log("ERROR, FOUND A TERM WITH THE SAME NUMBER THAT IS NOT A WILDCARD! HOW IS THAT???");
return null;
}
if (!targetPattern.contains(wildCard)) {
TextRulerToolkit.log("EVEN WORSE, THAT MUST NOT BE AT ALL!");
return null;
}
indexInPattern = targetPattern.indexOf(wildCard);
targetPattern.set(indexInPattern, term.copy());
} else {
// 2. no wildcard to replace, so search for the insertion point that keeps the items ordered
// by term number:
for (int i = 0; i < targetPattern.size(); i++) {
WhiskRuleItem it = (WhiskRuleItem) targetPattern.get(i);
if (termNumber < it.getTermNumberInExample()) {
indexInPattern = i;
break;
}
}
if (indexInPattern < 0) {
// larger than every existing item -> append
indexInPattern = targetPattern.size();
targetPattern.add(term.copy());
} else
targetPattern.add(indexInPattern, term.copy());
}
}
// ok, now we have replaced a wildcard with the term or added the term between two other
// items. We now have to check the neighbors of the new term: if it is a direct neighbor
// (according to the termNumber), we have nothing special to do. But if it is not a direct
// neighbor, we have to add a wildcard between the two items (if the neighbor item is not a
// wildcard itself)!
WhiskRuleItem newTerm = (WhiskRuleItem) targetPattern.get(indexInPattern);
// look at left neighbor:
WhiskRuleItem left = newRule.searchNeighborOfItem(newTerm, true);
if (left != null) {
// TextRulerToolkit.log("LEFT NEIGHBOR FOUND!");
// so we have a left neighbor. let's see if it also is the
// neighbor in our seed token stream:
if (left.getTermNumberInExample() < newTerm.getTermNumberInExample() - 1
&& !left.isStarWildCard()) {
// no direct neighbor and no wildcard yet, so insert a wildcard between us; the new term
// thereby moves one position to the right
targetPattern.add(indexInPattern,
WhiskRuleItem.newWildCardItem(left.getTermNumberInExample() + 1));
indexInPattern++;
}
}
// look at right neighbor:
WhiskRuleItem right = newRule.searchNeighborOfItem(newTerm, false);
if (right != null) {
// TextRulerToolkit.log("RIGHT NEIGHBOR FOUND!");
// so we have a right neighbor. let's see if it also is the
// neighbor in our seed token stream:
if (right.getTermNumberInExample() > newTerm.getTermNumberInExample() + 1
&& !right.isStarWildCard()) {
// no direct neighbor and no wildcard yet, so insert a wildcard between us!
WhiskRuleItem wc = WhiskRuleItem.newWildCardItem(newTerm.getTermNumberInExample() + 1);
if (indexInPattern + 1 < targetPattern.size())
targetPattern.add(indexInPattern + 1, wc);
else
targetPattern.add(wc);
}
}
newRule.setNeedsCompile(true);
// TextRulerToolkit.log("BEFORE: "+baseRule.getRuleString());
// TextRulerToolkit.log("AFTER : "+newRule.getRuleString());
// TextRulerToolkit.log("");
}
// a "new" rule that reads exactly like the base rule must not be returned
if (newRule.getRuleString().equals(baseRule.getRuleString()))
return null;
else
return newRule;
}
/**
 * Anchors slot {@code slotIndex} of {@code rule} on the seed example. Two alternatives are
 * built from copies of the rule and tested:
 * <ul>
 * <li>base 1 (slot filler rule): the filler pattern holds the first and the last term of the
 * slot, with a wildcard in between if the slot has more than two terms;</li>
 * <li>base 2 (slot context rule): the filler pattern is a single wildcard, framed by the term
 * directly before and the term directly after the slot, as far as they exist.</li>
 * </ul>
 * Base 2 is chosen only if it covers strictly more positive examples than base 1.
 *
 * @param rule
 *          the rule anchored so far, may be {@code null}
 * @param doc
 *          the document of the seed example (not used by this method)
 * @param example
 *          the seed example
 * @param allTerms
 *          all terms of the seed example's document
 * @param slotIndex
 *          index of the slot to anchor
 * @return the chosen alternative, or {@code null} if {@code rule} is {@code null}, the slot
 *         contains no term or the learner was aborted while testing
 */
protected WhiskRule anchor(WhiskRule rule, TextRulerExampleDocument doc,
        TextRulerExample example, List<WhiskRuleItem> allTerms, int slotIndex) {
  final TextRulerAnnotation slot = example.getAnnotations()[slotIndex];
  final List<WhiskRuleItem> slotTerms = getTermsWithinBounds(allTerms, slot.getBegin(),
          slot.getEnd());
  if (rule == null || slotTerms.isEmpty()) {
    return null;
  }
  final int lastIndex = slotTerms.size() - 1;
  final WhiskRuleItem firstSlotTerm = slotTerms.get(0);
  final WhiskRuleItem lastSlotTerm = slotTerms.get(lastIndex);

  // base 1: the slot filler rule
  final WhiskRule fillerRule = rule.copy();
  final TextRulerSlotPattern fillerRulePattern = fillerRule.getPatterns().get(slotIndex);
  fillerRulePattern.fillerPattern.add(firstSlotTerm.copy());
  if (lastIndex >= 2) {
    // more than two terms: everything between first and last collapses into one wildcard
    fillerRulePattern.fillerPattern.add(WhiskRuleItem.newWildCardItem(slotTerms.get(1)
            .getTermNumberInExample()));
  }
  if (lastIndex >= 1) {
    fillerRulePattern.fillerPattern.add(lastSlotTerm.copy());
  }

  // base 2: the slot context rule
  final WhiskRule contextRule = rule.copy();
  final TextRulerSlotPattern contextRulePattern = contextRule.getPatterns().get(slotIndex);
  final int firstPosition = allTerms.indexOf(firstSlotTerm);
  final int lastPosition = allTerms.indexOf(lastSlotTerm);
  if (firstPosition > 0) {
    contextRulePattern.preFillerPattern.add(allTerms.get(firstPosition - 1));
  }
  contextRulePattern.fillerPattern.add(WhiskRuleItem.newWildCardItem(firstSlotTerm
          .getTermNumberInExample()));
  if (lastPosition + 1 < allTerms.size()) {
    contextRulePattern.postFillerPattern.add(allTerms.get(lastPosition + 1));
  }

  TextRulerToolkit.log("base1: " + fillerRule.getRuleString());
  TextRulerToolkit.log("base2: " + contextRule.getRuleString());

  final List<TextRulerRule> candidates = new ArrayList<TextRulerRule>();
  candidates.add(fillerRule);
  candidates.add(contextRule);
  testRulesIfNotCached(candidates);
  if (shouldAbort()) {
    return null;
  }

  TextRulerToolkit.log("\tbase1: " + fillerRule.getCoveringStatistics() + " --> laplacian = "
          + fillerRule.getLaplacian());
  TextRulerToolkit.log("\tbase2: " + contextRule.getCoveringStatistics() + " --> laplacian = "
          + contextRule.getLaplacian());

  final boolean contextCoversMore = contextRule.getCoveringStatistics()
          .getCoveredPositivesCount() > fillerRule.getCoveringStatistics()
          .getCoveredPositivesCount();
  return contextCoversMore ? contextRule : fillerRule;
}
/**
 * Renders the learned rules as text.
 *
 * @return the file header followed by the rules of the rule list, or a placeholder message as
 *         long as no rule list exists
 */
public String getResultString() {
  return ruleList == null ? "No results available yet!"
          : getFileHeaderString(true) + ruleList.getRulesString("");
}
/**
 * Applies the learner parameters contained in {@code params}. Recognized keys are
 * {@link #WINDOSIZE_KEY}, {@link #ERROR_THRESHOLD_KEY} and {@link #POSTAG_ROOTTYPE_KEY}; keys
 * that are absent leave the corresponding setting untouched.
 * <p>
 * Numeric parameters accept any {@link Number} (for example an {@code Integer} window size or a
 * {@code Float}/{@code Double} threshold). A value of an unusable type is logged and ignored
 * instead of aborting with a {@code ClassCastException} or {@code NullPointerException}.
 *
 * @param params
 *          the parameter map, must not be {@code null}
 */
public void setParameters(Map<String, Object> params) {
  if (TextRulerToolkit.DEBUG) {
    saveParametersToTempFolder(params);
  }
  if (params.containsKey(WINDOSIZE_KEY)) {
    Object value = params.get(WINDOSIZE_KEY);
    if (value instanceof Number) {
      windowSize = ((Number) value).intValue();
    } else {
      TextRulerToolkit.log("Ignoring invalid value for parameter " + WINDOSIZE_KEY + ": "
              + value);
    }
  }
  if (params.containsKey(ERROR_THRESHOLD_KEY)) {
    Object value = params.get(ERROR_THRESHOLD_KEY);
    if (value instanceof Number) {
      // for a Float this is the same widening conversion the former cast performed
      errorThreshold = ((Number) value).doubleValue();
    } else {
      TextRulerToolkit.log("Ignoring invalid value for parameter " + ERROR_THRESHOLD_KEY + ": "
              + value);
    }
  }
  if (params.containsKey(POSTAG_ROOTTYPE_KEY)) {
    Object value = params.get(POSTAG_ROOTTYPE_KEY);
    // null stays allowed: it switches the POS tag rule variants off
    if (value == null || value instanceof String) {
      posTagRootTypeName = (String) value;
    } else {
      TextRulerToolkit.log("Ignoring invalid value for parameter " + POSTAG_ROOTTYPE_KEY + ": "
              + value);
    }
  }
}
/**
 * Creates one rule item per annotation of type {@code TextRulerToolkit.RUTA_ANY_TYPE_NAME} in
 * the example's document CAS, as delivered by
 * {@code TextRulerToolkit.getAnnotationsWithinBounds} for the whole document text and the
 * filter set extended by the slot names. The items are numbered consecutively from 0 in the
 * order in which they are delivered.
 *
 * @param example
 *          the example whose document is to be tokenized into terms
 * @return the numbered terms of the document
 */
public List<WhiskRuleItem> getAllTermsOfExample(TextRulerExample example) {
  final CAS documentCas = example.getDocumentCAS();
  final Type anyTokenType = documentCas.getTypeSystem().getType(
          TextRulerToolkit.RUTA_ANY_TYPE_NAME);
  final int upperBound = documentCas.getDocumentText().length() + 1;
  final List<AnnotationFS> tokens = TextRulerToolkit.getAnnotationsWithinBounds(documentCas, 0,
          upperBound, TextRulerToolkit.getFilterSetWithSlotNames(slotNames, filterSet),
          anyTokenType);

  final List<WhiskRuleItem> terms = new ArrayList<WhiskRuleItem>();
  for (AnnotationFS token : tokens) {
    final WhiskRuleItem item = new WhiskRuleItem(new TextRulerAnnotation(token,
            example.getDocument()));
    // the position in the result list is the term number
    item.setTermNumberInExample(terms.size());
    terms.add(item);
  }
  return terms;
}
/**
 * Selects the terms whose token annotation lies completely within {@code [startPos, endPos]}.
 * The scan stops at the first term whose token ends behind {@code endPos}.
 *
 * @param allTerms
 *          the terms to select from
 * @param startPos
 *          smallest accepted begin offset
 * @param endPos
 *          largest accepted end offset
 * @return the selected terms in their original order, possibly empty
 */
public List<WhiskRuleItem> getTermsWithinBounds(List<WhiskRuleItem> allTerms, int startPos,
        int endPos) {
  final List<WhiskRuleItem> selected = new ArrayList<WhiskRuleItem>();
  for (WhiskRuleItem candidate : allTerms) {
    final TextRulerAnnotation token = candidate.getWordConstraint().getTokenAnnotation();
    if (token.getEnd() > endPos) {
      // this token reaches beyond the bounds: it is not selected and the scan ends
      break;
    }
    if (token.getBegin() >= startPos) {
      selected.add(candidate);
    }
  }
  return selected;
}
/**
 * Provides covering statistics for {@code rules}: a rule whose rule string was tested before
 * receives a copy of the cached statistics, all other rules are tested on the example
 * documents and copies of their statistics are stored in the cache afterwards. If the learner
 * is aborted during testing, nothing is added to the cache.
 * <p>
 * TODO share this between algorithms (e.g. LP2 and RAPIER, WHISK ?) and make a maximum size of
 * the cache, etc. like CasCache?
 *
 * @param rules
 *          the rules that need covering statistics
 */
protected void testRulesIfNotCached(List<TextRulerRule> rules) {
  final List<TextRulerRule> untested = new ArrayList<TextRulerRule>();
  for (TextRulerRule rule : rules) {
    final String ruleString = rule.getRuleString();
    if (!cachedTestedRuleStatistics.containsKey(ruleString)) {
      untested.add(rule);
      continue;
    }
    rule.setCoveringStatistics(cachedTestedRuleStatistics.get(ruleString).copy());
    TextRulerToolkit.log("CACHE HIT !");
  }
  if (untested.isEmpty()) {
    return;
  }

  testRulesOnDocumentSet(untested, exampleDocuments);
  if (shouldAbort()) {
    return;
  }
  for (TextRulerRule rule : untested) {
    cachedTestedRuleStatistics.put(rule.getRuleString(), rule.getCoveringStatistics().copy());
  }
}
}