trunk/ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/whisk/token/Whisk.java - uima-ruta - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  * http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */

 package org.apache.uima.ruta.textruler.learner.whisk.token;

 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;

 import org.apache.uima.cas.CAS;
 import org.apache.uima.cas.Type;
 import org.apache.uima.cas.TypeSystem;
 import org.apache.uima.cas.text.AnnotationFS;
 import org.apache.uima.ruta.textruler.core.TextRulerAnnotation;
 import org.apache.uima.ruta.textruler.core.TextRulerBasicLearner;
 import org.apache.uima.ruta.textruler.core.TextRulerExample;
 import org.apache.uima.ruta.textruler.core.TextRulerExampleDocument;
 import org.apache.uima.ruta.textruler.core.TextRulerRule;
 import org.apache.uima.ruta.textruler.core.TextRulerRuleList;
 import org.apache.uima.ruta.textruler.core.TextRulerRulePattern;
 import org.apache.uima.ruta.textruler.core.TextRulerSlotPattern;
 import org.apache.uima.ruta.textruler.core.TextRulerStatisticsCollector;
 import org.apache.uima.ruta.textruler.core.TextRulerTarget;
 import org.apache.uima.ruta.textruler.core.TextRulerToolkit;
 import org.apache.uima.ruta.textruler.extension.TextRulerLearnerDelegate;
 import org.apache.uima.ruta.textruler.learner.whisk.token.WhiskRuleItem.MLWhiskOtherConstraint;

 public class Whisk extends TextRulerBasicLearner {

   public final static String WINDOSIZE_KEY = "windowSize";

   public final static String ERROR_THRESHOLD_KEY = "errorThreshold";

   public final static String POSTAG_ROOTTYPE_KEY = "posTagRootType";

   public final static int STANDARD_WINDOWSIZE = 5;

   public final static float STANDARD_ERROR_THRESHOLD = 0.1f;

   public final static String STANDARD_POSTAG_ROOTTYPE = "org.apache.uima.ml.ML.postag";

   TextRulerRuleList ruleList;

   protected Set<TextRulerExample> coveredExamples;

   protected int windowSize = STANDARD_WINDOWSIZE;

   protected double errorThreshold = STANDARD_ERROR_THRESHOLD;

   protected String posTagRootTypeName = STANDARD_POSTAG_ROOTTYPE;

   int roundNumber = 0;

   int allExamplesCount = 0;

   private Map<String, TextRulerStatisticsCollector> cachedTestedRuleStatistics = new HashMap<String, TextRulerStatisticsCollector>();

   public Whisk(String inputDir, String prePropTmFile, String tmpDir, String[] slotNames,
           Set<String> filterSet, boolean skip, TextRulerLearnerDelegate delegate) {
     super(inputDir, prePropTmFile, tmpDir, slotNames, filterSet, skip, delegate);
   }

   @Override
   public boolean collectNegativeCoveredInstancesWhenTesting() {
     return false;
   }

   @Override
   protected void doRun() {

     // we don't use the same overall structure like the original WHISK since
     // we do not
     // repeat the whole process for some new training documents at the
     // user's request, we
     // learn like the other algorithms from the whole training set, so we
     // for example do not
     // need to test the intermediate rule base on a newly "incoming"
     // training document since we
     // tested all rules already on all training documents !

     // this version of whisk is not tested for mutli slot learning since the
     // seminar announcements
     // are not quite suitable for this task: they do not all contain all 4
     // slots and some of them
     // occur more than once in one document ! And the order of them is not
     // always the same as well!
     // so this is now made only tested for the single slot case even if it
     // is built capable of multislot
     // examples!

     // this is the inner loop of the WHISK pseudo-code:
     // For each inst in Training
     // for each tag

     cachedTestedRuleStatistics.clear();
     ruleList = new TextRulerRuleList();
     coveredExamples = new HashSet<TextRulerExample>();

     sendStatusUpdateToDelegate("Creating examples...", TextRulerLearnerState.ML_RUNNING, false);
     TextRulerTarget target = new TextRulerTarget(slotNames[0], this); // only
     // single-slot-target
     // for now
     exampleDocuments.createExamplesForTarget(target);

     TextRulerExampleDocument[] docs = exampleDocuments.getSortedDocumentsInCacheOptimizedOrder();

     allExamplesCount = exampleDocuments.getAllPositiveExamples().size();

     for (TextRulerExampleDocument inst : docs) {
       List<TextRulerExample> tags = inst.getPositiveExamples();

       // for each uncovered example -> induce a new rule:
       for (TextRulerExample tag : tags) {
         if (!coveredExamples.contains(tag)) {
           roundNumber++;
           WhiskRule newRule = growRule(inst, tag);
           if (shouldAbort())
             break;
           // if (newRule == null)
           // break;
           // else
           if (newRule != null
                   && (newRule.getCoveringStatistics().getCoveredNegativesCount() == 00 || newRule
                           .getLaplacian() <= errorThreshold)) {
             ruleList.addRule(newRule);
             coveredExamples.addAll(newRule.getCoveringStatistics().getCoveredPositiveExamples());
             sendStatusUpdateToDelegate("New Rule added...", TextRulerLearnerState.ML_RUNNING, true);
           }
         }
       }
       if (shouldAbort())
         return;
     }
     sendStatusUpdateToDelegate("Done", TextRulerLearnerState.ML_DONE, true);
     cachedTestedRuleStatistics.clear();
   }

   protected WhiskRule growRule(TextRulerExampleDocument doc, TextRulerExample example) {
     sendStatusUpdateToDelegate("Creating new rule from seed...", TextRulerLearnerState.ML_RUNNING,
             false);
     WhiskRule theRule = new WhiskRule(this, example.getTarget(), example);
     int numberOfSlotsInTag = example.getAnnotations().length;
     for (int i = 0; i < numberOfSlotsInTag; i++)
       theRule.getPatterns().add(new TextRulerSlotPattern());

     List<WhiskRuleItem> allTerms = getAllTermsOfExample(example);

     sendStatusUpdateToDelegate("Creating new rule: anchoring...", TextRulerLearnerState.ML_RUNNING,
             false);
     for (int i = 0; i < numberOfSlotsInTag; i++) {
       theRule = anchor(theRule, doc, example, allTerms, i);
       if (shouldAbort())
         return null;
     }

     sendStatusUpdateToDelegate("Creating new rule: extending...", TextRulerLearnerState.ML_RUNNING,
             false);
     if (theRule != null) {
       double oldLaplacian = theRule.getLaplacian();
       int subRoundNumber = 0;
       // repeat while we still make errors...
       while (theRule.getCoveringStatistics().getCoveredNegativesCount() > 0) {
         WhiskRule extendedRule = extendRule(theRule, doc, example, allTerms, subRoundNumber);
         if (extendedRule == null) {
           // this way we get the previous rule
           // as the best rule...
           break;
         }
         theRule = extendedRule;
         TextRulerToolkit.log("----------------------------");
         TextRulerToolkit.log("BEST EXTENSION IS: " + theRule.getRuleString());
         TextRulerToolkit.log("Laplacian: " + theRule.getLaplacian() + "    ; "
                 + theRule.getCoveringStatistics());
         subRoundNumber++;

         double newLaplacian = theRule.getLaplacian();
         if (newLaplacian >= oldLaplacian) {
           break;
         }
         oldLaplacian = newLaplacian;
       }
       TextRulerToolkit.log("----------------------------");
       TextRulerToolkit.log("FINAL RULE IS : " + theRule.getRuleString());
     }
     return theRule;
   }

   protected WhiskRule extendRule(WhiskRule rule, TextRulerExampleDocument doc,
           TextRulerExample example, List<WhiskRuleItem> allTerms, int subRoundNumber) {
     WhiskRule bestRule = null;
     double bestL = 1.0;
     int bestRuleConstraintPoints = -1;
     if (rule.getLaplacian() <= errorThreshold) {
       bestRule = rule;
       bestL = rule.getLaplacian();
     }

     List<WhiskRuleItem> slotTerms = getTermsWithinBounds(allTerms,
             example.getAnnotations()[0].getBegin(), example.getAnnotations()[0].getEnd());
     WhiskRuleItem firstSlotTerm = slotTerms.get(0);
     WhiskRuleItem lastSlotTerm = slotTerms.get(slotTerms.size() - 1);

     List<TextRulerRule> rulesToTest = new ArrayList<TextRulerRule>();
     for (WhiskRuleItem term : allTerms) {
       if (rule.containsTerm(term)) {
         continue;
       }

       boolean rejectTerm = false;
       // for now this works only for slot 0 (no multislot stuff here yet!)
       if (term.getTermNumberInExample() < firstSlotTerm.getTermNumberInExample())
         rejectTerm = firstSlotTerm.getTermNumberInExample() - term.getTermNumberInExample() > windowSize;
       else if (term.getTermNumberInExample() > lastSlotTerm.getTermNumberInExample())
         rejectTerm = term.getTermNumberInExample() - firstSlotTerm.getTermNumberInExample() > windowSize;

       if (rejectTerm) {
         // out of window scope -> skip to next...
         continue;
       }

       WhiskRule proposedRule = createNewRuleByAddingTerm(rule, term);
       WhiskRuleItem t = proposedRule.searchItemWithTermNumber(term.getTermNumberInExample());

       if (!rulesToTest.contains(proposedRule))
         rulesToTest.add(proposedRule);

       // add a second version where we remove the exact token content if
       // it is a regexp item:
       WhiskRule proposedRule2 = null;
       WhiskRuleItem t2 = null;
       if (t.getWordConstraint().isRegExpConstraint()) {
         proposedRule2 = proposedRule.copy();
         t2 = proposedRule2.searchItemWithTermNumber(term.getTermNumberInExample());
         t2.setHideRegExp(true);
         proposedRule2.setNeedsCompile(true);
         if (!rulesToTest.contains(proposedRule2)) {
           rulesToTest.add(proposedRule2);
         }
       }

       // and now, for WHISK performance testing purposes, we also add POS
       // tags:
       // this is not very nice code and not dynamic feature capable, but
       // for testpurposes
       // in order to test WHISK with PosTag Terms...
       if (posTagRootTypeName != null && posTagRootTypeName.length() > 0) {
         TextRulerAnnotation tokenAnnotation = term.getWordConstraint().getTokenAnnotation();
         CAS cas = example.getDocumentCAS();
         TypeSystem ts = cas.getTypeSystem();
         Type posTagsRootType = ts.getType(posTagRootTypeName);
         if (ts != null) {
           // POS-Tags created by our test hmm tagger.
           List<AnnotationFS> posTagAnnotations = TextRulerToolkit.getAnnotationsWithinBounds(cas,
                   tokenAnnotation.getBegin(), tokenAnnotation.getEnd(), null, posTagsRootType);
           if (posTagAnnotations.size() > 0) {
             AnnotationFS posTag = posTagAnnotations.get(0);
             if (posTag.getBegin() == tokenAnnotation.getBegin()
                     && posTag.getEnd() == tokenAnnotation.getEnd()) {
               TextRulerAnnotation posTagAnnotation = new TextRulerAnnotation(posTag, doc);

               // 1. most specific term with all constraints we
               // have:
               WhiskRule proposedRule3 = proposedRule.copy();
               WhiskRuleItem t3 = proposedRule3.searchItemWithTermNumber(term
                       .getTermNumberInExample());
               t3.addOtherConstraint(new MLWhiskOtherConstraint(tokenAnnotation, posTagAnnotation));
               proposedRule3.setNeedsCompile(true);
               if (!rulesToTest.contains(proposedRule3))
                 rulesToTest.add(proposedRule3);

               // 2. the same without the regexp thingy:
               if (proposedRule2 != null) {
                 WhiskRule proposedRule4 = proposedRule2.copy();
                 WhiskRuleItem t4 = proposedRule4.searchItemWithTermNumber(term
                         .getTermNumberInExample());
                 t4.addOtherConstraint(new MLWhiskOtherConstraint(tokenAnnotation, posTagAnnotation));
                 proposedRule4.setNeedsCompile(true);
                 if (!rulesToTest.contains(proposedRule4))
                   rulesToTest.add(proposedRule4);
               }

               // 3. last but not least: a rule with only the pos
               // tag constraint:
               WhiskRule proposedRule5 = proposedRule.copy();
               WhiskRuleItem t5 = proposedRule5.searchItemWithTermNumber(term
                       .getTermNumberInExample());
               t5.addOtherConstraint(new MLWhiskOtherConstraint(tokenAnnotation, posTagAnnotation));
               t5.setWordConstraint(null);
               proposedRule5.setNeedsCompile(true);
               if (!rulesToTest.contains(proposedRule5))
                 rulesToTest.add(proposedRule5);
             }
           }
         }
       }

     }
     if (rulesToTest.size() == 0)
       return bestRule;

     sendStatusUpdateToDelegate(
             "Round "
                     + roundNumber
                     + "."
                     + subRoundNumber
                     + " - Testing "
                     + rulesToTest.size()
                     + " rules... "
                     + " - uncovered examples: "
                     + (allExamplesCount - coveredExamples.size() + " / " + allExamplesCount
                             + " ; cs=" + cachedTestedRuleStatistics.size()),
             TextRulerLearnerState.ML_RUNNING, false);

     TextRulerToolkit.log("Testing " + rulesToTest.size() + " rules on training set...");
     for (TextRulerRule r : rulesToTest)
       TextRulerToolkit.log(r.getRuleString());
     testRulesIfNotCached(rulesToTest); // testRulesOnDocumentSet(rulesToTest,
     // exampleDocuments);
     if (shouldAbort())
       return null;
     for (TextRulerRule r : rulesToTest) {
       WhiskRule wr = (WhiskRule) r;
       if (wr.getLaplacian() < bestL) {
         bestL = wr.getLaplacian();
         bestRule = wr;
         bestRuleConstraintPoints = bestRule.totalConstraintPoints();
       } else if (wr.getLaplacian() == bestL && bestRuleConstraintPoints >= 0) {
         TextRulerToolkit.log("Same Laplacian! So prefer more general rule!");
         if (wr.totalConstraintPoints() < bestRuleConstraintPoints) {
           TextRulerToolkit.log("\tYes, prefered!");
           bestL = wr.getLaplacian();
           bestRule = wr;
           bestRuleConstraintPoints = bestRule.totalConstraintPoints();
         }
       }
     }
     return bestRule;
   }

   protected WhiskRule createNewRuleByAddingTerm(WhiskRule baseRule, WhiskRuleItem term) {
     WhiskRule newRule = baseRule.copy();
     int foundSlotNumber = -1; // debug info
     String foundSlotPattern = "";
     int termNumber = term.getTermNumberInExample();
     // determine, where this term is located relatively to the slots we
     // have...
     TextRulerRulePattern targetPattern = null;
     TextRulerRulePattern previousSlotPostFillerPattern = null;
     for (int i = 0; i < newRule.getPatterns().size(); i++) {
       TextRulerSlotPattern slotPattern = newRule.getPatterns().get(i);
       WhiskRuleItem it = (WhiskRuleItem) slotPattern.preFillerPattern.lastItem(); // look at the
       // prefiller
       // pattern
       if (it != null && termNumber <= it.getTermNumberInExample())
         targetPattern = slotPattern.preFillerPattern;
       if (targetPattern == null && slotPattern.fillerPattern.size() > 0) // now
       // look
       // at
       // the
       // filler
       // pattern
       {
         it = (WhiskRuleItem) slotPattern.fillerPattern.firstItem();
         if (termNumber < it.getTermNumberInExample()) // it's still for
           // the prefiller
           // pattern but it
           // seems to be
           // emtpy so we
           // could not find
           // that out above!
           targetPattern = slotPattern.preFillerPattern;
         else {
           it = (WhiskRuleItem) slotPattern.fillerPattern.lastItem();
           if (termNumber <= it.getTermNumberInExample()) {
             targetPattern = slotPattern.fillerPattern;
           }
         }
       }
       if (targetPattern == null && slotPattern.postFillerPattern.size() > 0) // now look at
       // the
       // postfiller
       // pattern
       {
         it = (WhiskRuleItem) slotPattern.postFillerPattern.firstItem();
         if (termNumber < it.getTermNumberInExample()) // it's still for
           // the filler
           // pattern but it
           // seems to be
           // emtpy so we
           // could not find
           // that out above!
           targetPattern = slotPattern.fillerPattern;
         else {
           it = (WhiskRuleItem) slotPattern.postFillerPattern.lastItem();
           if (termNumber <= it.getTermNumberInExample())
             targetPattern = slotPattern.postFillerPattern;
         }
       }
       if (targetPattern == null) {
         targetPattern = previousSlotPostFillerPattern;
         if (i > 0) {
           TextRulerSlotPattern prevSlotPattern = newRule.getPatterns().get(i - 1);
           foundSlotPattern = targetPattern == prevSlotPattern.preFillerPattern ? "PRE FILLER"
                   : (targetPattern == prevSlotPattern.fillerPattern ? "FILLER" : "POST FILLER");
           foundSlotNumber = i - 1;
         }
       } else {
         foundSlotPattern = targetPattern == slotPattern.preFillerPattern ? "PRE FILLER"
                 : (targetPattern == slotPattern.fillerPattern ? "FILLER" : "POST FILLER");
         foundSlotNumber = i;
       }
       previousSlotPostFillerPattern = slotPattern.postFillerPattern;
     }

     if (targetPattern == null) {
       targetPattern = previousSlotPostFillerPattern;
       foundSlotNumber = newRule.getPatterns().size() - 1;
       foundSlotPattern = "POST FILLER";
     }

     if (targetPattern == null) {
       TextRulerToolkit.log("ERROR, NO TARGET PATTERN FOR NEW RULE TERM FOUND !");
     } else {
       // TextRulerToolkit.log("Ok, found for Rule: "+newRule.getRuleString());
       // TextRulerToolkit.log("Term: "+term.getTermNumberInExample()+" ; "+term);
       // TextRulerToolkit.log("Slot "+foundSlotNumber+" - Pattern: "+foundSlotPattern);
       // now put that term into the rule:
       int indexInPattern = -1;
       if (targetPattern.size() == 0) {
         targetPattern.add(term.copy());
         indexInPattern = 0;
       } else {
         // 1. search if the term would replace a wildcard:
         WhiskRuleItem wildCard = newRule.searchItemWithTermNumber(termNumber);
         if (wildCard != null) {
           if (!wildCard.isStarWildCard()) {
             TextRulerToolkit
                     .log("ERROR, FOUND A TERM WITH THE SAME NUMBER THAT IS NOT A WILDCARD! HOW IS THAT???");
             return null;
           }
           if (!targetPattern.contains(wildCard)) {
             TextRulerToolkit.log("EVEN WORSE, THAT MUST NOT BE AT ALL!");
             return null;
           }
           indexInPattern = targetPattern.indexOf(wildCard);
           targetPattern.set(indexInPattern, term.copy());
         } else {
           // not a wildcard, so search for the insertion point:
           for (int i = 0; i < targetPattern.size(); i++) {
             WhiskRuleItem it = (WhiskRuleItem) targetPattern.get(i);
             if (termNumber < it.getTermNumberInExample()) {
               indexInPattern = i;
               break;
             }
           }
           if (indexInPattern < 0) {
             indexInPattern = targetPattern.size();
             targetPattern.add(term.copy());
           } else
             targetPattern.add(indexInPattern, term.copy());
         }
       }
       // ok, now we have replaced a wildcard with the term or added the
       // term between two other items.
       // we now have to check the neighbors of the new term: if it is a
       // direct neighbor (according to the termNumber),
       // we have nothing special to do. but if it is not a direct
       // neighbor, we have to add a wildcard between the two items (if the
       // neighbor item
       // is not a wildcard itself!
       WhiskRuleItem newTerm = (WhiskRuleItem) targetPattern.get(indexInPattern);

       // look at left neighbor:
       WhiskRuleItem left = newRule.searchNeighborOfItem(newTerm, true);
       if (left != null) {
         // TextRulerToolkit.log("LEFT NEIGHBOR FOUND!");

         // so we have a left neighbor. let's see if it also is the
         // neighbor in our seed token stream:
         if (left.getTermNumberInExample() < newTerm.getTermNumberInExample() - 1
                 && !left.isStarWildCard()) { // no direct neighbor and
           // no wildcard yet,
           // so insert a wildcard between us!
           targetPattern.add(indexInPattern,
                   WhiskRuleItem.newWildCardItem(left.getTermNumberInExample() + 1));
           indexInPattern++;
         }
       }

       // look at right neighbor:
       WhiskRuleItem right = newRule.searchNeighborOfItem(newTerm, false);
       if (right != null) {
         // TextRulerToolkit.log("RIGHT NEIGHBOR FOUND!");
         // so we have a right neighbor. let's see if it also is the
         // neighbor in our seed token stream:
         if (right.getTermNumberInExample() > newTerm.getTermNumberInExample() + 1
                 && !right.isStarWildCard()) { // no direct neighbor and
           // no wildcard yet,
           // so insert a wildcard between us!
           WhiskRuleItem wc = WhiskRuleItem.newWildCardItem(newTerm.getTermNumberInExample() + 1);
           if (indexInPattern + 1 < targetPattern.size())
             targetPattern.add(indexInPattern + 1, wc);
           else
             targetPattern.add(wc);
         }
       }

       newRule.setNeedsCompile(true);
       // TextRulerToolkit.log("BEFORE: "+baseRule.getRuleString());
       // TextRulerToolkit.log("AFTER : "+newRule.getRuleString());
       // TextRulerToolkit.log("");
     }
     if (newRule.getRuleString().equals(baseRule.getRuleString())) // this
       // must
       // not be!
       return null;
     else
       return newRule;
   }

   protected WhiskRule anchor(WhiskRule rule, TextRulerExampleDocument doc,
           TextRulerExample example, List<WhiskRuleItem> allTerms, int slotIndex) {
     TextRulerAnnotation slotAnnotation = example.getAnnotations()[slotIndex];
     List<WhiskRuleItem> inside = getTermsWithinBounds(allTerms, slotAnnotation.getBegin(),
             slotAnnotation.getEnd());

     if (rule == null || inside.isEmpty()) {
       return null;
     }
     // create base 1 and base 2:
     WhiskRule base1 = rule.copy(); // slot filler rule
     TextRulerSlotPattern slotPattern = base1.getPatterns().get(slotIndex);
     for (int i = 0; i < inside.size(); i++)
       if (i == 0 || (i == inside.size() - 1))
         slotPattern.fillerPattern.add(inside.get(i).copy());
       else if (inside.size() > 2 && i < 2)
         slotPattern.fillerPattern.add(WhiskRuleItem.newWildCardItem(inside.get(i)
                 .getTermNumberInExample()));

     WhiskRule base2 = rule.copy(); // slot context rule
     slotPattern = base2.getPatterns().get(slotIndex);

     int firstOfSlot = allTerms.indexOf(inside.get(0));
     int lastOfSlot = allTerms.indexOf(inside.get(inside.size() - 1));
     if (firstOfSlot > 0)
       slotPattern.preFillerPattern.add(allTerms.get(firstOfSlot - 1));
     slotPattern.fillerPattern.add(WhiskRuleItem.newWildCardItem(inside.get(0)
             .getTermNumberInExample()));
     if (lastOfSlot + 1 < allTerms.size())
       slotPattern.postFillerPattern.add(allTerms.get(lastOfSlot + 1));

     TextRulerToolkit.log("base1: " + base1.getRuleString());
     TextRulerToolkit.log("base2: " + base2.getRuleString());
     List<TextRulerRule> testRules = new ArrayList<TextRulerRule>();
     testRules.add(base1);
     testRules.add(base2);
     // testRulesOnDocumentSet(testRules, exampleDocuments);
     testRulesIfNotCached(testRules);
     if (shouldAbort())
       return null;
     TextRulerToolkit.log("\tbase1: " + base1.getCoveringStatistics() + " --> laplacian = "
             + base1.getLaplacian());
     TextRulerToolkit.log("\tbase2: " + base2.getCoveringStatistics() + " --> laplacian = "
             + base2.getLaplacian());
     if (base2.getCoveringStatistics().getCoveredPositivesCount() > base1.getCoveringStatistics()
             .getCoveredPositivesCount())
       return base2;
     else
       return base1;
   }

   public String getResultString() {
     if (ruleList != null)
       return getFileHeaderString(true) + ruleList.getRulesString("");
     else
       return "No results available yet!";
   }

   public void setParameters(Map<String, Object> params) {
     if (TextRulerToolkit.DEBUG)
       saveParametersToTempFolder(params);

     // TODO try catch
     if (params.containsKey(WINDOSIZE_KEY))
       windowSize = (Integer) params.get(WINDOSIZE_KEY);

     if (params.containsKey(ERROR_THRESHOLD_KEY))
       errorThreshold = (Float) params.get(ERROR_THRESHOLD_KEY);

     if (params.containsKey(POSTAG_ROOTTYPE_KEY))
       posTagRootTypeName = (String) params.get(POSTAG_ROOTTYPE_KEY);

   }

   public List<WhiskRuleItem> getAllTermsOfExample(TextRulerExample example) {
     CAS cas = example.getDocumentCAS();
     Type tokensRootType = cas.getTypeSystem().getType(TextRulerToolkit.RUTA_ANY_TYPE_NAME);
     List<AnnotationFS> all = TextRulerToolkit.getAnnotationsWithinBounds(cas, 0, cas
             .getDocumentText().length() + 1, TextRulerToolkit.getFilterSetWithSlotNames(slotNames,
             filterSet), tokensRootType);

     List<WhiskRuleItem> result = new ArrayList<WhiskRuleItem>();
     int i = 0;
     for (AnnotationFS afs : all) {
       WhiskRuleItem term = new WhiskRuleItem(new TextRulerAnnotation(afs, example.getDocument()));
       term.setTermNumberInExample(i);
       i++;
       result.add(term);
     }
     return result;
   }

   public List<WhiskRuleItem> getTermsWithinBounds(List<WhiskRuleItem> allTerms, int startPos,
           int endPos) {
     List<WhiskRuleItem> result = new ArrayList<WhiskRuleItem>();
     for (WhiskRuleItem term : allTerms) {
       TextRulerAnnotation a = term.getWordConstraint().getTokenAnnotation();
       if (a.getBegin() >= startPos && a.getEnd() <= endPos)
         result.add(term);
       if (a.getEnd() > endPos)
         break;
     }
     return result;
   }

   // TODO share this between algorithms (e.g. LP2 and RAPIER, WHISK ?) and
   // make a maximum size of the cache, etc. like CasCache?
   protected void testRulesIfNotCached(List<TextRulerRule> rules) {
     List<TextRulerRule> rulesToTest = new ArrayList<TextRulerRule>();

     for (TextRulerRule r : rules) {
       String key = r.getRuleString();
       if (cachedTestedRuleStatistics.containsKey(key)) {
         r.setCoveringStatistics(cachedTestedRuleStatistics.get(key).copy());
         TextRulerToolkit.log("CACHE HIT !");
       } else
         rulesToTest.add(r);
     }

     if (rulesToTest.size() > 0) {
       testRulesOnDocumentSet(rulesToTest, exampleDocuments);
       if (shouldAbort())
         return;
       for (TextRulerRule r : rulesToTest) {
         String key = r.getRuleString();
         cachedTestedRuleStatistics.put(key, r.getCoveringStatistics().copy());
       }
     }
   }

 }