| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| package org.apache.uima.ruta.textruler.learner.whisk.generic; |
| |
| import java.util.ArrayList; |
| import java.util.Collection; |
| import java.util.HashMap; |
| import java.util.HashSet; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.Set; |
| |
| import org.apache.commons.lang3.StringUtils; |
| import org.apache.uima.cas.CAS; |
| import org.apache.uima.cas.FSIterator; |
| import org.apache.uima.cas.FeatureStructure; |
| import org.apache.uima.cas.Type; |
| import org.apache.uima.cas.TypeSystem; |
| import org.apache.uima.cas.text.AnnotationFS; |
| import org.apache.uima.ruta.textruler.core.TextRulerAnnotation; |
| import org.apache.uima.ruta.textruler.core.TextRulerBasicLearner; |
| import org.apache.uima.ruta.textruler.core.TextRulerExample; |
| import org.apache.uima.ruta.textruler.core.TextRulerExampleDocument; |
| import org.apache.uima.ruta.textruler.core.TextRulerRule; |
| import org.apache.uima.ruta.textruler.core.TextRulerRuleItem; |
| import org.apache.uima.ruta.textruler.core.TextRulerRuleList; |
| import org.apache.uima.ruta.textruler.core.TextRulerRulePattern; |
| import org.apache.uima.ruta.textruler.core.TextRulerSlotPattern; |
| import org.apache.uima.ruta.textruler.core.TextRulerStatisticsCollector; |
| import org.apache.uima.ruta.textruler.core.TextRulerTarget; |
| import org.apache.uima.ruta.textruler.core.TextRulerToolkit; |
| import org.apache.uima.ruta.textruler.core.TextRulerWordConstraint; |
| import org.apache.uima.ruta.textruler.extension.TextRulerLearnerDelegate; |
| import org.apache.uima.ruta.textruler.learner.whisk.generic.WhiskRuleItem.MLWhiskOtherConstraint; |
| |
| public class Whisk extends TextRulerBasicLearner { |
| |
  /** Configuration key for the size of the context window around a slot filler. */
  public final static String WINDOWSIZE_KEY = "windowSize";

  /** Configuration key for the maximum acceptable laplacian error rate of a rule. */
  public final static String ERROR_THRESHOLD_KEY = "errorThreshold";

  /** Configuration key for the root annotation type of POS tags. */
  public final static String POSTAG_ROOTTYPE_KEY = "posTagRootType";

  /** Default context window size. */
  public final static int STANDARD_WINDOWSIZE = 5;

  /** Default laplacian error threshold. */
  public final static float STANDARD_ERROR_THRESHOLD = 0.1f;

  /** Default POS tag root type name. */
  public final static String STANDARD_POSTAG_ROOTTYPE = "org.apache.uima.ml.ML.postag";

  /** Configuration key for the features considered for rule conditions. */
  public static final String CONSIDERED_FEATURES = "consideredFeatures";

  /** Default (empty) considered-features setting. */
  public static final String STANDARD_CONSIDERED_FEATURES = "";

  // rules accepted so far during the current learning run
  TextRulerRuleList ruleList;

  // positive examples already covered by an accepted rule
  protected Set<TextRulerExample> coveredExamples;

  // effective parameter values, initialized to the defaults above
  protected int windowSize = STANDARD_WINDOWSIZE;

  protected double errorThreshold = STANDARD_ERROR_THRESHOLD;

  protected String posTagRootTypeName = STANDARD_POSTAG_ROOTTYPE;

  // counters used for status reporting during learning
  int roundNumber = 0;

  int allExamplesCount = 0;

  // names of the annotation features considered when proposing rule conditions
  private List<String> consideredFeatures = new ArrayList<String>();

  // cache of covering statistics for already tested rules
  // (presumably keyed by the rule string — confirm in testRulesIfNotCached)
  private Map<String, TextRulerStatisticsCollector> cachedTestedRuleStatistics = new HashMap<String, TextRulerStatisticsCollector>();
| |
  /**
   * Creates a new WHISK learner.
   *
   * @param inputDir directory containing the training documents
   * @param prePropTmFile preprocessing script file (passed through to the base learner)
   * @param tmpDir directory for temporary files
   * @param slotNames names of the slot types to learn rules for
   * @param filterSet annotation type names to be filtered
   * @param skip presumably whether preprocessing is skipped — confirm against base class
   * @param delegate callback receiving status updates during learning
   */
  public Whisk(String inputDir, String prePropTmFile, String tmpDir, String[] slotNames,
          Set<String> filterSet, boolean skip, TextRulerLearnerDelegate delegate) {
    super(inputDir, prePropTmFile, tmpDir, slotNames, filterSet, skip, delegate);
    // useDynamicAnchoring = true;
  }
| |
  /**
   * This learner opts out of collecting the covered negative instances themselves while
   * testing rules; only the counts in the covering statistics are used.
   */
  @Override
  public boolean collectNegativeCoveredInstancesWhenTesting() {
    return false;
  }
| |
| @Override |
| protected void doRun() { |
| |
| // we don't use the same overall structure like the original WHISK since |
| // we do not |
| // repeat the whole process for some new training documents at the |
| // user's request, we |
| // learn like the other algorithms from the whole training set, so we |
| // for example do not |
| // need to test the intermediate rule base on a newly "incoming" |
| // training document since we |
| // tested all rules already on all training documents ! |
| |
| // this version of whisk is not tested for mutli slot learning since the |
| // seminar announcements |
| // are not quite suitable for this task: they do not all contain all 4 |
| // slots and some of them |
| // occur more than once in one document ! And the order of them is not |
| // always the same as well! |
| // so this is now made only tested for the single slot case even if it |
| // is built capable of multislot |
| // examples! |
| |
| // this is the inner loop of the WHISK pseudo-code: |
| // For each inst in Training |
| // for each tag |
| |
| cachedTestedRuleStatistics.clear(); |
| ruleList = new TextRulerRuleList(); |
| coveredExamples = new HashSet<TextRulerExample>(); |
| |
| sendStatusUpdateToDelegate("Creating examples...", TextRulerLearnerState.ML_RUNNING, false); |
| for (int i = 0; i < slotNames.length; i++) { |
| TextRulerTarget target = new TextRulerTarget(slotNames[i], this); |
| exampleDocuments.createExamplesForTarget(target); |
| |
| TextRulerExampleDocument[] docs = exampleDocuments.getSortedDocumentsInCacheOptimizedOrder(); |
| |
| allExamplesCount = exampleDocuments.getAllPositiveExamples().size(); |
| |
| for (TextRulerExampleDocument inst : docs) { |
| List<TextRulerExample> tags = inst.getPositiveExamples(); |
| |
| // for each uncovered example -> induce a new rule: |
| for (TextRulerExample tag : tags) { |
| if (!coveredExamples.contains(tag)) { |
| roundNumber++; |
| WhiskRule newRule = growRule(inst, tag); |
| if (shouldAbort()) |
| break; |
| // if (newRule == null) |
| // break; |
| // else |
| if (newRule != null |
| && (newRule.getCoveringStatistics().getCoveredNegativesCount() == 00 || newRule |
| .getLaplacian() <= errorThreshold)) { |
| ruleList.addRule(newRule); |
| coveredExamples.addAll(newRule.getCoveringStatistics().getCoveredPositiveExamples()); |
| sendStatusUpdateToDelegate("New Rule added...", TextRulerLearnerState.ML_RUNNING, |
| true); |
| } |
| } |
| } |
| if (shouldAbort()) |
| return; |
| } |
| } |
| sendStatusUpdateToDelegate("Done", TextRulerLearnerState.ML_DONE, true); |
| cachedTestedRuleStatistics.clear(); |
| } |
| |
| protected WhiskRule growRule(TextRulerExampleDocument doc, TextRulerExample example) { |
| sendStatusUpdateToDelegate("Creating new rule from seed...", TextRulerLearnerState.ML_RUNNING, |
| false); |
| WhiskRule theRule = new WhiskRule(this, example.getTarget(), example); |
| int numberOfSlotsInTag = example.getAnnotations().length; |
| for (int i = 0; i < numberOfSlotsInTag; i++) |
| theRule.getPatterns().add(new TextRulerSlotPattern()); |
| |
| sendStatusUpdateToDelegate("Creating new rule: anchoring...", TextRulerLearnerState.ML_RUNNING, |
| false); |
| for (int i = 0; i < numberOfSlotsInTag; i++) { |
| theRule = anchor(theRule, doc, example, i); |
| if (shouldAbort()) |
| return null; |
| } |
| |
| sendStatusUpdateToDelegate("Creating new rule: extending...", TextRulerLearnerState.ML_RUNNING, |
| false); |
| if (theRule != null) { |
| double oldLaplacian = theRule.getLaplacian(); |
| int subRoundNumber = 0; |
| |
| // repeat while we still make errors... |
| while (theRule.getCoveringStatistics().getCoveredNegativesCount() > 0) { |
| WhiskRule extendedRule = extendRule(theRule, doc, example, subRoundNumber); |
| if (extendedRule == null) { |
| // this way we get the previous rule |
| // as the best rule... |
| break; |
| } |
| theRule = extendedRule; |
| TextRulerToolkit.log("----------------------------"); |
| TextRulerToolkit.log("BEST EXTENSION IS: " + theRule.getRuleString()); |
| TextRulerToolkit.log("Laplacian: " + theRule.getLaplacian() + " ; " |
| + theRule.getCoveringStatistics()); |
| subRoundNumber++; |
| |
| double newLaplacian = theRule.getLaplacian(); |
| if (newLaplacian >= oldLaplacian) { |
| break; |
| } |
| oldLaplacian = newLaplacian; |
| } |
| TextRulerToolkit.log("----------------------------"); |
| TextRulerToolkit.log("FINAL RULE IS : " + theRule.getRuleString()); |
| } |
| return theRule; |
| } |
| |
| protected WhiskRule extendRule(WhiskRule rule, TextRulerExampleDocument doc, |
| TextRulerExample example, int subRoundNumber) { |
| WhiskRule bestRule = null; |
| double bestL = 1.0; |
| int bestRuleConstraintPoints = -1; |
| if (rule.getLaplacian() <= errorThreshold) { |
| bestRule = rule; |
| bestL = rule.getLaplacian(); |
| } |
| List<TextRulerRule> rulesToTest = new ArrayList<TextRulerRule>(); |
| |
| // first only add conditions, e.g., for features |
| |
| List<TextRulerSlotPattern> patterns = rule.getPatterns(); |
| for (TextRulerSlotPattern eachPattern : patterns) { |
| for (TextRulerRuleItem item : eachPattern.fillerPattern) { |
| if (item instanceof WhiskRuleItem) { |
| WhiskRuleItem wri = (WhiskRuleItem) item; |
| WhiskRule proposedRule = rule; |
| TextRulerWordConstraint wordConstraint = wri.getWordConstraint(); |
| for (String eachFeature : consideredFeatures) { |
| if (wordConstraint != null) { |
| Map<String, String> featureMap = wordConstraint.getTokenAnnotation().getFeatureMap(); |
| String stringValue = featureMap.get(eachFeature); |
| if (stringValue != null && !wri.getActivatedFeatures().contains(eachFeature)) { |
| wri.activateFeature(eachFeature); |
| WhiskRule proposedRuleF = proposedRule.copy(); |
| wri.deactivateFeature(eachFeature); |
| proposedRuleF.setNeedsCompile(true); |
| if (!rulesToTest.contains(proposedRuleF)) { |
| rulesToTest.add(proposedRuleF); |
| } |
| } |
| } |
| } |
| if (wordConstraint != null && wordConstraint.isRegExpConstraint() && wri.isHideRegExp()) { |
| wri.setHideRegExp(false); |
| WhiskRule proposedRuleF = proposedRule.copy(); |
| wri.setHideRegExp(true); |
| proposedRuleF.setNeedsCompile(true); |
| if (!rulesToTest.contains(proposedRuleF)) { |
| rulesToTest.add(proposedRuleF); |
| } |
| } |
| } |
| } |
| } |
| |
| List<List<WhiskRuleItem>> slotTerms = getTermsWithinBounds( |
| example.getAnnotations()[0].getBegin(), example.getAnnotations()[0].getEnd(), example); |
| List<List<WhiskRuleItem>> windowTerms = getTermsWithinWindow(slotTerms, example, 0); |
| |
| for (List<WhiskRuleItem> eachList : windowTerms) { |
| for (WhiskRuleItem term : eachList) { |
| |
| if (rule.containsTerm(term)) { |
| continue; |
| } |
| |
| WhiskRule proposedRule = createNewRuleByAddingTerm(rule, term); |
| if (proposedRule == null) |
| continue; |
| WhiskRuleItem t = term; |
| |
| if (!rulesToTest.contains(proposedRule)) |
| rulesToTest.add(proposedRule); |
| |
| // add a second version where we add the exact token content if |
| // it is a regexp item: |
| WhiskRule proposedRule2 = proposedRule; |
| if (t.getWordConstraint().isRegExpConstraint()) { |
| t.setHideRegExp(false); |
| WhiskRule proposedRuleF = proposedRule.copy(); |
| t.setHideRegExp(true); |
| proposedRuleF.setNeedsCompile(true); |
| if (!rulesToTest.contains(proposedRuleF)) { |
| rulesToTest.add(proposedRuleF); |
| } |
| } |
| |
| // extend with feature conditions |
| WhiskRule proposedRuleF = null; |
| for (String eachFeature : consideredFeatures) { |
| Map<String, String> featureMap = t.getWordConstraint().getTokenAnnotation() |
| .getFeatureMap(); |
| String stringValue = featureMap.get(eachFeature); |
| if (stringValue != null) { |
| t.activateFeature(eachFeature); |
| proposedRuleF = proposedRule.copy(); |
| t.deactivateFeature(eachFeature); |
| proposedRuleF.setNeedsCompile(true); |
| if (!rulesToTest.contains(proposedRuleF)) { |
| rulesToTest.add(proposedRuleF); |
| } |
| } |
| } |
| |
| // and now, for WHISK performance testing purposes, we also add POS |
| // tags: |
| // this is not very nice code and not dynamic feature capable, but |
| // for testpurposes |
| // in order to test WHISK with PosTag Terms... |
| if (posTagRootTypeName != null && posTagRootTypeName.length() > 0) { |
| TextRulerAnnotation tokenAnnotation = term.getWordConstraint().getTokenAnnotation(); |
| CAS cas = example.getDocumentCAS(); |
| TypeSystem ts = cas.getTypeSystem(); |
| Type posTagsRootType = ts.getType(posTagRootTypeName); |
| if (ts != null) { |
| // POS-Tags created by our test hmm tagger. |
| List<AnnotationFS> posTagAnnotations = TextRulerToolkit.getAnnotationsWithinBounds(cas, |
| tokenAnnotation.getBegin(), tokenAnnotation.getEnd(), null, posTagsRootType); |
| if (posTagAnnotations.size() > 0) { |
| AnnotationFS posTag = posTagAnnotations.get(0); |
| if (posTag.getBegin() == tokenAnnotation.getBegin() |
| && posTag.getEnd() == tokenAnnotation.getEnd()) { |
| TextRulerAnnotation posTagAnnotation = new TextRulerAnnotation(posTag, doc, |
| consideredFeatures); |
| |
| // 1. most specific term with all constraints we |
| // have: |
| WhiskRule proposedRule3 = proposedRule.copy(); |
| WhiskRuleItem t3 = term; |
| t3.addOtherConstraint(new MLWhiskOtherConstraint(tokenAnnotation, posTagAnnotation)); |
| proposedRule3.setNeedsCompile(true); |
| if (!rulesToTest.contains(proposedRule3)) |
| rulesToTest.add(proposedRule3); |
| |
| // 2. the same without the regexp thingy: |
| if (proposedRule2 != null) { |
| WhiskRule proposedRule4 = proposedRule2.copy(); |
| WhiskRuleItem t4 = term; |
| t4.addOtherConstraint(new MLWhiskOtherConstraint(tokenAnnotation, |
| posTagAnnotation)); |
| proposedRule4.setNeedsCompile(true); |
| if (!rulesToTest.contains(proposedRule4)) |
| rulesToTest.add(proposedRule4); |
| } |
| |
| // 3. last but not least: a rule with only the pos |
| // tag constraint: |
| WhiskRule proposedRule5 = proposedRule.copy(); |
| WhiskRuleItem t5 = term; |
| t5.addOtherConstraint(new MLWhiskOtherConstraint(tokenAnnotation, posTagAnnotation)); |
| t5.setWordConstraint(null); |
| proposedRule5.setNeedsCompile(true); |
| if (!rulesToTest.contains(proposedRule5)) { |
| rulesToTest.add(proposedRule5); |
| } |
| |
| } |
| } |
| } |
| } |
| } |
| } |
| if (rulesToTest.size() == 0) |
| return bestRule; |
| |
| sendStatusUpdateToDelegate( |
| "Round " |
| + roundNumber |
| + "." |
| + subRoundNumber |
| + " - Testing " |
| + rulesToTest.size() |
| + " rules... " |
| + " - uncovered examples: " |
| + (allExamplesCount - coveredExamples.size() + " / " + allExamplesCount |
| + " ; cs=" + cachedTestedRuleStatistics.size()), |
| TextRulerLearnerState.ML_RUNNING, false); |
| |
| TextRulerToolkit.log("Testing " + rulesToTest.size() + " rules on training set..."); |
| for (TextRulerRule r : rulesToTest) |
| TextRulerToolkit.log(r.getRuleString()); |
| testRulesIfNotCached(rulesToTest); |
| |
| if (shouldAbort()) |
| return null; |
| for (TextRulerRule r : rulesToTest) { |
| WhiskRule wr = (WhiskRule) r; |
| if (wr.getLaplacian() < bestL) { |
| bestL = wr.getLaplacian(); |
| bestRule = wr; |
| bestRuleConstraintPoints = bestRule.totalConstraintPoints(); |
| } else if (wr.getLaplacian() == bestL && bestRuleConstraintPoints >= 0) { |
| TextRulerToolkit.log("Same Laplacian! So prefer more general rule!"); |
| if (wr.totalConstraintPoints() < bestRuleConstraintPoints) { |
| TextRulerToolkit.log("\tYes, prefered!"); |
| bestL = wr.getLaplacian(); |
| bestRule = wr; |
| bestRuleConstraintPoints = bestRule.totalConstraintPoints(); |
| } |
| } |
| } |
| return bestRule; |
| } |
| |
| private List<List<WhiskRuleItem>> getTermsWithinWindow(List<List<WhiskRuleItem>> slotTerms, |
| TextRulerExample example, int steps) { |
| if (steps == windowSize) |
| return slotTerms; |
| List<List<WhiskRuleItem>> result = new ArrayList<List<WhiskRuleItem>>(); |
| |
| for (List<WhiskRuleItem> list : slotTerms) { |
| List<WhiskRuleItem> termsBefore = getTermsBefore(list.get(0), example); |
| List<WhiskRuleItem> termsAfter = getTermsAfter(list.get(list.size() - 1), example); |
| if (!termsBefore.isEmpty()) { |
| for (WhiskRuleItem before : termsBefore) { |
| for (WhiskRuleItem after : termsAfter) { |
| List<WhiskRuleItem> newList = new ArrayList<WhiskRuleItem>(); |
| newList.add(before); |
| newList.addAll(list); |
| newList.add(after); |
| result.add(newList); |
| } |
| } |
| } else { |
| for (WhiskRuleItem after : termsAfter) { |
| List<WhiskRuleItem> newList = new ArrayList<WhiskRuleItem>(); |
| newList.addAll(list); |
| newList.add(after); |
| result.add(newList); |
| } |
| } |
| } |
| result = getTermsWithinWindow(result, example, ++steps); |
| return result; |
| } |
| |
| protected WhiskRule createNewRuleByAddingTerm(WhiskRule baseRule, WhiskRuleItem term) { |
| if (term == null) |
| return null; |
| if (term.isStarWildCard() || term.getWordConstraint() == null) |
| return null; |
| WhiskRule newRule = baseRule.copy(); |
| // int foundSlotNumber = -1; // debug info |
| // String foundSlotPattern = ""; |
| int termBeginNumber = term.getWordConstraint().getTokenAnnotation().getBegin(); |
| int termEndNumber = term.getWordConstraint().getTokenAnnotation().getEnd(); |
| TextRulerRulePattern targetPattern = null; |
| TextRulerRulePattern previousSlotPostFillerPattern = null; |
| for (int i = 0; i < newRule.getPatterns().size(); i++) { |
| TextRulerSlotPattern slotPattern = newRule.getPatterns().get(i); |
| WhiskRuleItem it = (WhiskRuleItem) slotPattern.preFillerPattern.lastItem(); // look at the |
| // prefiller |
| // pattern |
| if (it != null && it.getWordConstraint() != null |
| && termEndNumber <= it.getWordConstraint().getTokenAnnotation().getBegin()) |
| targetPattern = slotPattern.preFillerPattern; |
| if (targetPattern == null && slotPattern.fillerPattern.size() > 0) // now |
| // look |
| // at |
| // the |
| // filler |
| // pattern |
| { |
| it = (WhiskRuleItem) slotPattern.fillerPattern.firstItem(); |
| if (it.getWordConstraint() != null |
| && termEndNumber <= it.getWordConstraint().getTokenAnnotation().getBegin()) // it's |
| // still |
| // for |
| // the prefiller |
| // pattern but it |
| // seems to be |
| // emtpy so we |
| // could not find |
| // that out above! |
| targetPattern = slotPattern.preFillerPattern; |
| else { |
| it = (WhiskRuleItem) slotPattern.fillerPattern.lastItem(); |
| if (it.getWordConstraint() != null |
| && termEndNumber <= it.getWordConstraint().getTokenAnnotation().getBegin()) { |
| targetPattern = slotPattern.fillerPattern; |
| } |
| } |
| } |
| if (targetPattern == null && slotPattern.postFillerPattern.size() > 0) // now |
| // look |
| // at |
| // the |
| // postfiller |
| // pattern |
| { |
| it = (WhiskRuleItem) slotPattern.postFillerPattern.firstItem(); |
| if (it.getWordConstraint() != null |
| && termEndNumber <= it.getWordConstraint().getTokenAnnotation().getBegin()) // it's |
| // still |
| // for |
| // the filler |
| // pattern but it |
| // seems to be |
| // emtpy so we |
| // could not find |
| // that out above! |
| targetPattern = slotPattern.fillerPattern; |
| else { |
| it = (WhiskRuleItem) slotPattern.postFillerPattern.lastItem(); |
| if (it.getWordConstraint() != null |
| && termEndNumber <= it.getWordConstraint().getTokenAnnotation().getBegin()) |
| targetPattern = slotPattern.postFillerPattern; |
| } |
| } |
| if (targetPattern == null) { |
| targetPattern = previousSlotPostFillerPattern; |
| // debug info |
| // if (i > 0) { |
| // TextRulerSlotPattern prevSlotPattern = newRule.getPatterns().get(i - |
| // 1); |
| // foundSlotPattern = targetPattern == prevSlotPattern.preFillerPattern |
| // ? "PRE FILLER" |
| // : (targetPattern == prevSlotPattern.fillerPattern ? "FILLER" : |
| // "POST FILLER"); |
| // foundSlotNumber = i - 1; |
| // } |
| // } else { |
| // foundSlotPattern = targetPattern == slotPattern.preFillerPattern ? |
| // "PRE FILLER" |
| // : (targetPattern == slotPattern.fillerPattern ? "FILLER" : |
| // "POST FILLER"); |
| // foundSlotNumber = i; |
| } |
| previousSlotPostFillerPattern = slotPattern.postFillerPattern; |
| } |
| |
| if (targetPattern == null) { |
| targetPattern = previousSlotPostFillerPattern; |
| // debug info |
| // foundSlotNumber = newRule.getPatterns().size() - 1; |
| // foundSlotPattern = "POST FILLER"; |
| } |
| |
| if (targetPattern == null) { |
| TextRulerToolkit.log("ERROR, NO TARGET PATTERN FOR NEW RULE TERM FOUND !"); |
| } else { |
| // TextRulerToolkit.log("Ok, found for Rule: "+newRule.getRuleString()); |
| // TextRulerToolkit.log("Term: "+term.getTermNumberInExample()+" ; "+term); |
| // TextRulerToolkit.log("Slot "+foundSlotNumber+" - Pattern: "+foundSlotPattern); |
| // now put that term into the rule: |
| int indexInPattern = -1; |
| if (targetPattern.size() == 0) { |
| targetPattern.add(term.copy()); |
| indexInPattern = 0; |
| } else { |
| // 1. search if the term would replace a wildcard: |
| WhiskRuleItem wildCard = null; |
| for (TextRulerRuleItem i : newRule.getPatterns().get(0).preFillerPattern) { |
| if (((WhiskRuleItem) i).isStarWildCard()) { |
| WhiskRuleItem left = newRule.searchNeighborOfItem(((WhiskRuleItem) i), true); |
| WhiskRuleItem right = newRule.searchNeighborOfItem(((WhiskRuleItem) i), false); |
| if (left.getWordConstraint().getTokenAnnotation().getEnd() <= termBeginNumber |
| && right.getWordConstraint().getTokenAnnotation().getBegin() >= termEndNumber) |
| wildCard = (WhiskRuleItem) i; |
| } |
| } |
| if (wildCard == null) { |
| for (TextRulerRuleItem i : newRule.getPatterns().get(0).fillerPattern) { |
| if (((WhiskRuleItem) i).isStarWildCard()) { |
| WhiskRuleItem left = newRule.searchNeighborOfItem(((WhiskRuleItem) i), true); |
| WhiskRuleItem right = newRule.searchNeighborOfItem(((WhiskRuleItem) i), false); |
| if (left != null |
| && left.getWordConstraint().getTokenAnnotation().getEnd() <= termBeginNumber |
| && right.getWordConstraint().getTokenAnnotation().getBegin() >= termEndNumber) |
| wildCard = (WhiskRuleItem) i; |
| } |
| } |
| } |
| if (wildCard == null) { |
| for (TextRulerRuleItem i : newRule.getPatterns().get(0).postFillerPattern) { |
| if (((WhiskRuleItem) i).isStarWildCard()) { |
| WhiskRuleItem left = newRule.searchNeighborOfItem(((WhiskRuleItem) i), true); |
| WhiskRuleItem right = newRule.searchNeighborOfItem(((WhiskRuleItem) i), false); |
| if (left.getWordConstraint().getTokenAnnotation().getEnd() <= termBeginNumber |
| && right.getWordConstraint().getTokenAnnotation().getBegin() >= termEndNumber) |
| wildCard = (WhiskRuleItem) i; |
| } |
| } |
| } |
| if (wildCard != null) { |
| if (!wildCard.isStarWildCard()) { |
| TextRulerToolkit |
| .log("ERROR, FOUND A TERM WITH THE SAME NUMBER THAT IS NOT A WILDCARD! HOW IS THAT???"); |
| return null; |
| } |
| if (!targetPattern.contains(wildCard)) { |
| TextRulerToolkit.log("EVEN WORSE, THAT MUST NOT BE AT ALL!"); |
| return null; |
| } |
| indexInPattern = targetPattern.indexOf(wildCard); |
| targetPattern.set(indexInPattern, term.copy()); |
| } else { |
| // not a wildcard, so search for the insertion point: |
| for (int i = 0; i < targetPattern.size(); i++) { |
| WhiskRuleItem it = (WhiskRuleItem) targetPattern.get(i); |
| if (it.getWordConstraint() != null |
| && termEndNumber <= it.getWordConstraint().getTokenAnnotation().getBegin()) { |
| indexInPattern = i; |
| break; |
| } |
| } |
| if (indexInPattern < 0) { |
| indexInPattern = targetPattern.size(); |
| targetPattern.add(term.copy()); |
| } else |
| targetPattern.add(indexInPattern, term.copy()); |
| } |
| } |
| // ok, now we have replaced a wildcard with the term or added the |
| // term between two other items. |
| // we now have to check the neighbors of the new term: if it is a |
| // direct neighbor (according to the termNumber), |
| // we have nothing special to do. but if it is not a direct |
| // neighbor, we have to add a wildcard between the two items (if the |
| // neighbor item |
| // is not a wildcard itself! |
| WhiskRuleItem newTerm = (WhiskRuleItem) targetPattern.get(indexInPattern); |
| |
| // look at left neighbor: |
| WhiskRuleItem left = newRule.searchNeighborOfItem(newTerm, true); |
| if (left != null && left.getWordConstraint() != null) { |
| // TextRulerToolkit.log("LEFT NEIGHBOR FOUND!"); |
| |
| // so we have a left neighbor. let's see if it also is the |
| // neighbor in our seed token stream: |
| if (!left.isStarWildCard()) { // no direct neighbor and |
| // no wildcard yet, |
| // so insert a wildcard between us! |
| boolean isValid = isNextValidNeighbor(left, newTerm, newRule.getSeedExample()); |
| if (!isValid) { |
| targetPattern.add(indexInPattern, WhiskRuleItem.newWildCardItem()); |
| indexInPattern++; |
| } |
| } |
| } |
| |
| // look at right neighbor: |
| WhiskRuleItem right = newRule.searchNeighborOfItem(newTerm, false); |
| if (right != null && right.getWordConstraint() != null) { |
| // TextRulerToolkit.log("RIGHT NEIGHBOR FOUND!"); |
| // so we have a right neighbor. let's see if it also is the |
| // neighbor in our seed token stream: |
| if (!right.isStarWildCard()) { |
| // no direct neighbor and |
| // no wildcard yet, |
| // so insert a wildcard between us! |
| boolean isValid = isNextValidNeighbor(newTerm, right, newRule.getSeedExample()); |
| if (!isValid) { |
| WhiskRuleItem wc = WhiskRuleItem.newWildCardItem(); |
| if (indexInPattern + 1 < targetPattern.size()) |
| targetPattern.add(indexInPattern + 1, wc); |
| else |
| targetPattern.add(wc); |
| } |
| } |
| } |
| |
| newRule.setNeedsCompile(true); |
| // TextRulerToolkit.log("BEFORE: "+baseRule.getRuleString()); |
| // TextRulerToolkit.log("AFTER : "+newRule.getRuleString()); |
| // TextRulerToolkit.log(""); |
| } |
| if (newRule.getRuleString().equals(baseRule.getRuleString())) // this |
| // must |
| // not be! |
| return null; |
| else |
| return newRule; |
| } |
| |
  /**
   * Anchoring step of WHISK: creates the initial rule for one slot of the seed example. For
   * every candidate term list covering the slot filler, two base rules are built - base1
   * matches the slot filler terms themselves, base2 is the best of several wildcard-based
   * context variants - and the one covering more positive examples is kept. The overall best
   * candidate (by covered positives) is returned.
   *
   * @param rule the rule grown so far (one empty slot pattern per slot); may be null
   * @param doc the document the seed example stems from (currently unused here)
   * @param example the seed example
   * @param slotIndex index of the slot to anchor
   * @return the best anchored rule, or null if the rule/terms are missing or the run was
   *         aborted
   */
  protected WhiskRule anchor(WhiskRule rule, TextRulerExampleDocument doc,
          TextRulerExample example, int slotIndex) {
    List<WhiskRule> result = new ArrayList<WhiskRule>();
    TextRulerAnnotation slotAnnotation = example.getAnnotations()[slotIndex];
    List<List<WhiskRuleItem>> window = getTermsWithinBounds(slotAnnotation.getBegin(),
            slotAnnotation.getEnd(), example);

    for (List<WhiskRuleItem> inside : window) {

      if (rule == null || inside.isEmpty()) {
        return null;
      }
      // create base 1 and base 2:
      WhiskRule base1 = rule.copy(); // slot filler rule
      TextRulerSlotPattern slotPattern = base1.getPatterns().get(slotIndex);
      // base1: use the filler terms directly if they fit into the window, otherwise keep
      // only the first and last term with wildcards in between.
      // questionable restriction:
      if (inside.size() <= windowSize) { // TODO add parameter for this!
        slotPattern.fillerPattern.addAll(inside);
      } else {
        for (int i = 0; i < inside.size(); i++)
          if (i == 0 || (i == inside.size() - 1))
            slotPattern.fillerPattern.add(inside.get(i).copy());
          else if (inside.size() > 2 && i < 2)
            slotPattern.fillerPattern.add(WhiskRuleItem.newWildCardItem());
      }
      // candidate context terms; a null entry means "no term on this side":
      List<WhiskRuleItem> beforeList = getTermsBefore(inside.get(0), example);
      List<WhiskRuleItem> afterList = getTermsAfter(inside.get(inside.size() - 1), example);
      beforeList.add(null);
      afterList.add(null);
      Collection<WhiskRule> tempRules = new HashSet<WhiskRule>();

      // workaround for better rules:
      // variant A: only the first inner term plus a trailing wildcard
      // only inner begin
      for (WhiskRuleItem eachBefore : beforeList) {
        for (WhiskRuleItem eachAfter : afterList) {
          WhiskRule copy = rule.copy();
          TextRulerSlotPattern textRulerSlotPattern = copy.getPatterns().get(slotIndex);
          if (eachBefore != null) {
            textRulerSlotPattern.preFillerPattern.add(eachBefore);
          }
          textRulerSlotPattern.fillerPattern.add(inside.get(0).copy());
          textRulerSlotPattern.fillerPattern.add(WhiskRuleItem.newWildCardItem());
          if (eachAfter != null) {
            textRulerSlotPattern.postFillerPattern.add(eachAfter);
          }
          tempRules.add(copy);
        }
      }
      // variant B: a leading wildcard plus only the last inner term
      // only inner end
      for (WhiskRuleItem eachBefore : beforeList) {
        for (WhiskRuleItem eachAfter : afterList) {
          WhiskRule copy = rule.copy();
          TextRulerSlotPattern textRulerSlotPattern = copy.getPatterns().get(slotIndex);
          if (eachBefore != null) {
            textRulerSlotPattern.preFillerPattern.add(eachBefore);
          }
          textRulerSlotPattern.fillerPattern.add(WhiskRuleItem.newWildCardItem());
          textRulerSlotPattern.fillerPattern.add(inside.get(inside.size() - 1).copy());
          if (eachAfter != null) {
            textRulerSlotPattern.postFillerPattern.add(eachAfter);
          }
          tempRules.add(copy);
        }
      }

      // variant C: a pure wildcard filler with optional context terms
      // NOTE(review): beforeList/afterList are never empty here since null was appended
      // above, so only the first branch is ever taken - confirm before simplifying.
      if (!beforeList.isEmpty()) {
        if (!afterList.isEmpty()) {
          for (WhiskRuleItem eachBefore : beforeList) {
            for (WhiskRuleItem eachAfter : afterList) {
              WhiskRule copy = rule.copy();
              TextRulerSlotPattern textRulerSlotPattern = copy.getPatterns().get(slotIndex);
              if (eachBefore != null) {
                textRulerSlotPattern.preFillerPattern.add(eachBefore);
              }
              textRulerSlotPattern.fillerPattern.add(WhiskRuleItem.newWildCardItem());
              if (eachAfter != null) {
                textRulerSlotPattern.postFillerPattern.add(eachAfter);
              }
              tempRules.add(copy);
            }
          }
        } else {
          for (WhiskRuleItem eachBefore : beforeList) {
            WhiskRule copy = rule.copy();
            TextRulerSlotPattern textRulerSlotPattern = copy.getPatterns().get(slotIndex);
            textRulerSlotPattern.fillerPattern.add(WhiskRuleItem.newWildCardItem());
            if (eachBefore != null) {
              textRulerSlotPattern.preFillerPattern.add(eachBefore);
            }
            tempRules.add(copy);
          }
        }
      } else {
        for (WhiskRuleItem eachAfter : afterList) {
          WhiskRule copy = rule.copy();
          TextRulerSlotPattern textRulerSlotPattern = copy.getPatterns().get(slotIndex);
          textRulerSlotPattern.fillerPattern.add(WhiskRuleItem.newWildCardItem());
          if (eachAfter != null) {
            textRulerSlotPattern.postFillerPattern.add(eachAfter);
          }
          tempRules.add(copy);
        }
      }
      // base2 is the candidate covering the most positives among the context variants:
      ArrayList<TextRulerRule> rules = new ArrayList<TextRulerRule>(tempRules);
      testRulesIfNotCached(rules);
      TextRulerRule best = null;
      for (TextRulerRule each : rules) {
        if (best == null) {
          best = each;
        } else {
          if (each.getCoveringStatistics().getCoveredPositivesCount() > best
                  .getCoveringStatistics().getCoveredPositivesCount()) {
            best = each;
          }
        }
      }
      WhiskRule base2 = (WhiskRule) best;
      List<TextRulerRule> testRules = new ArrayList<TextRulerRule>();
      if (base1 != null) {
        TextRulerToolkit.log("base1: " + base1.getRuleString());
        testRules.add(base1);
      }
      if (base2 != null) {
        TextRulerToolkit.log("base2: " + base2.getRuleString());
        testRules.add(base2);
      }
      testRulesIfNotCached(testRules);
      if (shouldAbort()) {
        return null;
      }
      // keep whichever of base1/base2 covers more positives.
      // NOTE(review): the else branch dereferences base1 unconditionally; this is only safe
      // because base1 (a copy of the null-checked rule) is never null here.
      if (base1 != null && base2 == null) {
        TextRulerToolkit.log("\tbase1: " + base1.getCoveringStatistics() + " --> laplacian = "
                + base1.getLaplacian());
        result.add(base1);
      } else {
        TextRulerToolkit.log("\tbase1: " + base1.getCoveringStatistics() + " --> laplacian = "
                + base1.getLaplacian());
        TextRulerToolkit.log("\tbase2: " + base2.getCoveringStatistics() + " --> laplacian = "
                + base2.getLaplacian());
        if (base2.getCoveringStatistics().getCoveredPositivesCount() > base1
                .getCoveringStatistics().getCoveredPositivesCount()) {
          result.add(base2);
        } else {
          result.add(base1);
        }
      }
    }
    // overall winner across all term lists, again by covered positives:
    TextRulerRule best = null;
    for (TextRulerRule each : result) {
      if (best == null) {
        best = each;
      } else {
        if (each.getCoveringStatistics().getCoveredPositivesCount() > best.getCoveringStatistics()
                .getCoveredPositivesCount()) {
          best = each;
        }
      }
    }

    return (WhiskRule) best;
  }
| |
  /**
   * Collects the terms (annotations not suppressed by the filter set) that start directly after
   * the given rule item's token.
   * <p>
   * The annotation index is positioned at the end offset of the item's token annotation; the first
   * acceptable annotation encountered fixes the begin offset of the "next" position, and all
   * annotations sharing that begin offset are wrapped into {@link WhiskRuleItem}s. Annotation
   * types contained in {@code filterSetWithSlotNames} are skipped.
   *
   * @param whiskRuleItem the rule item whose direct successors are requested; must carry a word
   *          constraint with a token annotation
   * @param example the training example providing the CAS to search in
   * @return all alternative terms that could follow the given item (possibly empty)
   */
  private List<WhiskRuleItem> getTermsAfter(WhiskRuleItem whiskRuleItem, TextRulerExample example) {
    List<WhiskRuleItem> result = new ArrayList<WhiskRuleItem>();
    int end = whiskRuleItem.getWordConstraint().getTokenAnnotation().getEnd();
    CAS cas = example.getDocumentCAS();
    // TODO: access type with string constant
    Type frameType = cas.getTypeSystem().getType("org.apache.uima.ruta.type.RutaFrame");
    // Dummy annotation used only to position the index iterator at offset 'end'.
    AnnotationFS pointer = cas.createAnnotation(frameType, end, Integer.MAX_VALUE);
    FSIterator iterator = cas.getAnnotationIndex().iterator(pointer);
    int nextBegin = -1;
    while (iterator.isValid()) {
      FeatureStructure fs = iterator.get();
      if (fs instanceof AnnotationFS) {
        AnnotationFS a = (AnnotationFS) fs;
        if (!filterSetWithSlotNames.contains(a.getType().getName())) {
          if (nextBegin == -1) {
            // First acceptable annotation fixes the begin offset of the "next" position.
            nextBegin = a.getBegin();
          } else if (nextBegin != a.getBegin()) {
            // Different begin offset: we moved past the next position, stop scanning.
            break;
          }
          if (a.getBegin() <= nextBegin && a.getBegin() >= end) {
            WhiskRuleItem term = new WhiskRuleItem(new TextRulerAnnotation(a,
                    example.getDocument(), consideredFeatures));
            result.add(term);
          }
        }
      }
      iterator.moveToNext();
    }
    return result;
  }
| |
  /**
   * Collects the terms (annotations not suppressed by the filter set) that end directly before
   * the given rule item's token.
   * <p>
   * Mirror image of {@code getTermsAfter}: the annotation index is positioned at the begin offset
   * of the item's token annotation and then walked backwards. The first acceptable annotation
   * fixes the end offset of the "previous" position; all annotations sharing that end offset are
   * wrapped into {@link WhiskRuleItem}s. Annotations reaching beyond the end of the example's
   * slot annotation are skipped.
   *
   * @param whiskRuleItem the rule item whose direct predecessors are requested; must carry a word
   *          constraint with a token annotation
   * @param example the training example providing the CAS to search in
   * @return all alternative terms that could precede the given item (possibly empty)
   */
  private List<WhiskRuleItem> getTermsBefore(WhiskRuleItem whiskRuleItem, TextRulerExample example) {
    List<WhiskRuleItem> result = new ArrayList<WhiskRuleItem>();
    int begin = whiskRuleItem.getWordConstraint().getTokenAnnotation().getBegin();
    CAS cas = example.getDocumentCAS();

    // TODO: access type with string constant
    Type frameType = cas.getTypeSystem().getType("org.apache.uima.ruta.type.RutaFrame");
    // Dummy annotation used only to position the index iterator at offset 'begin'.
    AnnotationFS pointer = cas.createAnnotation(frameType, begin, begin);
    FSIterator iterator = cas.getAnnotationIndex().iterator(pointer);
    int nextEnd = -1;

    // ???
    // NOTE(review): stepping back twice presumably skips the positioning annotation itself and
    // lands on the first真annotation before 'begin' — the original author was unsure (see "???");
    // verify against FSIterator.iterator(fs) positioning semantics before changing.
    iterator.moveToPrevious();
    iterator.moveToPrevious();
    while (iterator.isValid()) {
      FeatureStructure fs = iterator.get();
      if (fs instanceof AnnotationFS) {
        AnnotationFS a = (AnnotationFS) fs;
        if (!filterSetWithSlotNames.contains(a.getType().getName())) {
          if (a.getEnd() > example.getAnnotation().getEnd()) {
            // Annotation reaches beyond the example's slot annotation: not a valid predecessor.
            iterator.moveToPrevious();
            continue;
          }
          if (nextEnd == -1) {
            // First acceptable annotation fixes the end offset of the "previous" position.
            nextEnd = a.getEnd();
          } else if (nextEnd != a.getEnd()) {
            // Different end offset: we moved past the previous position, stop scanning.
            break;
          }
          if (a.getEnd() >= nextEnd && a.getEnd() <= begin) {
            WhiskRuleItem term = new WhiskRuleItem(new TextRulerAnnotation(a,
                    example.getDocument(), consideredFeatures));
            result.add(term);
          }
        }
      }
      iterator.moveToPrevious();
    }
    return result;
  }
| |
| public String getResultString() { |
| if (ruleList != null) |
| return getFileHeaderString(true) + ruleList.getRulesString(""); |
| else |
| return "No results available yet!"; |
| } |
| |
| public void setParameters(Map<String, Object> params) { |
| if (TextRulerToolkit.DEBUG) |
| saveParametersToTempFolder(params); |
| |
| // TODO try catch |
| if (params.containsKey(WINDOWSIZE_KEY)) |
| windowSize = (Integer) params.get(WINDOWSIZE_KEY); |
| |
| if (params.containsKey(ERROR_THRESHOLD_KEY)) |
| errorThreshold = (Float) params.get(ERROR_THRESHOLD_KEY); |
| |
| if (params.containsKey(POSTAG_ROOTTYPE_KEY)) |
| posTagRootTypeName = (String) params.get(POSTAG_ROOTTYPE_KEY); |
| |
| if (params.containsKey(CONSIDERED_FEATURES)) { |
| String list = (String) params.get(CONSIDERED_FEATURES); |
| if (!StringUtils.isBlank(list)) { |
| String[] split = list.split(","); |
| for (String string : split) { |
| String trim = string.trim(); |
| if (!StringUtils.isBlank(trim)) { |
| consideredFeatures.add(trim); |
| } |
| } |
| } |
| } |
| |
| } |
| |
| public List<List<WhiskRuleItem>> getTermsWithinBounds(int startPos, int endPos, |
| TextRulerExample example) { |
| List<List<WhiskRuleItem>> result = new ArrayList<List<WhiskRuleItem>>(); |
| CAS cas = example.getDocumentCAS(); |
| // TODO: access type with string constant |
| Type frameType = cas.getTypeSystem().getType("org.apache.uima.ruta.type.RutaFrame"); |
| AnnotationFS pointer = cas.createAnnotation(frameType, startPos, endPos); |
| FSIterator iterator = cas.getAnnotationIndex().iterator(pointer); |
| List<AnnotationFS> startAs = new ArrayList<AnnotationFS>(); |
| int firstBegin = -1; |
| while (iterator.isValid()) { |
| FeatureStructure fs = iterator.get(); |
| AnnotationFS a = (AnnotationFS) fs; |
| |
| // TODO change for multislot rules! |
| if (a.getBegin() >= startPos && a.getEnd() <= endPos) { |
| if (!filterSetWithSlotNames.contains(a.getType().getName())) { |
| if (firstBegin == -1) { |
| firstBegin = a.getBegin(); |
| } else if (firstBegin != a.getBegin()) { |
| break; |
| } |
| if (a.getBegin() == firstBegin) |
| startAs.add(a); |
| } |
| iterator.moveToNext(); |
| } else { |
| iterator.moveToNext(); |
| } |
| } |
| |
| for (AnnotationFS annotation : startAs) { |
| List<WhiskRuleItem> startList = new ArrayList<WhiskRuleItem>(); |
| WhiskRuleItem term = new WhiskRuleItem(new TextRulerAnnotation(annotation, |
| example.getDocument(), consideredFeatures)); |
| startList.add(term); |
| result.add(startList); |
| } |
| |
| result = addFollowing(result, endPos, example); |
| return result; |
| } |
| |
  /**
   * Recursively extends each candidate term sequence with the terms that directly follow its last
   * element, as long as those followers end at or before the given boundary offset.
   * <p>
   * If the last element of any sequence has no followers, or a follower crosses the boundary,
   * the input lists are returned unchanged.
   *
   * @param lists the candidate term sequences built so far (each non-empty)
   * @param till end offset (inclusive) up to which followers may extend
   * @param example the training example providing the CAS to search in
   * @return the (possibly extended) candidate sequences
   */
  private List<List<WhiskRuleItem>> addFollowing(List<List<WhiskRuleItem>> lists, int till,
          TextRulerExample example) {
    List<List<WhiskRuleItem>> result = new ArrayList<List<WhiskRuleItem>>();
    for (List<WhiskRuleItem> list : lists) {
      WhiskRuleItem last = list.get(list.size() - 1);
      List<WhiskRuleItem> termsAfter = getTermsAfter(last, example);
      if (termsAfter.isEmpty()) {
        // Nothing follows the last term: the sequences cannot be extended any further.
        return lists;
      }
      for (WhiskRuleItem eachAfter : termsAfter) {
        if (eachAfter.getWordConstraint().getTokenAnnotation().getEnd() <= till) {
          List<WhiskRuleItem> newList = new ArrayList<WhiskRuleItem>();
          newList.addAll(list);
          newList.add(eachAfter);
          result.add(newList);
          // NOTE(review): recursing on the accumulated 'result' inside the inner loop feeds
          // already-extended sequences back into the next recursion step — verify this
          // re-expansion (rather than recursing once after both loops) is intended.
          result = addFollowing(result, till, example);
        } else {
          // Follower crosses the boundary: keep the sequences as they are.
          return lists;
        }
      }
    }
    return result;
  }
| |
| // TODO share this between algorithms (e.g. LP2 and RAPIER, WHISK ?) and |
| // make a maximum size of the cache, etc. like CasCache? |
| protected void testRulesIfNotCached(List<TextRulerRule> rules) { |
| List<TextRulerRule> rulesToTest = new ArrayList<TextRulerRule>(); |
| |
| for (TextRulerRule r : rules) { |
| String key = r.getRuleString(); |
| if (cachedTestedRuleStatistics.containsKey(key)) { |
| r.setCoveringStatistics(cachedTestedRuleStatistics.get(key).copy()); |
| } else |
| rulesToTest.add(r); |
| } |
| |
| if (rulesToTest.size() > 0) { |
| testRulesOnDocumentSet(rulesToTest, exampleDocuments); |
| if (shouldAbort()) |
| return; |
| for (TextRulerRule r : rulesToTest) { |
| String key = r.getRuleString(); |
| cachedTestedRuleStatistics.put(key, r.getCoveringStatistics().copy()); |
| } |
| } |
| } |
| |
| private int getElementIndex(WhiskRule proposedRule, WhiskRuleItem term) { |
| if (term == null) |
| return -1; |
| int index = 0; |
| int result = -1; |
| for (TextRulerRuleItem i : proposedRule.getPatterns().get(0).preFillerPattern) { |
| if (((WhiskRuleItem) i).equals(term)) { |
| result = index; |
| } |
| index++; |
| } |
| for (TextRulerRuleItem i : proposedRule.getPatterns().get(0).fillerPattern) { |
| if (((WhiskRuleItem) i).equals(term)) { |
| result = index; |
| } |
| index++; |
| } |
| for (TextRulerRuleItem i : proposedRule.getPatterns().get(0).postFillerPattern) { |
| if (((WhiskRuleItem) i).equals(term)) { |
| result = index; |
| } |
| index++; |
| } |
| return result; |
| } |
| |
| private boolean isNextValidNeighbor(WhiskRuleItem left, WhiskRuleItem right, |
| TextRulerExample example) { |
| CAS cas = example.getDocumentCAS(); |
| // TODO: access type with string constant |
| Type frameType = cas.getTypeSystem().getType("org.apache.uima.ruta.type.RutaFrame"); |
| int begin = left.getWordConstraint().getTokenAnnotation().getEnd(); |
| int end = right.getWordConstraint().getTokenAnnotation().getBegin(); |
| AnnotationFS pointer = cas.createAnnotation(frameType, begin, end); |
| FSIterator iterator = cas.getAnnotationIndex().iterator(pointer); |
| while (iterator.isValid()) { |
| FeatureStructure fs = iterator.get(); |
| AnnotationFS a = (AnnotationFS) fs; |
| if (a.getBegin() >= begin && a.getEnd() <= end) { |
| if (!filterSetWithSlotNames.contains(a.getType().getName())) { |
| return false; |
| } |
| } |
| iterator.moveToNext(); |
| } |
| return true; |
| } |
| } |