| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| package org.apache.uima.ruta.textruler.learner.rapier; |
| |
| import java.util.ArrayList; |
| import java.util.HashMap; |
| import java.util.HashSet; |
| import java.util.Iterator; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.Random; |
| import java.util.Set; |
| |
| import org.apache.uima.cas.CAS; |
| import org.apache.uima.cas.Type; |
| import org.apache.uima.cas.TypeSystem; |
| import org.apache.uima.cas.text.AnnotationFS; |
| import org.apache.uima.ruta.textruler.core.TextRulerAnnotation; |
| import org.apache.uima.ruta.textruler.core.TextRulerBasicLearner; |
| import org.apache.uima.ruta.textruler.core.TextRulerExample; |
| import org.apache.uima.ruta.textruler.core.TextRulerRule; |
| import org.apache.uima.ruta.textruler.core.TextRulerRuleItem; |
| import org.apache.uima.ruta.textruler.core.TextRulerRuleList; |
| import org.apache.uima.ruta.textruler.core.TextRulerRulePattern; |
| import org.apache.uima.ruta.textruler.core.TextRulerStatisticsCollector; |
| import org.apache.uima.ruta.textruler.core.TextRulerTarget; |
| import org.apache.uima.ruta.textruler.core.TextRulerToolkit; |
| import org.apache.uima.ruta.textruler.core.TextRulerWordConstraint; |
| import org.apache.uima.ruta.textruler.extension.TextRulerLearnerDelegate; |
| |
| public class Rapier extends TextRulerBasicLearner { |
| |
| public final static String COMPRESSION_FAIL_MAX_COUNT_KEY = "compressionFailMaxCount"; |
| |
| public final static String RULELIST_SIZE_KEY = "ruleListSize"; |
| |
| public final static String PAIR_COUNT_KEY = "pairCount"; |
| |
| public final static String LIM_NO_IMPROVEMENTS_KEY = "limNoImprovements"; |
| |
| public final static String NOISE_THESHOLD_KEY = "noiseThreshold"; |
| |
| public final static String POSTAG_ROOTTYPE_KEY = "posTagRootType"; |
| |
| public final static String MIN_COVERED_POSITIVES_KEY = "minCoveredPositives"; |
| |
| public final static String USE_ALL_GENSETS_AT_SPECIALIZATION_KEY = "useAllGenSetsAtSpecialization"; |
| |
| public final static int STANDARD_COMPRESSION_FAIL_MAX_COUNT = 3; |
| |
| public final static int STANDARD_RULELIST_SIZE = 50; |
| |
| public final static int STANDARD_PAIR_COUNT = 4; |
| |
| public final static int STANDARD_LIM_NO_IMPROVEMENTS = 3; |
| |
| public final static float STANDARD_NOISE_THREHSOLD = 0.9f; |
| |
| public final static String STANDARD_POSTAG_ROOTTYPE = "org.apache.uima.ml.ML.postag"; |
| |
| public final static int STANDARD_MIN_COVERED_POSITIVES = 1; |
| |
| public final static boolean STANDARD_USE_ALL_GENSETS_AT_SPECIALIZATION = true; |
| |
| private int compressionFailMaxCount = STANDARD_COMPRESSION_FAIL_MAX_COUNT; |
| |
| private int ruleListSize = STANDARD_RULELIST_SIZE; |
| |
| private int pairCount = STANDARD_PAIR_COUNT; |
| |
| private int limNoImprovements = STANDARD_LIM_NO_IMPROVEMENTS; |
| |
| private float noiseThreshold = STANDARD_NOISE_THREHSOLD; |
| |
| private String posTagRootTypeName = STANDARD_POSTAG_ROOTTYPE; |
| |
| private int minCoveredPositives = STANDARD_MIN_COVERED_POSITIVES; |
| |
| private boolean useAllGenSetsAtSpecialization = STANDARD_USE_ALL_GENSETS_AT_SPECIALIZATION; |
| |
| private Map<String, TextRulerStatisticsCollector> cachedTestedRuleStatistics = new HashMap<String, TextRulerStatisticsCollector>(); |
| |
| private int initialRuleBaseSize; |
| |
| private List<TextRulerExample> examples; |
| |
| private TextRulerRuleList slotRules; |
| |
| private RapierRulePriorityQueue ruleList; |
| |
| private String currentSlotName; |
| |
| public Rapier(String inputDir, String prePropTMFile, String tmpDir, String[] slotNames, |
| Set<String> filterSet, boolean skip, TextRulerLearnerDelegate delegate) { |
| super(inputDir, prePropTMFile, tmpDir, slotNames, filterSet, skip, delegate); |
| } |
| |
| @Override |
| protected void doRun() { |
| for (int i = 0; i < slotNames.length; i++) { |
| int compressionFailCount = 0; |
| |
| // only working for one slot yet ! |
| currentSlotName = slotNames[i]; |
| cachedTestedRuleStatistics.clear(); |
| exampleDocuments.createExamplesForTarget(new TextRulerTarget(currentSlotName, this)); |
| examples = exampleDocuments.getAllPositiveExamples(); |
| |
| if (shouldAbort()) |
| return; |
| |
| slotRules = new TextRulerRuleList(); |
| ruleList = new RapierRulePriorityQueue(ruleListSize); |
| |
| TextRulerToolkit.log("--- RAPIER START for Slot " + currentSlotName); |
| |
| sendStatusUpdateToDelegate("Creating initial rule base...", |
| TextRulerLearnerState.ML_INITIALIZING, false); |
| |
| fillSlotRulesWithMostSpecificRules(); |
| |
| updateCompressionStatusString(); |
| |
| if (TextRulerToolkit.DEBUG) { |
| slotRules.saveToRulesFile(getIntermediateRulesFileName(), getFileHeaderString(true)); |
| } |
| |
| while (compressionFailCount < compressionFailMaxCount) { |
| TextRulerToolkit.log("***** NEW COMPRESSION ROUND; FailCount = " + compressionFailCount); |
| if (shouldAbort()) { |
| return; |
| } |
| |
| RapierRule bestRule = findNewRule(); |
| if (bestRule != null |
| && (bestRule.getCoveringStatistics().getCoveredPositivesCount() >= minCoveredPositives) |
| && (bestRule.noiseValue() >= noiseThreshold) && (!slotRules.contains(bestRule))) { |
| addRuleAndRemoveEmpiricallySubsumedRules(bestRule); |
| if (TextRulerToolkit.DEBUG) |
| slotRules.saveToRulesFile(getIntermediateRulesFileName(), getFileHeaderString(true)); |
| } else { |
| compressionFailCount++; |
| } |
| } |
| |
| if (TextRulerToolkit.DEBUG) { |
| slotRules.saveToRulesFile(getIntermediateRulesFileName(), getFileHeaderString(true)); |
| } |
| } |
| |
| sendStatusUpdateToDelegate("Done", TextRulerLearnerState.ML_DONE, true); |
| cachedTestedRuleStatistics.clear(); |
| TextRulerToolkit.log("--- RAPIER END"); |
| |
| } |
| |
| private void updateCompressionStatusString() { |
| double percent = Math.round((slotRules.size() / (double) initialRuleBaseSize) * 100.0); |
| sendStatusUpdateToDelegate("Compressing... (Rules = " + slotRules.size() + "/" |
| + initialRuleBaseSize + " = " + percent + " % ratio)", |
| TextRulerLearnerState.ML_RUNNING, true); |
| // TODO also show round numbers and compression fail count and such |
| // things! |
| } |
| |
| private void addAvailablePosTagConstraintToItem(RapierRuleItem item, |
| AnnotationFS tokenAnnotation, TextRulerExample example) { |
| |
| if (posTagRootTypeName != null && posTagRootTypeName.length() > 0) { |
| CAS cas = example.getDocumentCAS(); |
| TypeSystem ts = cas.getTypeSystem(); |
| Type posTagsRootType = ts.getType(posTagRootTypeName); |
| if (ts != null) { |
| List<AnnotationFS> posTagAnnotations = TextRulerToolkit.getAnnotationsWithinBounds(cas, |
| tokenAnnotation.getBegin(), tokenAnnotation.getEnd(), null, posTagsRootType); |
| if (posTagAnnotations.size() > 0) { |
| AnnotationFS posTag = posTagAnnotations.get(0); |
| if (posTag.getBegin() == tokenAnnotation.getBegin() |
| && posTag.getEnd() == tokenAnnotation.getEnd()) |
| item.addTagConstraint(posTag.getType().getShortName()); |
| } |
| } |
| } |
| } |
| |
| private void fillSlotRulesWithMostSpecificRules() { |
| slotRules.clear(); |
| for (TextRulerExample example : examples) { |
| RapierRule rule = new RapierRule(this, example.getTarget()); |
| TextRulerAnnotation slotAnnotation = example.getAnnotation(); |
| CAS docCas = example.getDocumentCAS(); |
| TypeSystem ts = docCas.getTypeSystem(); |
| Type tokensRootType = ts.getType(TextRulerToolkit.RUTA_ANY_TYPE_NAME); |
| |
| // first, get all words/tokens: |
| List<AnnotationFS> before = TextRulerToolkit.getAnnotationsBeforePosition( |
| example.getDocumentCAS(), slotAnnotation.getBegin(), -1, |
| TextRulerToolkit.getFilterSetWithSlotNames(slotNames, filterSet), tokensRootType); |
| List<AnnotationFS> after = TextRulerToolkit.getAnnotationsAfterPosition( |
| example.getDocumentCAS(), slotAnnotation.getEnd(), -1, |
| TextRulerToolkit.getFilterSetWithSlotNames(slotNames, filterSet), tokensRootType); |
| List<AnnotationFS> inside = TextRulerToolkit.getAnnotationsWithinBounds( |
| example.getDocumentCAS(), slotAnnotation.getBegin(), slotAnnotation.getEnd(), |
| TextRulerToolkit.getFilterSetWithSlotNames(slotNames, filterSet), tokensRootType); |
| |
| // the before annotations have to be reversed: |
| for (int i = before.size() - 1; i >= 0; i--) { |
| AnnotationFS afs = before.get(i); |
| RapierRuleItem ruleItem = new RapierRuleItem(); |
| ruleItem.addWordConstraint(new TextRulerWordConstraint(new TextRulerAnnotation(afs, example |
| .getDocument()))); |
| addAvailablePosTagConstraintToItem(ruleItem, afs, example); |
| rule.addPreFillerItem(ruleItem); |
| } |
| |
| for (AnnotationFS afs : inside) { |
| RapierRuleItem ruleItem = new RapierRuleItem(); |
| ruleItem.addWordConstraint(new TextRulerWordConstraint(new TextRulerAnnotation(afs, example |
| .getDocument()))); |
| addAvailablePosTagConstraintToItem(ruleItem, afs, example); |
| rule.addFillerItem(ruleItem); |
| } |
| for (AnnotationFS afs : after) { |
| RapierRuleItem ruleItem = new RapierRuleItem(); |
| ruleItem.addWordConstraint(new TextRulerWordConstraint(new TextRulerAnnotation(afs, example |
| .getDocument()))); |
| addAvailablePosTagConstraintToItem(ruleItem, afs, example); |
| rule.addPostFillerItem(ruleItem); |
| } |
| |
| // TextRulerToolkit.log("RULE: "+rule.getRuleString()); |
| // testRuleOnTrainingsSet(rule, exampleDocuments.getDocuments()); |
| |
| // this rule has to at least cover its seed example!! |
| TextRulerStatisticsCollector c = new TextRulerStatisticsCollector(); |
| c.addCoveredPositive(example); |
| rule.setCoveringStatistics(c); |
| slotRules.add(rule); |
| } |
| initialRuleBaseSize = slotRules.size(); |
| } |
| |
| protected void addRuleAndRemoveEmpiricallySubsumedRules(RapierRule rule) { |
| if (!slotRules.contains(rule)) { |
| List<TextRulerRule> rulesToRemove = new ArrayList<TextRulerRule>(); |
| Set<TextRulerExample> coveredExamples = rule.getCoveringStatistics() |
| .getCoveredPositiveExamples(); |
| for (TextRulerRule r : slotRules) { |
| if (coveredExamples.containsAll(r.getCoveringStatistics().getCoveredPositiveExamples())) |
| rulesToRemove.add(r); |
| } |
| for (TextRulerRule removeR : rulesToRemove) |
| slotRules.remove(removeR); |
| slotRules.add(rule); |
| updateCompressionStatusString(); |
| } |
| } |
| |
| protected RapierRule findNewRule() { |
| Random rand = new Random(System.currentTimeMillis()); |
| |
| Set<RapierRule> generalizations = new HashSet<RapierRule>(); |
| // 0. initialization |
| ruleList.clear(); |
| |
| if (slotRules.size() <= 1) |
| return null; |
| |
| List<RapierRule> uncompressedRules = new ArrayList<RapierRule>(); |
| for (TextRulerRule r : slotRules) { |
| if (((RapierRule) r).isInitialRule()) |
| uncompressedRules.add((RapierRule) r); |
| } |
| |
| // 1. get generalizations of the two slot filler patterns: |
| |
| // create pairs and prefer still uncompressed rules when choosing |
| // "randomly": |
| int pairsLeft = pairCount; |
| if (uncompressedRules.size() == 1) { |
| RapierRule rule1 = uncompressedRules.get(0); |
| RapierRule rule2 = null; |
| while (rule2 == null || rule1 == rule2) { |
| rule2 = (RapierRule) slotRules.get(rand.nextInt(slotRules.size())); |
| } |
| generalizations.addAll(getFillerGeneralizationsForRulePair(rule1, rule2)); |
| if (shouldAbort()) |
| return null; |
| pairsLeft--; |
| } else if (uncompressedRules.size() == 2) { |
| RapierRule rule1 = uncompressedRules.get(0); |
| RapierRule rule2 = uncompressedRules.get(1); |
| generalizations.addAll(getFillerGeneralizationsForRulePair(rule1, rule2)); |
| if (shouldAbort()) |
| return null; |
| pairsLeft--; |
| } else if (uncompressedRules.size() > 2) { |
| int uPairCount = pairCount; |
| if (uPairCount > uncompressedRules.size()) |
| uPairCount /= 2; |
| for (int i = 0; i < uPairCount; i++) { |
| RapierRule rule1 = uncompressedRules.get(rand.nextInt(uncompressedRules.size())); |
| RapierRule rule2 = null; |
| while (rule2 == null || rule1 == rule2) { |
| rule2 = uncompressedRules.get(rand.nextInt(uncompressedRules.size())); |
| } |
| generalizations.addAll(getFillerGeneralizationsForRulePair(rule1, rule2)); |
| pairsLeft--; |
| } |
| } |
| |
| for (int i = 0; i < pairsLeft; i++) { |
| // TODO optimize !! don't call the machinery with the same rule pair |
| // two times in one session !!! |
| // randomly pick two rules: |
| RapierRule rule1 = (RapierRule) slotRules.get(rand.nextInt(slotRules.size())); |
| RapierRule rule2 = null; |
| while (rule2 == null || rule1 == rule2) { |
| rule2 = (RapierRule) slotRules.get(rand.nextInt(slotRules.size())); |
| } |
| generalizations.addAll(getFillerGeneralizationsForRulePair(rule1, rule2)); |
| |
| if (shouldAbort()) |
| return null; |
| } |
| |
| // if (TextRulerToolkit.DEBUG) |
| // { |
| // TextRulerToolkit.log("Rule Generalizations created: " + |
| // generalizations.size()); |
| // for (RapierRule newRule : generalizations) |
| // TextRulerToolkit.log("Rule = "+newRule.getRuleString()); |
| // } |
| |
| // 2. evaluate an enque to priority list: |
| List<RapierRule> testRules = new ArrayList<RapierRule>(generalizations); |
| |
| for (RapierRule r : testRules) { |
| r.combineSenselessPatternListItems(); |
| } |
| |
| testRulesIfNotCached(testRules); |
| if (shouldAbort()) |
| return null; |
| |
| for (RapierRule newRule : generalizations) { |
| if (TextRulerToolkit.DEBUG) { |
| if (!RapierDebugHelper.debugCheckIfRuleCoversItsSeedRuleCoverings(newRule)) { |
| TextRulerToolkit |
| .log("------------------------------------------------------------------------------------------"); |
| TextRulerToolkit |
| .log("ERROR, A RULE HAS TO COVER AT LEAST EVERY POSITIVE EXAMPLE OF ITS TWO SEED RULES!!!"); |
| TextRulerToolkit.log("\t RULE: " + newRule.getRuleString()); |
| TextRulerToolkit.log("\t Parent1: " + newRule.getParent1().getRuleString()); |
| TextRulerToolkit.log("\t Parent2: " + newRule.getParent2().getRuleString()); |
| TextRulerToolkit.log("--------"); |
| TextRulerToolkit.log("+RuleCovering: " |
| + newRule.getCoveringStatistics().getCoveredPositiveExamples()); |
| TextRulerToolkit.log("+P1Covering : " |
| + newRule.getParent1().getCoveringStatistics().getCoveredPositiveExamples()); |
| TextRulerToolkit.log("+P2Covering : " |
| + newRule.getParent2().getCoveringStatistics().getCoveredPositiveExamples()); |
| |
| } |
| } |
| ruleList.add(newRule); |
| } |
| |
| // 3. specialize pre and post fillers: |
| int n = 0; |
| double bestValue = Double.MAX_VALUE; |
| int noImprovementCounter = 0; |
| while (true) { |
| n++; |
| TextRulerToolkit.log(" --- NEW SPECIALIZATOIN ROUND; n = " + n + " noImprovementCounter = " |
| + noImprovementCounter); |
| List<RapierRule> newRuleList = new ArrayList<RapierRule>(); |
| for (RapierRule curRule : ruleList) { |
| |
| List<RapierRule> specTestRules = new ArrayList<RapierRule>(specializePreFiller(curRule, n)); |
| |
| for (RapierRule r : specTestRules) |
| r.combineSenselessPatternListItems(); |
| |
| testRulesIfNotCached(specTestRules); |
| if (shouldAbort()) |
| return null; |
| |
| for (RapierRule r : specTestRules) |
| newRuleList.add(r); |
| } |
| ruleList.addAll(newRuleList); |
| |
| newRuleList.clear(); |
| for (RapierRule curRule : ruleList) { |
| |
| List<RapierRule> specTestRules = new ArrayList<RapierRule>(specializePostFiller(curRule, n)); |
| |
| for (RapierRule r : specTestRules) |
| r.combineSenselessPatternListItems(); |
| |
| testRulesIfNotCached(specTestRules); |
| if (shouldAbort()) |
| return null; |
| |
| for (RapierRule r : specTestRules) |
| newRuleList.add(r); |
| } |
| ruleList.addAll(newRuleList); |
| |
| RapierRule bestRule = ruleList.peek(); |
| |
| if (TextRulerToolkit.DEBUG) { |
| // for (RapierRule r: ruleList) |
| // TextRulerToolkit.log("value="+r.getPriority()+" rule = "+r.getRuleString()); |
| TextRulerToolkit.log("------------------------------------"); |
| TextRulerToolkit.log("BEST RULE FOR THIS SESSION: " + bestRule.getCoveringStatistics()); |
| TextRulerToolkit.log(bestRule.getRuleString()); |
| TextRulerToolkit.log("------------------------------------"); |
| } |
| if (bestRule.producesOnlyValidFillers()) |
| break; // todo: horizon effects ?? |
| |
| if (bestRule.getPriority() < bestValue) { |
| noImprovementCounter = 0; |
| bestValue = bestRule.getPriority(); |
| } else { |
| noImprovementCounter++; |
| if (noImprovementCounter > limNoImprovements) |
| break; |
| } |
| } |
| |
| RapierRule bestRule = ruleList.peek(); |
| return bestRule; |
| } |
| |
| private List<RapierRule> getFillerGeneralizationsForRulePair(RapierRule rule1, RapierRule rule2) { |
| TextRulerToolkit |
| .log("------------------------------------------------------------------------------------------"); |
| TextRulerToolkit.log("getFillerGeneralizationsForRulePair:"); |
| TextRulerToolkit.log("Rule1: " + rule1.getRuleString()); |
| TextRulerToolkit.log("Rule2: " + rule2.getRuleString()); |
| |
| List<RapierRule> result = new ArrayList<RapierRule>(); |
| List<TextRulerRulePattern> genList = RapierGeneralizationHelper |
| .getGeneralizationsForRuleItemPatterns(rule1.getFillerPattern(), |
| rule2.getFillerPattern()); |
| // create rules: |
| for (TextRulerRulePattern pattern : genList) { |
| RapierRule newRule = new RapierRule(this, rule1.getTarget()); |
| for (TextRulerRuleItem patternItem : pattern) |
| newRule.addFillerItem(patternItem.copy()); |
| newRule.setParent1(rule1.copy()); |
| newRule.setParent1PreFiller_n(0); |
| newRule.setParent1PostFiller_n(0); |
| newRule.setParent2(rule2.copy()); |
| newRule.setParent2PreFiller_n(0); |
| newRule.setParent2PostFiller_n(0); |
| result.add(newRule); |
| newRule.setNeedsCompile(true); |
| // TextRulerToolkit.log("newRule: "+newRule.getRuleString()); |
| } |
| TextRulerToolkit.log(" getGeneralizationsForRulePair result list size = " + result.size()); |
| return result; |
| } |
| |
| public List<RapierRule> specializePreFiller(RapierRule curRule, int n) { |
| RapierRule baseRule1 = curRule.getParent1(); |
| RapierRule baseRule2 = curRule.getParent2(); |
| int n1 = curRule.getParent1PreFiller_n(); |
| int n2 = curRule.getParent2PreFiller_n(); |
| TextRulerRulePattern preFiller1 = baseRule1.getPreFillerPattern(); |
| TextRulerRulePattern preFiller2 = baseRule2.getPreFillerPattern(); |
| int preFiller1MaxIndex = preFiller1.size() - n1 - 1; |
| int preFiller2MaxIndex = preFiller2.size() - n2 - 1; |
| |
| // generate 3 different possible sets for generalizations: |
| |
| // 1. n vs. n-1 (n elements of baserule1, n-1 of baserule2) |
| TextRulerRulePattern consideredPreFiller1 = new TextRulerRulePattern(); |
| TextRulerRulePattern consideredPreFiller2 = new TextRulerRulePattern(); |
| for (int i = preFiller1.size() - n; i >= 0 && i <= preFiller1MaxIndex; i++) |
| consideredPreFiller1.add(preFiller1.get(i)); |
| for (int i = preFiller2.size() - n + 1; i >= 0 && i <= preFiller2MaxIndex; i++) |
| consideredPreFiller2.add(preFiller2.get(i)); |
| List<TextRulerRulePattern> genList1 = null; |
| if (consideredPreFiller1.size() + consideredPreFiller2.size() > 0) |
| genList1 = RapierGeneralizationHelper.getGeneralizationsForRuleItemPatterns( |
| consideredPreFiller1, consideredPreFiller2); |
| |
| List<TextRulerRulePattern> genList2 = null; |
| List<TextRulerRulePattern> genList3 = null; |
| |
| if (useAllGenSetsAtSpecialization) // due to performance reasons the |
| // user can switch this off |
| { |
| // 2. n-1 vs. n (n-1 elements of baserule1, n of baserule2) |
| consideredPreFiller1.clear(); |
| consideredPreFiller2.clear(); |
| for (int i = preFiller1.size() - n + 1; i >= 0 && i <= preFiller1MaxIndex; i++) |
| consideredPreFiller1.add(preFiller1.get(i)); |
| for (int i = preFiller2.size() - n; i >= 0 && i <= preFiller2MaxIndex; i++) |
| consideredPreFiller2.add(preFiller2.get(i)); |
| |
| if (consideredPreFiller1.size() + consideredPreFiller2.size() > 0) |
| genList2 = RapierGeneralizationHelper.getGeneralizationsForRuleItemPatterns( |
| consideredPreFiller1, consideredPreFiller2); |
| |
| // 3. n vs. n (n elements of baserule1, n of baserule2) |
| consideredPreFiller1.clear(); |
| consideredPreFiller2.clear(); |
| for (int i = preFiller1.size() - n; i >= 0 && i <= preFiller1MaxIndex; i++) |
| consideredPreFiller1.add(preFiller1.get(i)); |
| for (int i = preFiller2.size() - n; i >= 0 && i <= preFiller2MaxIndex; i++) |
| consideredPreFiller2.add(preFiller2.get(i)); |
| if (consideredPreFiller1.size() + consideredPreFiller2.size() > 0) |
| genList3 = RapierGeneralizationHelper.getGeneralizationsForRuleItemPatterns( |
| consideredPreFiller1, consideredPreFiller2); |
| } |
| |
| // TODO optimize and don't store all 3 genLists ! but for debugging |
| // purposes we keep them for now ! |
| Set<TextRulerRulePattern> genSet = new HashSet<TextRulerRulePattern>(); |
| if (genList1 != null) |
| genSet.addAll(genList1); |
| if (genList2 != null) |
| genSet.addAll(genList2); |
| if (genList3 != null) |
| genSet.addAll(genList3); |
| |
| List<RapierRule> resultRules = new ArrayList<RapierRule>(); |
| |
| for (TextRulerRulePattern l : genSet) { |
| RapierRule newRule = curRule.copy(); |
| for (int i = l.size() - 1; i >= 0; i--) |
| newRule.addPreFillerItem(l.get(i)); |
| newRule.setParent1PreFiller_n(n); |
| newRule.setParent2PreFiller_n(n); |
| resultRules.add(newRule); |
| } |
| return resultRules; |
| } |
| |
| // n = 1..maxN |
| public List<RapierRule> specializePostFiller(RapierRule curRule, int n) { |
| if (n == 0) { |
| TextRulerToolkit.log("ERROR ! N SHOULD NOT BE 0!"); |
| } |
| RapierRule baseRule1 = curRule.getParent1(); |
| RapierRule baseRule2 = curRule.getParent2(); |
| int n1 = curRule.getParent1PostFiller_n(); |
| int n2 = curRule.getParent2PostFiller_n(); |
| TextRulerRulePattern postFiller1 = baseRule1.getPostFillerPattern(); |
| TextRulerRulePattern postFiller2 = baseRule2.getPostFillerPattern(); |
| int postFiller1MinIndex = n1; |
| int postFiller2MinIndex = n2; |
| |
| // generate 3 different possible sets for generalizations: |
| |
| // 1. n vs. n-1 (n elements of baserule1, n-1 of baserule2) |
| TextRulerRulePattern consideredPostFiller1 = new TextRulerRulePattern(); |
| TextRulerRulePattern consideredPostFiller2 = new TextRulerRulePattern(); |
| for (int i = postFiller1MinIndex; i < postFiller1.size() && i < n; i++) |
| consideredPostFiller1.add(postFiller1.get(i)); |
| for (int i = postFiller2MinIndex; i < postFiller2.size() && i < n - 1; i++) |
| consideredPostFiller2.add(postFiller2.get(i)); |
| List<TextRulerRulePattern> genList1 = null; |
| if (consideredPostFiller1.size() + consideredPostFiller2.size() > 0) |
| genList1 = RapierGeneralizationHelper.getGeneralizationsForRuleItemPatterns( |
| consideredPostFiller1, consideredPostFiller2); |
| |
| // 2. n-1 vs. n (n-1 elements of baserule1, n of baserule2) |
| consideredPostFiller1.clear(); |
| consideredPostFiller2.clear(); |
| for (int i = postFiller1MinIndex; i < postFiller1.size() && i < n - 1; i++) |
| consideredPostFiller1.add(postFiller1.get(i)); |
| for (int i = postFiller2MinIndex; i < postFiller2.size() && i < n; i++) |
| consideredPostFiller2.add(postFiller2.get(i)); |
| List<TextRulerRulePattern> genList2 = null; |
| if (consideredPostFiller1.size() + consideredPostFiller2.size() > 0) |
| genList2 = RapierGeneralizationHelper.getGeneralizationsForRuleItemPatterns( |
| consideredPostFiller1, consideredPostFiller2); |
| |
| // 3. n vs. n (n elements of baserule1, n of baserule2) |
| consideredPostFiller1.clear(); |
| consideredPostFiller2.clear(); |
| for (int i = postFiller1MinIndex; i < postFiller1.size() && i < n; i++) |
| consideredPostFiller1.add(postFiller1.get(i)); |
| for (int i = postFiller2MinIndex; i < postFiller2.size() && i < n; i++) |
| consideredPostFiller2.add(postFiller2.get(i)); |
| List<TextRulerRulePattern> genList3 = null; |
| if (consideredPostFiller1.size() + consideredPostFiller2.size() > 0) |
| genList3 = RapierGeneralizationHelper.getGeneralizationsForRuleItemPatterns( |
| consideredPostFiller1, consideredPostFiller2); |
| |
| // TODO optimize and don't store all 3 genLists ! but for debugging |
| // purposes we keep them for now ! |
| Set<TextRulerRulePattern> genSet = new HashSet<TextRulerRulePattern>(); |
| if (genList1 != null) |
| genSet.addAll(genList1); |
| if (genList2 != null) |
| genSet.addAll(genList2); |
| if (genList3 != null) |
| genSet.addAll(genList3); |
| |
| List<RapierRule> resultRules = new ArrayList<RapierRule>(); |
| |
| for (TextRulerRulePattern l : genSet) { |
| RapierRule newRule = curRule.copy(); |
| for (TextRulerRuleItem t : l) |
| newRule.addPostFillerItem(t); |
| newRule.setParent1PostFiller_n(n); |
| newRule.setParent2PostFiller_n(n); |
| resultRules.add(newRule); |
| } |
| return resultRules; |
| } |
| |
| @Override |
| public boolean collectNegativeCoveredInstancesWhenTesting() { |
| return false; |
| } |
| |
| public String getResultString() { |
| if (slotRules != null) |
| return slotRules.getTMFileString(getFileHeaderString(true), 1000); // if |
| // a |
| // rule |
| // is |
| // >100 |
| // characters, |
| // it |
| // gets |
| // replaced |
| // by |
| // a |
| // placeholder |
| else |
| return "No results available yet!"; |
| } |
| |
| public void setParameters(Map<String, Object> params) { |
| if (TextRulerToolkit.DEBUG) |
| saveParametersToTempFolder(params); |
| |
| // TODO add try catch |
| if (params.containsKey(COMPRESSION_FAIL_MAX_COUNT_KEY)) |
| compressionFailMaxCount = (Integer) params.get(COMPRESSION_FAIL_MAX_COUNT_KEY); |
| |
| if (params.containsKey(RULELIST_SIZE_KEY)) |
| ruleListSize = (Integer) params.get(RULELIST_SIZE_KEY); |
| |
| if (params.containsKey(PAIR_COUNT_KEY)) |
| pairCount = (Integer) params.get(PAIR_COUNT_KEY); |
| |
| if (params.containsKey(LIM_NO_IMPROVEMENTS_KEY)) |
| limNoImprovements = (Integer) params.get(LIM_NO_IMPROVEMENTS_KEY); |
| |
| if (params.containsKey(NOISE_THESHOLD_KEY)) |
| noiseThreshold = (Float) params.get(NOISE_THESHOLD_KEY); |
| |
| if (params.containsKey(POSTAG_ROOTTYPE_KEY)) |
| posTagRootTypeName = (String) params.get(POSTAG_ROOTTYPE_KEY); |
| |
| if (params.containsKey(MIN_COVERED_POSITIVES_KEY)) |
| minCoveredPositives = (Integer) params.get(MIN_COVERED_POSITIVES_KEY); |
| |
| if (params.containsKey(USE_ALL_GENSETS_AT_SPECIALIZATION_KEY)) |
| useAllGenSetsAtSpecialization = (Boolean) params.get(USE_ALL_GENSETS_AT_SPECIALIZATION_KEY); |
| } |
| |
| // TODO share this between algorithms (e.g. LP2 and RAPIER ?) and make a |
| // maximum size of the cache, etc. like CasCache? |
| protected void testRulesIfNotCached(List<RapierRule> rules) { |
| List<TextRulerRule> rulesToTest = new ArrayList<TextRulerRule>(); |
| |
| for (RapierRule r : rules) { |
| String key = r.getRuleString(); |
| if (cachedTestedRuleStatistics.containsKey(key)) { |
| r.setCoveringStatistics(cachedTestedRuleStatistics.get(key).copy()); |
| TextRulerToolkit.log("CACHE HIT; size=" + cachedTestedRuleStatistics.size()); |
| } else |
| rulesToTest.add(r); |
| } |
| |
| if (rulesToTest.size() > 0) { |
| testRulesOnDocumentSet(rulesToTest, exampleDocuments); |
| if (shouldAbort()) |
| return; |
| while (cachedTestedRuleStatistics.size() + rulesToTest.size() > 10000) // TODO |
| // lohnt |
| // sich |
| // das |
| // ? |
| // speicher |
| // beobachten |
| // !! |
| { |
| Iterator<String> it = cachedTestedRuleStatistics.keySet().iterator(); |
| if (!it.hasNext()) |
| break; |
| String removeKey = cachedTestedRuleStatistics.keySet().iterator().next(); |
| cachedTestedRuleStatistics.remove(removeKey); |
| } |
| |
| for (TextRulerRule r : rulesToTest) { |
| String key = r.getRuleString(); |
| cachedTestedRuleStatistics.put(key, r.getCoveringStatistics().copy()); |
| } |
| } |
| } |
| |
| } |