/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.uima.ruta.textruler.learner.wien;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.TypeSystem;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.ruta.textruler.TextRulerPlugin;
import org.apache.uima.ruta.textruler.core.TextRulerAnnotation;
import org.apache.uima.ruta.textruler.core.TextRulerBasicLearner;
import org.apache.uima.ruta.textruler.core.TextRulerExample;
import org.apache.uima.ruta.textruler.core.TextRulerExampleDocument;
import org.apache.uima.ruta.textruler.core.TextRulerRuleItem;
import org.apache.uima.ruta.textruler.core.TextRulerRulePattern;
import org.apache.uima.ruta.textruler.core.TextRulerSlotPattern;
import org.apache.uima.ruta.textruler.core.TextRulerTarget;
import org.apache.uima.ruta.textruler.core.TextRulerToolkit;
import org.apache.uima.ruta.textruler.extension.TextRulerLearner.TextRulerLearnerState; // assumed location of the nested learner-state enum (ML_RUNNING etc.) referenced below
import org.apache.uima.ruta.textruler.extension.TextRulerLearnerDelegate;
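
/**
 * Multi-slot learner that induces an HLRT-style wrapper in the spirit of WIEN: a page head
 * pattern (h), a tail pattern (t) and, for every slot k, a left/right delimiter pair (l_k, r_k).
 * The learned patterns are assembled into a single multi-slot {@link WienRule} and rendered as a
 * UIMA Ruta script by {@link #getResultString()}.
 */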
public class Wien extends TextRulerBasicLearner {
TextRulerRulePattern hPattern;
TextRulerRulePattern tPattern;
Map<String, PatternPair> headTailCache = new HashMap<String, PatternPair>();
Map<String, List<TextRulerRulePattern>> interTupelSeparatorsCache = new HashMap<String, List<TextRulerRulePattern>>();
public static class PatternPair {
public TextRulerRulePattern l = new TextRulerRulePattern();
public TextRulerRulePattern r = new TextRulerRulePattern();
}
ArrayList<PatternPair> patternPairs = new ArrayList<PatternPair>();
WienRule theRule;
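
  /**
   * Outcome of a constraint C3 test for a candidate (h, t, l1) triple; the non-success values
   * tell which candidate failed and drive the pruning in findHeadTailAndL1Patterns().
   */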
  public enum constraint3ReturnType {
    C3_SUCCESS,
    C3_L1CandidateSuffixError,
    C3_TailCandidateH_L1Error,
    C3_TailCandidateRK_PrefixError,
    C3_TailCandidateNotFoundError,
    C3_TailCandidateSucceedsL1InTailError,
    C3_L1CandidateInterTupleSeparatorSuffixError,
    C3_TailCandidatePrecedesL1InterTupleSeparatorError
  }
public Wien(String inputDir, String prePropTmFile, String tmpDir, String[] slotNames,
Set<String> filterSet, boolean skip, TextRulerLearnerDelegate delegate) {
super(inputDir, prePropTmFile, tmpDir, slotNames, filterSet, skip, delegate);
}
@Override
public boolean collectNegativeCoveredInstancesWhenTesting() {
return false;
}
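
  /**
   * Main learning loop: creates the multi-slot target examples, learns the right delimiters r_k,
   * the left delimiters l_k (for k >= 1) and finally the head, tail and l1 patterns. If all
   * patterns could be learned, they are assembled into one multi-slot WienRule.
   */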
@Override
protected void doRun() {
TextRulerToolkit.log("-- WIEN START");
headTailCache.clear();
interTupelSeparatorsCache.clear();
for (int i = 0; i < slotNames.length; i++)
patternPairs.add(new PatternPair());
TextRulerTarget target = new TextRulerTarget(slotNames, this);
    exampleDocuments.createExamplesForTarget(target); // create the new multi-slot target examples
for (TextRulerExample e : exampleDocuments.getAllPositiveExamples()) {
TextRulerToolkit.log("Example found: " + e);
}
try {
boolean allOk = true;
sendStatusUpdateToDelegate("Searching for right patterns...",
TextRulerLearnerState.ML_RUNNING, false);
if (!findRightPatterns())
allOk = false;
sendStatusUpdateToDelegate("Searching for left patterns...",
TextRulerLearnerState.ML_RUNNING, false);
if (!findLeftPatterns())
allOk = false;
sendStatusUpdateToDelegate("Searching for head, tail and left1 patterns...",
TextRulerLearnerState.ML_RUNNING, false);
if (!findHeadTailAndL1Patterns())
allOk = false;
// {
// String s = "";
// for (TextRulerRuleItem i : hPattern)
// s += " "+i;
// s += " ||||";
// for (TextRulerRuleItem i : patternPairs.get(0).l)
// s += " "+i;
// s += " ||||";
// for (TextRulerRuleItem i : tPattern)
// s += " "+i;
// TextRulerToolkit.log(s);
// }
if (allOk) {
sendStatusUpdateToDelegate("Building multi-slot rule.", TextRulerLearnerState.ML_RUNNING,
false);
theRule = new WienRule(this, target);
List<TextRulerSlotPattern> rPatterns = theRule.getPatterns();
int totalItemCount = 0;
for (int k = 0; k < slotNames.length; k++) {
WienRuleItem slotItem = new WienRuleItem((TextRulerAnnotation) null);
TextRulerSlotPattern rP = new TextRulerSlotPattern();
rPatterns.add(rP);
PatternPair p = patternPairs.get(k);
for (int i = 0; i < p.l.size(); i++) {
WienRuleItem item = (WienRuleItem) p.l.get(i);
if (k == 0 && i == 0) // the very first rule item:
{
item = item.copy();
// old version:
// item.addCondition("-NEAR,wien_tail,10000000,false");
item.addCondition("-AFTER(wien_tail)");
item.addCondition("-PARTOF(wien_rulemark)");
}
rP.preFillerPattern.add(item);
totalItemCount++;
}
rP.fillerPattern.add(slotItem.copy());
totalItemCount++;
for (int i = 0; i < p.r.size(); i++) {
WienRuleItem item = (WienRuleItem) p.r.get(i);
totalItemCount++;
            if (k == slotNames.length - 1 && i == p.r.size() - 1) // the very last rule item
{
item = item.copy();
item.addAction("MARK(wien_rulemark, 1, " + totalItemCount + ")");
}
rP.postFillerPattern.add(item);
}
          totalItemCount++; // the inter-slot ALL*? item has to be counted as well!
}
sendStatusUpdateToDelegate("Done", TextRulerLearnerState.ML_DONE, true);
} else
sendStatusUpdateToDelegate("Done - Not all patterns could be learned!",
TextRulerLearnerState.ML_DONE, true);
} catch (Exception e) {
TextRulerPlugin.error(e);
sendStatusUpdateToDelegate("Aborted due to Exception!", TextRulerLearnerState.ML_ERROR, true);
}
headTailCache.clear();
interTupelSeparatorsCache.clear();
TextRulerToolkit.log("-- WIEN END");
}
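
  /**
   * Learns the right delimiter r_k for every slot: prefixes of the first right context are tried
   * with increasing length until one satisfies constraint C1 on all documents.
   */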
protected boolean findRightPatterns() {
TextRulerExampleDocument doc = exampleDocuments.getDocuments().get(0);
boolean allFound = true;
for (int k = 0; k < slotNames.length; k++) {
List<TextRulerRulePattern> rightContexts = getRightContextForSlot(doc, k);
      TextRulerToolkit.log("first right context for slot " + k + ": " + rightContexts.get(0));
int shortest = Integer.MAX_VALUE;
for (TextRulerRulePattern p : rightContexts)
shortest = p.size() < shortest ? p.size() : shortest;
boolean found = false;
for (int len = 1; len <= shortest; len++) {
TextRulerRulePattern subPattern = rightContexts.get(0).subPattern(0, len);
if (testConstraint1(subPattern, k)) {
// for (TextRulerRuleItem i : subPattern)
// ((WienRuleItem)i).getWordConstraint().setGeneralizeLinkMarkUp(true);
patternPairs.get(k).r = subPattern;
TextRulerToolkit.log("right " + k + ": " + subPattern);
found = true;
break;
}
}
if (!found)
allFound = false;
}
return allFound;
}
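
  /**
   * Learns the left delimiter l_k for every slot except the first: suffixes of the first left
   * context are tried with increasing length until one satisfies constraint C2 on all documents.
   * l1 itself is learned together with the head and tail patterns.
   */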
protected boolean findLeftPatterns() {
TextRulerExampleDocument doc = exampleDocuments.getDocuments().get(0);
    // l1 (the left delimiter of slot 0) is learned in findHeadTailAndL1Patterns, so skip it here
if (slotNames.length < 2)
return true;
boolean allFound = true;
for (int k = 1; k < slotNames.length; k++) {
List<TextRulerRulePattern> leftContexts = getLeftContextForSlot(doc, k);
int shortest = Integer.MAX_VALUE;
for (TextRulerRulePattern p : leftContexts)
shortest = p.size() < shortest ? p.size() : shortest;
TextRulerRulePattern sourcePattern = leftContexts.get(0);
boolean found = false;
for (int len = 1; len <= shortest; len++) {
// get suffix:
TextRulerRulePattern subPattern = sourcePattern.subPattern(sourcePattern.size() - len, len);
if (testConstraint2(subPattern, k)) {
patternPairs.get(k).l = subPattern;
for (TextRulerRuleItem i : subPattern)
((WienRuleItem) i).getWordConstraint().setGeneralizeLinkMarkUp(true);
TextRulerToolkit.log("left " + k + ": " + subPattern);
found = true;
break;
}
}
if (!found)
allFound = false;
}
return allFound;
}
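
  /**
   * Learns the page head pattern h, the tail pattern t and the first left delimiter l1: candidate
   * (h, l1) pairs are generated by splitting the tokens before the first slot filler, candidate t
   * patterns are sub-patterns of the tokens after the last slot filler, and every triple is
   * checked against constraint C3, whose failure type is used to prune the search space.
   */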
protected boolean findHeadTailAndL1Patterns() {
List<TextRulerExampleDocument> docs = exampleDocuments.getDocuments();
TextRulerExampleDocument doc0 = docs.get(0);
TextRulerRulePattern head = new TextRulerRulePattern();
TextRulerRulePattern tail = new TextRulerRulePattern();
getPageHeadAndTailPortion(doc0, head, tail);
final class HLCandidate {
public TextRulerRulePattern head = new TextRulerRulePattern();
public TextRulerRulePattern l1 = new TextRulerRulePattern();
}
    // a small optimization: since l1 must be a suffix of every inter-tuple separator, its
    // maximum possible length in doc0 is bounded by the shortest separator, which is usually
    // much smaller than the head portion
List<TextRulerRulePattern> interTupleSeparators = getInterTupleSepatators(doc0);
int shortestL1 = head.size() - 1;
for (TextRulerRulePattern its : interTupleSeparators)
shortestL1 = its.size() < shortestL1 ? its.size() : shortestL1;
List<HLCandidate> hlCandidates = new ArrayList<HLCandidate>();
// create candidates for each separation of the head and tail patterns:
for (int separator = head.size() - 1; separator > 0; separator--) {
HLCandidate c = new HLCandidate();
for (int i = 0; i < head.size(); i++) {
if (i < separator)
c.head.add(head.get(i));
else {
WienRuleItem it = (WienRuleItem) head.get(i).copy();
it.getWordConstraint().setGeneralizeLinkMarkUp(true);
c.l1.add(it);
}
}
hlCandidates.add(c);
TextRulerToolkit.log(c.head.size() + " vs. " + c.l1.size());
if (c.l1.size() >= shortestL1)
break;
}
long total = 0;
// get total h l1 t combination count:
long tCand = (tail.size() * (tail.size() + 1)) / 2;
for (HLCandidate c : hlCandidates) {
total += ((c.head.size() - 1) * (c.head.size())) / 2;
}
total *= tCand;
long current = 0;
int oldPercent = -1;
for (HLCandidate c : hlCandidates) {
// for each "candidate" which represents a l1 suffix pattern of the
// head tokens and a rest pattern for the h pattern,
// we have to create every sub pattern of the remaining h pattern as
// a h candidate:
TextRulerRulePattern l1 = c.l1;
TextRulerRulePattern h = null;
boolean l1Sucks = false;
for (int endI = c.head.size() - 1; endI > 0; endI--) {
for (int startI = endI; startI > 0; startI--) {
h = new TextRulerRulePattern();
for (int i = startI; i <= endI; i++)
h.add(c.head.get(i));
// now for each h candidate we have to create each t
// candidate:
TextRulerRulePattern t = null;
for (int tstartI = 0; tstartI < tail.size(); tstartI++) {
for (int tendI = tstartI; tendI < tail.size(); tendI++) {
int percent = Math.round(((float) current * 100 / total));
if (percent != oldPercent) {
oldPercent = percent;
if (percent > 100)
percent = 100;
// TextRulerToolkit.log(current+" / "+total);
sendStatusUpdateToDelegate("Testing C3, " + percent + "%",
TextRulerLearnerState.ML_RUNNING, false);
}
if (shouldAbort())
return false;
current++;
t = new TextRulerRulePattern();
for (int i = tstartI; i <= tendI; i++)
t.add(tail.get(i));
              // now we have a possible candidate triple: h, t and l1:
constraint3ReturnType c3Result = testConstraint3(h, t, l1);
if (c3Result == constraint3ReturnType.C3_SUCCESS) {
hPattern = h;
tPattern = t;
patternPairs.get(0).l = l1;
return true;
} else if (c3Result == constraint3ReturnType.C3_L1CandidateSuffixError
|| c3Result == constraint3ReturnType.C3_L1CandidateInterTupleSeparatorSuffixError) {
l1Sucks = true;
current += tail.size() - tendI - 1;
break;
} else if (c3Result == constraint3ReturnType.C3_TailCandidateH_L1Error
|| c3Result == constraint3ReturnType.C3_TailCandidateSucceedsL1InTailError) {
// no special pruning options here... we simply
// have to test the next t-candidate
} else if (c3Result == constraint3ReturnType.C3_TailCandidateRK_PrefixError
|| c3Result == constraint3ReturnType.C3_TailCandidateNotFoundError) {
// all candidates with the same start item are
// bad, so leave this inner loop:
current += tail.size() - tendI - 1;
break;
} else if (c3Result == constraint3ReturnType.C3_TailCandidatePrecedesL1InterTupleSeparatorError) {
                // this is a problematic case: the cause could be l1 or the current tail
                // candidate, so we cannot prune anything here; just try the next t-candidate
}
}
if (l1Sucks) {
current += (tail.size() - tstartI - 1) * (tail.size() - tstartI) / 2;
break;
}
}
if (l1Sucks) {
if (startI > 0)
current += (startI - 1) * tCand;
break;
}
}
if (l1Sucks) {
current += (endI * (endI + 1) / 2) * tCand;
break;
}
}
}
return false;
}
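
  /**
   * Fills the given patterns with the tokens before the first slot filler (head) and after the
   * last slot filler (tail) of the document; the result is cached per CAS file name.
   */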
protected void getPageHeadAndTailPortion(TextRulerExampleDocument doc, TextRulerRulePattern head,
TextRulerRulePattern tail) {
String key = doc.getCasFileName();
if (headTailCache.containsKey(key)) {
PatternPair p = headTailCache.get(key);
head.addAll(p.l);
tail.addAll(p.r);
} else {
CAS cas = doc.getCAS();
TextRulerExample firstExample = doc.getPositiveExamples().get(0);
TextRulerExample lastExample = doc.getPositiveExamples().get(
doc.getPositiveExamples().size() - 1);
TypeSystem ts = cas.getTypeSystem();
Type tokenType = ts.getType(TextRulerToolkit.RUTA_ALL_TYPE_NAME);
List<AnnotationFS> headTokens = TextRulerToolkit.getAnnotationsBeforePosition(cas,
firstExample.getAnnotations()[0].getBegin(), 0, TextRulerToolkit
.getFilterSetWithSlotNames(slotNames, filterSet), tokenType);
TextRulerAnnotation[] lastExampleAnnotations = lastExample.getAnnotations();
List<AnnotationFS> tailTokens = TextRulerToolkit.getAnnotationsAfterPosition(cas,
lastExampleAnnotations[lastExampleAnnotations.length - 1].getEnd(), 0,
TextRulerToolkit.getFilterSetWithSlotNames(slotNames, filterSet), tokenType);
for (AnnotationFS afs : headTokens)
head.add(new WienRuleItem(new TextRulerAnnotation(afs, doc)));
for (AnnotationFS afs : tailTokens)
tail.add(new WienRuleItem(new TextRulerAnnotation(afs, doc)));
PatternPair p = new PatternPair();
p.l.addAll(head);
p.r.addAll(tail);
headTailCache.put(key, p);
}
}
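
  /**
   * Returns the token patterns between consecutive tuples, i.e. between the last filler of one
   * example and the first filler of the next; the result is cached per CAS file name.
   */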
protected List<TextRulerRulePattern> getInterTupleSepatators(TextRulerExampleDocument doc) {
String key = doc.getCasFileName();
if (interTupelSeparatorsCache.containsKey(key)) {
return interTupelSeparatorsCache.get(key);
} else {
List<TextRulerRulePattern> result = new ArrayList<TextRulerRulePattern>();
CAS cas = doc.getCAS();
TypeSystem ts = cas.getTypeSystem();
Type tokenType = ts.getType(TextRulerToolkit.RUTA_ALL_TYPE_NAME);
List<TextRulerExample> examples = doc.getPositiveExamples();
for (int i = 0; i < examples.size() - 1; i++) {
// get separator between i'th and (i+1)'th example:
TextRulerAnnotation[] exampleAnnotations1 = examples.get(i).getAnnotations();
TextRulerAnnotation[] exampleAnnotations2 = examples.get(i + 1).getAnnotations();
TextRulerAnnotation lastOf1 = exampleAnnotations1[exampleAnnotations1.length - 1];
TextRulerAnnotation firstOf2 = exampleAnnotations2[0];
List<AnnotationFS> theTokens = TextRulerToolkit.getAnnotationsWithinBounds(cas, lastOf1
.getEnd(), firstOf2.getBegin(), TextRulerToolkit.getFilterSetWithSlotNames(
slotNames, filterSet), tokenType);
TextRulerRulePattern thePattern = new TextRulerRulePattern();
for (AnnotationFS afs : theTokens)
thePattern.add(new WienRuleItem(new TextRulerAnnotation(afs, doc)));
if (thePattern.size() > 0)
result.add(thePattern);
}
interTupelSeparatorsCache.put(key, result);
return result;
}
}
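
  /**
   * Returns, for every example, the tokens between the given slot filler and the following slot
   * filler (which, for the last slot, is the first filler of the next tuple); for the very last
   * filler of the document all remaining tokens are used.
   */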
protected List<TextRulerRulePattern> getRightContextForSlot(TextRulerExampleDocument doc,
int slotIndex) {
List<TextRulerRulePattern> result = new ArrayList<TextRulerRulePattern>();
CAS cas = doc.getCAS();
TypeSystem ts = cas.getTypeSystem();
Type tokenType = ts.getType(TextRulerToolkit.RUTA_ALL_TYPE_NAME);
List<TextRulerExample> examples = doc.getPositiveExamples();
boolean isLastSlot = slotIndex >= slotNames.length - 1;
for (int ei = 0; ei < examples.size(); ei++) {
boolean isLastExample = ei == examples.size() - 1;
TextRulerExample e = examples.get(ei);
      // collect the tokens between slot slotIndex and slot slotIndex+1
TextRulerAnnotation slotAnnotation = e.getAnnotations()[slotIndex];
TextRulerAnnotation nextSlotAnnotation;
if (!isLastSlot)
nextSlotAnnotation = e.getAnnotations()[slotIndex + 1];
else {
if (!isLastExample) // the next slot annotation is the first
// example annotation of the next template:
nextSlotAnnotation = examples.get(ei + 1).getAnnotations()[0];
else
nextSlotAnnotation = null;
}
List<AnnotationFS> theTokens;
if (nextSlotAnnotation == null)
theTokens = TextRulerToolkit.getAnnotationsAfterPosition(cas, slotAnnotation.getEnd(), 0,
TextRulerToolkit.getFilterSetWithSlotNames(slotNames, filterSet), tokenType);
else
theTokens = TextRulerToolkit.getAnnotationsWithinBounds(cas, slotAnnotation.getEnd(),
nextSlotAnnotation.getBegin(), TextRulerToolkit.getFilterSetWithSlotNames(
slotNames, filterSet), tokenType);
TextRulerRulePattern thePattern = new TextRulerRulePattern();
for (AnnotationFS afs : theTokens)
thePattern.add(new WienRuleItem(new TextRulerAnnotation(afs, doc)));
if (thePattern.size() > 0)
result.add(thePattern);
}
return result;
}
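
  /**
   * Returns, for every example, the tokens between the preceding slot filler and the given slot
   * filler; returns null for slot 0, whose left delimiter l1 is learned elsewhere.
   */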
protected List<TextRulerRulePattern> getLeftContextForSlot(TextRulerExampleDocument doc,
int slotIndex) {
if (slotIndex == 0)
return null;
List<TextRulerRulePattern> result = new ArrayList<TextRulerRulePattern>();
CAS cas = doc.getCAS();
TypeSystem ts = cas.getTypeSystem();
Type tokenType = ts.getType(TextRulerToolkit.RUTA_ALL_TYPE_NAME);
List<TextRulerExample> examples = doc.getPositiveExamples();
boolean isFirstSlot = slotIndex == 0;
for (int ei = 0; ei < examples.size(); ei++) {
boolean isFirstExample = ei == 0;
TextRulerExample e = examples.get(ei);
      // collect the tokens between slot slotIndex-1 and slot slotIndex
TextRulerAnnotation slotAnnotation = e.getAnnotations()[slotIndex];
TextRulerAnnotation prevSlotAnnotation;
if (!isFirstSlot)
prevSlotAnnotation = e.getAnnotations()[slotIndex - 1];
else {
if (!isFirstExample)
prevSlotAnnotation = examples.get(ei - 1).getAnnotations()[slotNames.length - 1];
else
prevSlotAnnotation = null;
}
List<AnnotationFS> theTokens;
if (prevSlotAnnotation == null)
theTokens = TextRulerToolkit.getAnnotationsBeforePosition(cas, slotAnnotation.getBegin(),
0, TextRulerToolkit.getFilterSetWithSlotNames(slotNames, filterSet), tokenType);
else
theTokens = TextRulerToolkit.getAnnotationsWithinBounds(cas, prevSlotAnnotation.getEnd(),
slotAnnotation.getBegin(), TextRulerToolkit.getFilterSetWithSlotNames(slotNames,
filterSet), tokenType);
TextRulerRulePattern thePattern = new TextRulerRulePattern();
for (AnnotationFS afs : theTokens)
thePattern.add(new WienRuleItem(new TextRulerAnnotation(afs, doc), true));
if (thePattern.size() > 0)
result.add(thePattern);
}
return result;
}
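
  /**
   * Returns the token pattern covered by the filler of the given slot, one pattern per example.
   */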
protected List<TextRulerRulePattern> getSlotFillerPatterns(TextRulerExampleDocument doc,
int slotIndex) {
List<TextRulerRulePattern> result = new ArrayList<TextRulerRulePattern>();
CAS cas = doc.getCAS();
TypeSystem ts = cas.getTypeSystem();
Type tokenType = ts.getType(TextRulerToolkit.RUTA_ALL_TYPE_NAME);
List<TextRulerExample> examples = doc.getPositiveExamples();
for (TextRulerExample e : examples) {
TextRulerAnnotation slotAnnotation = e.getAnnotations()[slotIndex];
List<AnnotationFS> theTokens = TextRulerToolkit.getAnnotationsWithinBounds(cas,
slotAnnotation.getBegin(), slotAnnotation.getEnd(), TextRulerToolkit
.getFilterSetWithSlotNames(slotNames, filterSet), tokenType);
TextRulerRulePattern thePattern = new TextRulerRulePattern();
for (AnnotationFS afs : theTokens)
thePattern.add(new WienRuleItem(new TextRulerAnnotation(afs, doc)));
if (thePattern.size() > 0)
result.add(thePattern);
}
return result;
}
protected constraint3ReturnType testConstraint3(TextRulerRulePattern h, TextRulerRulePattern t,
TextRulerRulePattern l1) {
for (TextRulerExampleDocument doc : exampleDocuments.getDocuments()) {
constraint3ReturnType r = testConstraint3(doc, h, t, l1);
if (r != constraint3ReturnType.C3_SUCCESS)
return r;
}
return constraint3ReturnType.C3_SUCCESS;
}
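
  /**
   * Constraint C1: the candidate r_k must be a prefix of every right context of slot k in the
   * document and must not occur anywhere inside a filler of slot k.
   */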
protected boolean testConstraint1(TextRulerExampleDocument doc, TextRulerRulePattern rk, int k) {
List<TextRulerRulePattern> rightContexts = getRightContextForSlot(doc, k);
for (TextRulerRulePattern rx : rightContexts) {
if (rx.find(rk) != 0)
return false;
}
List<TextRulerRulePattern> contents = getSlotFillerPatterns(doc, k);
for (TextRulerRulePattern c : contents) {
if (c.find(rk) >= 0)
return false;
}
return true;
}
protected boolean testConstraint1(TextRulerRulePattern rk, int k) {
for (TextRulerExampleDocument doc : exampleDocuments.getDocuments()) {
if (!testConstraint1(doc, rk, k))
return false;
}
return true;
}
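
  /**
   * Constraint C2: the candidate l_k must be a suffix of every left context of slot k in the
   * document.
   */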
protected boolean testConstraint2(TextRulerExampleDocument doc, TextRulerRulePattern lk, int k) {
List<TextRulerRulePattern> leftContexts = getLeftContextForSlot(doc, k);
for (TextRulerRulePattern lx : leftContexts) {
if (lx.size() < lk.size())
return false;
int pos = lx.find(lk);
if (pos < 0 || pos != lx.size() - lk.size())
return false;
}
return true;
}
protected boolean testConstraint2(TextRulerRulePattern lk, int k) {
for (TextRulerExampleDocument doc : exampleDocuments.getDocuments()) {
if (!testConstraint2(doc, lk, k))
return false;
}
return true;
}
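
  /**
   * Constraint C3 for one document: l1 must be a suffix of the head portion following h and of
   * every inter-tuple separator; t must not occur between h and l1, must not start with the
   * right delimiter of the last slot, must occur in the page tail without being preceded there
   * by l1, and must never precede l1 in an inter-tuple separator.
   */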
protected constraint3ReturnType testConstraint3(TextRulerExampleDocument doc,
TextRulerRulePattern h, TextRulerRulePattern t, TextRulerRulePattern l1) {
final boolean logReasons = false;
TextRulerRulePattern head = new TextRulerRulePattern();
TextRulerRulePattern tail = new TextRulerRulePattern();
getPageHeadAndTailPortion(doc, head, tail);
// 1: l1 must be a proper suffix of the portion between the end of h and
// the first slot filler:
// (head / h) / l1 = l1
int hPos = head.find(h);
    // TODO precalculate this outside this method?
TextRulerRulePattern restForL1 = head.subPattern(hPos + h.size(), -1).copy();
for (TextRulerRuleItem it : restForL1)
((WienRuleItem) it).getWordConstraint().setGeneralizeLinkMarkUp(true);
int l1Pos = restForL1.find(l1);
if (l1Pos < 0 || l1Pos != restForL1.size() - l1.size()) {
TextRulerToolkit.logIf(logReasons, "REASON 1\n\tl1 \t" + l1 + "\n\trestforl1\t"
+ restForL1);
return constraint3ReturnType.C3_L1CandidateSuffixError;
}
// 2: t must not occur in the subpattern after h and before l1
if (l1Pos > 0) {
TextRulerRulePattern patternBetweenHandL1 = restForL1.subPattern(0, l1Pos);
if (patternBetweenHandL1.size() >= t.size()) {
if (patternBetweenHandL1.find(t) >= 0) {
TextRulerToolkit.logIf(logReasons, "REASON 2");
return constraint3ReturnType.C3_TailCandidateH_L1Error;
}
}
}
    // 2a: additional constraint, not specified in the WIEN paper:
TextRulerRulePattern lastSlotRightPattern = patternPairs.get(slotNames.length - 1).r;
    // the right delimiter of the last slot must not be a prefix of the tail candidate!
    if (t.find(lastSlotRightPattern) == 0)
{
TextRulerToolkit.logIf(logReasons, "REASON 3: " + lastSlotRightPattern + "\tTail: " + t);
return constraint3ReturnType.C3_TailCandidateRK_PrefixError;
}
int tPos = tail.find(t);
if (tPos < 0) {
TextRulerToolkit.logIf(logReasons, "REASON 4");
return constraint3ReturnType.C3_TailCandidateNotFoundError;
    } // this is our own constraint definition: if a document does not have the tail in it,
      // what should we do then? Is this an error, or is it okay since the document may not
      // have any tail after the data?
// 3: l1 must not precede t in the page's tail:
int l1tPos = tail.find(l1);
if (l1tPos >= 0) // l1 occurs in the page's tail:
{
if (l1tPos < tPos) {
TextRulerToolkit.logIf(logReasons, "REASON 5");
return constraint3ReturnType.C3_TailCandidateSucceedsL1InTailError;
}
}
List<TextRulerRulePattern> interTupleSeparators = getInterTupleSepatators(doc);
for (TextRulerRulePattern itSep : interTupleSeparators) {
// 4: l1 must be a proper suffix of each of the inter-tuple
// separators:
TextRulerRulePattern itSepCopy = itSep.copy();
for (TextRulerRuleItem it : itSepCopy)
((WienRuleItem) it).getWordConstraint().setGeneralizeLinkMarkUp(true);
int l1itsPos = itSepCopy.find(l1);
if (l1itsPos < 0 || l1itsPos != itSepCopy.size() - l1.size()) {
TextRulerToolkit.logIf(logReasons, "REASON 6: \n\tl1\t" + l1 + "\n\titSep\t" + itSep);
return constraint3ReturnType.C3_L1CandidateInterTupleSeparatorSuffixError;
}
// 5: t must never precede l1 in any inter-tuple separator:
int itstPos = itSep.find(t);
if (itstPos >= 0 && itstPos < l1itsPos) {
TextRulerToolkit.logIf(logReasons, "REASON 7");
return constraint3ReturnType.C3_TailCandidatePrecedesL1InterTupleSeparatorError;
}
}
return constraint3ReturnType.C3_SUCCESS;
}
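
  /**
   * Renders the learned wrapper as a Ruta script: the head and tail patterns mark the content
   * area and the tail annotations, the multi-slot rule is applied inside a BLOCK that calls
   * itself again while relevant tail annotations are still being consumed, and the helper
   * annotations are removed at the end.
   */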
public String getResultString() {
if (theRule == null)
return "<no results yet>";
String result = getFileHeaderString(true) + "DECLARE wien_tail;\n" + "DECLARE wien_rulemark;\n"
+ "DECLARE wien_content;\n" + "BOOLEAN wien_redo;\n\n"
+ "// tail/head/content area stuff:\n";
TextRulerRulePattern hCopy = hPattern.copy();
((WienRuleItem) hCopy.get(0)).addCondition("-PARTOF(wien_content)");
result += hCopy + " ALL*?{->MARK(wien_content)};\n";
TextRulerRulePattern tCopy = tPattern.copy();
((WienRuleItem) tCopy.get(0)).addCondition("PARTOF(wien_content)");
result += tCopy + "{->MARK(wien_tail";
if (tPattern.size() > 1)
result += ", 1, " + tPattern.size();
result += ")};\n\n";
result += "BLOCK(findData) wien_content {\n"
+ "\t// find out if tail is before the next occurence of l1\n"
+ "\t"
+ theRule.getRuleString()
+ "\n"
+ "\tDocument{->ASSIGN(wien_redo, false)};\n"
+ "\twien_tail{PARTOF(wien_rulemark)->UNMARK(wien_tail), ASSIGN(wien_redo, true)}; // remove tail marks that are no longer relevant for us after the last rule !\n"
+ "\tDocument{IF(wien_redo)->CALL(filename.findData)};\n" + "}\n";
result += "\n// cleaning up:\n" + "wien_tail{->UNMARK(wien_tail)};\n"
+ "wien_rulemark{->UNMARK(wien_rulemark)};\n"
+ "wien_content{->UNMARK(wien_content)};\n";
return result;
}
public void setParameters(Map<String, Object> params) {
}
}