blob: 0d70554a441353e28f850b0797e5930438d6ceb9 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.uima.ruta.textruler.learner.rapier;
import java.util.ArrayList;
import java.util.Iterator;
import org.apache.uima.ruta.textruler.core.TextRulerRuleItem;
import org.apache.uima.ruta.textruler.core.TextRulerRulePattern;
import org.apache.uima.ruta.textruler.core.TextRulerToolkit;
import org.apache.uima.ruta.textruler.core.TextRulerWordConstraint;
public class RapierGeneralizationHelper {
// ----------------------------------------------------------------------------------------------------------------------------------------
// --- ITEM(s) GENERALIZATION
// -------------------------------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------------------------------------------------------
private static ArrayList<TextRulerRuleItem> getGeneralizationsForRuleItems(
TextRulerRuleItem item1, TextRulerRuleItem item2) {
ArrayList<TextRulerRuleItem> item1List = new ArrayList<TextRulerRuleItem>();
ArrayList<TextRulerRuleItem> item2List = new ArrayList<TextRulerRuleItem>();
if (item1 != null)
item1List.add(item1);
if (item2 != null)
item2List.add(item2);
return getGeneralizationsForRuleItemLists(item1List, item2List);
}
// generalize two given pattern elements or pattern element lists and return
// all possible generalizations as
// TextRulerRuleItem objects
private static ArrayList<TextRulerRuleItem> getGeneralizationsForRuleItemLists(
ArrayList<TextRulerRuleItem> item1List, ArrayList<TextRulerRuleItem> item2List) {
ArrayList<RapierRuleItem> proposedWordConstraints = new ArrayList<RapierRuleItem>();
ArrayList<RapierRuleItem> proposedTagConstraints = new ArrayList<RapierRuleItem>();
ArrayList<RapierRuleItem> proposedClassConstraints = new ArrayList<RapierRuleItem>();
ArrayList<TextRulerRuleItem> result = new ArrayList<TextRulerRuleItem>();
int resultListLen1 = 0;
int resultListLen2 = 0;
boolean oneListIsEmpty = false;
if (item1List.size() == 0 && item2List.size() == 0) {
TextRulerToolkit.log("ERROR !");
}
if (item1List.size() == 0 || item2List.size() == 0) {
// TextRulerToolkit.log("SPECIAL CASE WITH ONE LIST OF ZERO SIZE");
oneListIsEmpty = true;
}
boolean hasEmptyWordList = false;
int maxWordCount = 0;
boolean hasEmptyTagList = false;
int maxTagCount = 0;
for (TextRulerRuleItem rt : item2List) {
RapierRuleItem t = (RapierRuleItem) rt;
resultListLen2 += t.isListItem() ? t.listLen() : 1;
if (t.getWordConstraints().size() > maxWordCount)
maxWordCount = t.getWordConstraints().size();
if (t.getWordConstraints().size() == 0)
hasEmptyWordList = true;
if (t.getTagConstraints().size() > maxTagCount)
maxTagCount = t.getTagConstraints().size();
if (t.getTagConstraints().size() == 0)
hasEmptyTagList = true;
}
for (TextRulerRuleItem rt : item1List) {
RapierRuleItem t = (RapierRuleItem) rt;
resultListLen1 += t.isListItem() ? t.listLen() : 1;
if (t.getWordConstraints().size() > maxWordCount)
maxWordCount = t.getWordConstraints().size();
if (t.getWordConstraints().size() == 0)
hasEmptyWordList = true;
if (t.getTagConstraints().size() > maxTagCount)
maxTagCount = t.getTagConstraints().size();
if (t.getTagConstraints().size() == 0)
hasEmptyTagList = true;
}
int resultListLen = resultListLen1 > resultListLen2 ? resultListLen1 : resultListLen2; // take
// the
// bigger
// of
// both
if (resultListLen == 1 && !oneListIsEmpty)
resultListLen = 0; // lists with a length of 1 can only occur when
// one itemList is empty! THAT CANNOT HAPPEN
// HERE!
// generalize word constraints:
if (hasEmptyWordList) // at least one constraint of both is empty
{
// do nothing here, proposed.wordItems stays empty
proposedWordConstraints.add(new RapierRuleItem());
} else // create union of both constraints AND (if both constraints
// weren't the same) drop constraint
{
RapierRuleItem proposed = new RapierRuleItem();
for (TextRulerRuleItem t : item1List)
proposed.addWordConstraints(((RapierRuleItem) t).getWordConstraints());
for (TextRulerRuleItem t : item2List)
proposed.addWordConstraints(((RapierRuleItem) t).getWordConstraints());
proposedWordConstraints.add(proposed);
// if the union of both constraints is a real union (one does not
// subsume the other completely),
// we have to add the DROPPING OF THE CONSTRAINT as a second
// proposed word constraint
if (maxWordCount != proposed.getWordConstraints().size()) // the
// union
// is a
// real
// bigger
// set
// than
{
proposedWordConstraints.add(new RapierRuleItem());
}
}
if (hasEmptyTagList) // at least one constraint of both is empty
{
// do nothing here, proposed.tagItems stays empty
proposedTagConstraints.add(new RapierRuleItem());
} else // create union of both constraints AND (if both constraints
// weren't the same) drop constraint
{
RapierRuleItem proposed = new RapierRuleItem();
for (TextRulerRuleItem t : item1List)
proposed.addTagConstraints(((RapierRuleItem) t).getTagConstraints());
for (TextRulerRuleItem t : item2List)
proposed.addTagConstraints(((RapierRuleItem) t).getTagConstraints());
proposedTagConstraints.add(proposed);
// if the union of both constraints is a real union (one does not
// subsume the other completely),
// we have to add the DROPPING OF THE CONSTRAINT as a second
// proposed tag constraint
if (maxTagCount != proposed.getTagConstraints().size()) // the union
// is a real
// bigger
// set than
{
proposedTagConstraints.add(new RapierRuleItem());
}
}
// TODO semantic class generalization
proposedClassConstraints.add(new RapierRuleItem()); // add only NO
// class
// constraint
// version for
// now!
// finally, create all combinations of the above proposed items
for (RapierRuleItem wt : proposedWordConstraints) {
for (RapierRuleItem tt : proposedTagConstraints) {
for (RapierRuleItem ct : proposedClassConstraints) {
RapierRuleItem newItem = new RapierRuleItem();
for (TextRulerWordConstraint wi : wt.getWordConstraints())
newItem.addWordConstraint(wi.copy());
for (String ti : tt.getTagConstraints())
newItem.addTagConstraint(ti);
for (String tc : ct.getClassConstraints())
newItem.addClassConstraint(tc);
newItem.setListLen(resultListLen);
newItem.setListBeginsAtZero(oneListIsEmpty && resultListLen > 0);
result.add(newItem);
}
}
}
return result;
}
// ----------------------------------------------------------------------------------------------------------------------------------------
// --- EQUAL SIZE PATTERN GENERALIZATION
// --------------------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------------------------------------------------------
private static ArrayList<TextRulerRulePattern> getGeneralizationsForRuleItemPatternsOfEqualSize(
TextRulerRulePattern pattern1, TextRulerRulePattern pattern2) {
ArrayList<TextRulerRulePattern> resultList = new ArrayList<TextRulerRulePattern>();
ArrayList<ArrayList<TextRulerRuleItem>> generalizationTable = new ArrayList<ArrayList<TextRulerRuleItem>>();
Iterator<TextRulerRuleItem> it2 = pattern2.iterator();
if (pattern1.size() != pattern2.size()) {
TextRulerToolkit.log("ERROR!");
}
for (TextRulerRuleItem item1 : pattern1) {
TextRulerRuleItem item2 = it2.next();
// get all possible LGGs of the current two elements and save them
// into the matrix
ArrayList<TextRulerRuleItem> allLGGs = getGeneralizationsForRuleItems(item1, item2);
generalizationTable.add(allLGGs);
// TextRulerToolkit.log("--- GET GENERALISATIONS FOR TWO TERMS: --"+t1+"-- --"+t2+"--");
// ArrayList<MLRapierRuleTerm> allLGGs =
// this.getGeneralizationsForRuleTerms(t1, t2);
// for (MLRapierRuleTerm term : allLGGs)
// TextRulerToolkit.log("--- "+term);
// TextRulerToolkit.log("--- END");
}
// now we have patternSize lists of possible generalizations, one list
// per original pattern item pair of
// pattern1 and pattern2. we now have to build all possible
// combinations. Each combination is a
// new pattern
recursiveBuildAllRuleItemCombinations(generalizationTable, 0, new TextRulerRulePattern(),
resultList);
return resultList;
}
private static void recursiveBuildAllRuleItemCombinations(
ArrayList<ArrayList<TextRulerRuleItem>> table, int curIndex,
TextRulerRulePattern currentPattern, ArrayList<TextRulerRulePattern> resultPatterns) {
if (curIndex >= table.size()) {
// make a deep copy of the current pattern:
TextRulerRulePattern copy = new TextRulerRulePattern();
for (TextRulerRuleItem item : currentPattern)
copy.add(item.copy());
resultPatterns.add(copy);
} else {
for (TextRulerRuleItem item : table.get(curIndex)) {
currentPattern.add(item);
recursiveBuildAllRuleItemCombinations(table, curIndex + 1, currentPattern, resultPatterns);
currentPattern.remove(currentPattern.size() - 1);
}
}
}
// ----------------------------------------------------------------------------------------------------------------------------------------
// --- DISTINCT SIZE PATTERN GENERALIZATION
// -----------------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------------------------------------------------------
// this is the low level version for patterns of distinct sizes! it creates
// ALL possible combinations how to pair/map pattern items
// of the shorter with those of the longer pattern. the optimizing version
// (getOptimizedGeneralizationsForRuleItemPatternsOfDistinctSize)
// uses this method for its pattern segments! (see below)
private static ArrayList<TextRulerRulePattern> getGeneralizationsForRuleItemPatternsOfDistinctSize(
TextRulerRulePattern pattern1, TextRulerRulePattern pattern2) {
ArrayList<TextRulerRulePattern> resultList = new ArrayList<TextRulerRulePattern>();
if (pattern1.size() == pattern2.size()) {
TextRulerToolkit.log("ERROR! CALL getGeneralizationsForRuleItemPatternsOfEqualSize instead!");
if (TextRulerToolkit.DEBUG)
return null;
}
TextRulerRulePattern longerPattern = pattern1;
TextRulerRulePattern shorterPattern = pattern2;
if (pattern2.size() > pattern1.size()) {
longerPattern = pattern2;
shorterPattern = pattern1;
}
if (longerPattern.size() <= 1 || shorterPattern.size() <= 1) {
// Special case 1: one of the pattern terms list is empty AND
// special case 2: one has only ONE element
if (longerPattern.size() + shorterPattern.size() == 0) {
TextRulerToolkit.log("ERROR !! BOTH PATTERNS ARE EMPTY!");
if (TextRulerToolkit.DEBUG)
return null;
}
// get all possible generalizations of the two patterns. result of
// each generalization is ONE rule item, so we
// don't use TextRulerRulePattern here since this IS NOT a rule
// pattern! it's a list of possible generalizations:
ArrayList<TextRulerRuleItem> generalizations = getGeneralizationsForRuleItemLists(
longerPattern, shorterPattern);
// create a one element result pattern for each:
for (TextRulerRuleItem item : generalizations) {
TextRulerRulePattern p = new TextRulerRulePattern();
p.add(item);
resultList.add(p);
}
}
// else SPECIAL CASE 3 // TODO make those values configurable ?
else if (((longerPattern.size() - shorterPattern.size()) > 6) || (longerPattern.size() > 10)) {
int resultListLen1 = 0;
for (TextRulerRuleItem rt : shorterPattern)
resultListLen1 += ((RapierRuleItem) rt).isListItem() ? ((RapierRuleItem) rt).listLen() : 1;
int resultListLen2 = 0;
for (TextRulerRuleItem rt : longerPattern)
resultListLen2 += ((RapierRuleItem) rt).isListItem() ? ((RapierRuleItem) rt).listLen() : 1;
RapierRuleItem singleItem = new RapierRuleItem();
singleItem.setListLen(resultListLen1 > resultListLen2 ? resultListLen1 : resultListLen2);
TextRulerRulePattern singleItemPattern = new TextRulerRulePattern();
singleItemPattern.add(singleItem);
resultList.add(singleItemPattern);
} else { // sizes are different and both > 1
// create all possible generalization combinations, that is: how can
// we map elements of the shorter pattern
// to the ones of the longer pattern and then generalize each
// mapping (each group) of items?
ArrayList<ArrayList<RapierPatternItemMapping>> combinationList = new ArrayList<ArrayList<RapierPatternItemMapping>>();
recursiveBuildAllPossiblePatternMappingSequences(longerPattern, shorterPattern,
new ArrayList<RapierPatternItemMapping>(), combinationList);
for (ArrayList<RapierPatternItemMapping> mappingSequence : combinationList) {
resultList.addAll(getGeneralizationsForPatternMappingSequence(mappingSequence));
}
}
return resultList;
}
// creates all possible combinations how to pair together items from the
// longer and the shorter source pattern, e.g.
// 1 2 3 4 5 vs. 1 2 3 = 1/1+2+3 2/4 3/5, ...
private static void recursiveBuildAllPossiblePatternMappingSequences(
TextRulerRulePattern longerPattern, TextRulerRulePattern shorterPattern,
ArrayList<RapierPatternItemMapping> currentMappingSequence,
ArrayList<ArrayList<RapierPatternItemMapping>> resultList) {
int windowSize = longerPattern.size() - shorterPattern.size() + 1;
if (shorterPattern.size() > longerPattern.size()) {
TextRulerToolkit.log("ERROR: SHORTER > LONGER !!");
}
if (longerPattern.size() == 0 || shorterPattern.size() == 0) {
TextRulerToolkit.log("ERROR: SHORTER == LONGER == 0!");
} else {
// if the remaining (sub-)patterns are of equal size or one has only
// one element left, create one last item mapping and
// a final result mapping sequence:
if (shorterPattern.size() == 1 || (longerPattern.size() == shorterPattern.size())) {
RapierPatternItemMapping lastMapping = new RapierPatternItemMapping();
lastMapping.shorterPattern.addAll(shorterPattern);
lastMapping.longerPattern.addAll(longerPattern);
ArrayList<RapierPatternItemMapping> newMappingSequence = new ArrayList<RapierPatternItemMapping>();
newMappingSequence.addAll(currentMappingSequence);
newMappingSequence.add(lastMapping);
resultList.add(newMappingSequence);
} else { // otherwise we have to create all possible combinations of
// the longer and shorter remaining pattern:
TextRulerRuleItem firstItem = shorterPattern.get(0);
// combine with 0, 0/1, ... 0/1/2/.../windowSize-1
for (int maxi = 0; maxi < windowSize; maxi++) {
RapierPatternItemMapping newMapping = new RapierPatternItemMapping();
newMapping.shorterPattern.add(firstItem);
for (int li = 0; li <= maxi; li++)
newMapping.longerPattern.add(longerPattern.get(li));
currentMappingSequence.add(newMapping);
TextRulerRulePattern restLongerPattern = new TextRulerRulePattern();
TextRulerRulePattern restShorterPattern = new TextRulerRulePattern();
for (int i = 1; i < shorterPattern.size(); i++)
restShorterPattern.add(shorterPattern.get(i));
for (int i = maxi + 1; i < longerPattern.size(); i++)
restLongerPattern.add(longerPattern.get(i));
// recurse:
recursiveBuildAllPossiblePatternMappingSequences(restLongerPattern, restShorterPattern,
currentMappingSequence, resultList);
// remove last segment to get back to the same state as
// before the recursion:
currentMappingSequence.remove(currentMappingSequence.size() - 1);
}
}
}
}
// here the input is called a MAPPING instead of a pattern segmentation in
// order to distinguish between the two levels of
// dividing the problem: a pattern segmentation is a special mapping of
// equal items in the two to generalize source patterns;
// the segments that result through that segmentation still need to be
// generalized (see getGeneralizationsForPatternSegmentation)
// if sucha semgent has subpattersn of different size,
// getGeneralizationsForRuleItemPatternsOfDistinctSize is used to
// generalize it, which uses THIS METHOD HERE to get all generalizations for
// a special MAPPING. a mapping (in comparison to the
// segmentation!) is a mapping between the longer and shorter pattern items
// which then get directly generalized here!
// in order to show this difference, we use the (inernally exactly the
// same!) class RapierPatternItemMapping instead of
// RapierPatternSegment!)
private static ArrayList<TextRulerRulePattern> getGeneralizationsForPatternMappingSequence(
ArrayList<RapierPatternItemMapping> patternMappingSequence) {
ArrayList<TextRulerRulePattern> resultList = new ArrayList<TextRulerRulePattern>();
ArrayList<ArrayList<TextRulerRuleItem>> generalizationTable = new ArrayList<ArrayList<TextRulerRuleItem>>();
// every mapping has several possible generalizations, so we store all
// of them in that generalizationTable, one list of
// generalizations for each mapping:
for (RapierPatternItemMapping mapping : patternMappingSequence) {
ArrayList<TextRulerRuleItem> lggList = getGeneralizationsForRuleItemLists(
mapping.shorterPattern, mapping.longerPattern);
generalizationTable.add(lggList);
}
// afterwards we have again to create all possible combinations of those
// lists (like in the equalSizeGeneralization):
// Each combination is a new pattern
recursiveBuildAllRuleItemCombinations(generalizationTable, 0, new TextRulerRulePattern(),
resultList);
return resultList;
}
// ----------------------------------------------------------------------------------------------------------------------------------------
// --- FIND MATCHINGS BETWEEN PATTERNS FOR GENERALIZATION
// -----------------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------------------------------------------------------
// find matches in two patterns by comparing the items of the patterns and
// return all possible segmentations of those two patterns.
private static void recursiveFindPatternSegmentsByMatchingPatternItems(
TextRulerRulePattern longerPattern, TextRulerRulePattern shorterPattern,
ArrayList<RapierPatternSegment> currentSegmentation,
ArrayList<ArrayList<RapierPatternSegment>> resultList) {
int cmpWindowSize = longerPattern.size() - shorterPattern.size() + 1;
// is one
// (rest-)pattern
// empty ?
if (longerPattern.size() == 0 || shorterPattern.size() == 0) {
// create result segmentation
ArrayList<RapierPatternSegment> newSegmentation = new ArrayList<RapierPatternSegment>();
newSegmentation.addAll(currentSegmentation); // add current
// add rest if
// recursive state
// anything is
// left one of
// the
// patterns
if (longerPattern.size() + shorterPattern.size() > 0) {
RapierPatternSegment lastSegment = new RapierPatternSegment();
for (TextRulerRuleItem i : shorterPattern)
lastSegment.shorterPattern.add(i);
for (TextRulerRuleItem i : longerPattern)
lastSegment.longerPattern.add(i);
newSegmentation.add(lastSegment);
}
resultList.add(newSegmentation);
} else {
boolean matched = false;
for (int si = 0; si < shorterPattern.size(); si++) {
// compare element si with si, si+1, ... si+cmpWindowSize-1
for (int li = si; li < si + cmpWindowSize; li++) {
if (longerPattern.get(li).equals(shorterPattern.get(si))) {
// matched pair found!
matched = true;
// --> calculate pattern segments, add them to the
// current one and pass the rest of the
// patterns to the next recursion level:
RapierPatternSegment newSegment = new RapierPatternSegment();
for (int i = 0; i < si; i++)
newSegment.shorterPattern.add(shorterPattern.get(i));
for (int i = 0; i < li; i++)
newSegment.longerPattern.add(longerPattern.get(i));
boolean addedLeftSegmentation = false;
if (newSegment.longerPattern.size() > 0 || newSegment.shorterPattern.size() > 0) {
// only add if the segmentation is not empty!
currentSegmentation.add(newSegment);
addedLeftSegmentation = true;
}
RapierPatternSegment matchedSegment = new RapierPatternSegment();
matchedSegment.shorterPattern.add(shorterPattern.get(si));
matchedSegment.longerPattern.add(longerPattern.get(li));
currentSegmentation.add(matchedSegment);
// the rest is now the rest to the right of both (so
// li+1 and si+1 to the ends...)
TextRulerRulePattern restLongerPattern = new TextRulerRulePattern();
TextRulerRulePattern restShorterPattern = new TextRulerRulePattern();
for (int i = li + 1; i < longerPattern.size(); i++)
restLongerPattern.add(longerPattern.get(i));
for (int i = si + 1; i < shorterPattern.size(); i++)
restShorterPattern.add(shorterPattern.get(i));
// recurse...
if (restLongerPattern.size() > restShorterPattern.size())
recursiveFindPatternSegmentsByMatchingPatternItems(restLongerPattern,
restShorterPattern, currentSegmentation, resultList);
else
recursiveFindPatternSegmentsByMatchingPatternItems(restShorterPattern,
restLongerPattern, currentSegmentation, resultList);
// remove added segments so that we are in the same
// state as before the recursion:
if (addedLeftSegmentation)
currentSegmentation.remove(currentSegmentation.size() - 1); // remove the left side
// segment
currentSegmentation.remove(currentSegmentation.size() - 1); // remove
// the
// matched
// segment
}
}
}
if (!matched) // add remaining items of both lists in one pattern
// segment
{
ArrayList<RapierPatternSegment> newSegmentation = new ArrayList<RapierPatternSegment>();
newSegmentation.addAll(currentSegmentation);
RapierPatternSegment lastSegment = new RapierPatternSegment();
for (TextRulerRuleItem i : shorterPattern)
lastSegment.shorterPattern.add(i);
for (TextRulerRuleItem i : longerPattern)
lastSegment.longerPattern.add(i);
newSegmentation.add(lastSegment);
resultList.add(newSegmentation);
}
}
}
private static ArrayList<TextRulerRulePattern> getGeneralizationsForPatternSegmentation(
ArrayList<RapierPatternSegment> patternSegmentation) {
// for creating those, we need a table:
// each segment of the patternSegmentation creates a bunch of possible
// new generalized sub patterns (that's the inner
// ArrayList<TextRulerRulePattern>)
// since we have a whole sequence of pattern semgents (a whole
// segmentation), we need the outer ArrayList to collect
// all generalizations of all pattern segments:
ArrayList<ArrayList<TextRulerRulePattern>> generalizationTable = new ArrayList<ArrayList<TextRulerRulePattern>>();
// now, we create all generalizations of each pattern segment and
// collect them in that table:
for (RapierPatternSegment pSeg : patternSegmentation) {
ArrayList<TextRulerRulePattern> pSegGeneralizations;
if (pSeg.longerPattern.size() == pSeg.shorterPattern.size())
pSegGeneralizations = getGeneralizationsForRuleItemPatternsOfEqualSize(pSeg.longerPattern,
pSeg.shorterPattern);
else
pSegGeneralizations = getGeneralizationsForRuleItemPatternsOfDistinctSize(
pSeg.longerPattern, pSeg.shorterPattern);
generalizationTable.add(pSegGeneralizations);
}
// finally, we have to build all combinations of them in form of
// MLRulePatterns:
ArrayList<TextRulerRulePattern> resultList = new ArrayList<TextRulerRulePattern>(); // the
// result
// is
// a
// list
// of
// new
// generalized
// patterns
recursiveBuildAllRuleItemCombinationsFromPatterns(generalizationTable, 0,
new TextRulerRulePattern(), resultList);
return resultList;
}
private static void recursiveBuildAllRuleItemCombinationsFromPatterns(
ArrayList<ArrayList<TextRulerRulePattern>> table, int curIndex,
TextRulerRulePattern currentPattern, ArrayList<TextRulerRulePattern> resultPatterns) {
if (curIndex >= table.size()) {
// make a deep copy of the current pattern:
TextRulerRulePattern copy = new TextRulerRulePattern();
for (TextRulerRuleItem item : currentPattern)
copy.add(item.copy());
resultPatterns.add(copy);
} else {
for (TextRulerRulePattern pattern : table.get(curIndex)) {
currentPattern.addAll(pattern);
recursiveBuildAllRuleItemCombinationsFromPatterns(table, curIndex + 1, currentPattern,
resultPatterns);
for (int i = 0; i < pattern.size(); i++)
currentPattern.remove(currentPattern.size() - 1);
}
}
}
// ----------------------------------------------------------------------------------------------------------------------------------------
// --- WORKING ON PATTERNS OF DISTINCT LENGTH - OPTIMIZED
// ---------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------------------------------------------------------
// "optimized", because this method uses the pattern item matching
// optimization (search for equal items and make a segmentation, etc.)
private static ArrayList<TextRulerRulePattern> getOptimizedGeneralizationsForRuleItemPatternsOfDistinctSize(
TextRulerRulePattern pattern1, TextRulerRulePattern pattern2) {
ArrayList<ArrayList<RapierPatternSegment>> matchedCombinationList = new ArrayList<ArrayList<RapierPatternSegment>>();
// in order to reduce the amount of possible combinations how to combine
// the elements of the shorter pattern with the
// elements of the longer pattern, we first search for equal pattern
// items in both patterns. those equal patterns get hardwired
// in the combination and the left and right remaining subpatterns stay
// as a "divided smaller problem" that needs to be conquered...
// the result of the search is a list of possible segmentations of the
// patterns. all semgementations are those "smaller"
// left problems that we then need to generalize in the original manner.
// if no equal items are found, one segmentation with only one segment
// (the orignal longer and shoter pattern) is returned and
// has to be generalized.
if (pattern1.size() > pattern2.size())
recursiveFindPatternSegmentsByMatchingPatternItems(pattern1, pattern2,
new ArrayList<RapierPatternSegment>(), matchedCombinationList);
else
recursiveFindPatternSegmentsByMatchingPatternItems(pattern2, pattern1,
new ArrayList<RapierPatternSegment>(), matchedCombinationList);
// if (TextRulerToolkit.DEBUG && matchedCombinationList.size() > 1)
// {
// TextRulerToolkit.log("PATTERN SEQUENCES FOUND: "+matchedCombinationList.size());
// for (ArrayList<RapierPatternSegment> patternSequence :
// matchedCombinationList)
// {
// TextRulerToolkit.log("\tNEXT SEQUENCE");
// for (RapierPatternSegment pSeg : patternSequence)
// {
// TextRulerToolkit.log("\t\t"+pSeg.longerPattern);
// TextRulerToolkit.log("\t\t"+pSeg.shorterPattern);
// }
// }
// }
ArrayList<TextRulerRulePattern> resultList = new ArrayList<TextRulerRulePattern>();
for (ArrayList<RapierPatternSegment> patternSegmentation : matchedCombinationList) {
// TODO filter out possible duplicates ?
resultList.addAll(getGeneralizationsForPatternSegmentation(patternSegmentation));
}
return resultList;
}
// ----------------------------------------------------------------------------------------------------------------------------------------
// --- THE FINAL RESULT: a genarlization method
// -------------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------------------------------------------------------
// input: two sequences of rule items (=patterns) that shall be
// generalized... matchings are searched for a optimized search
// and to get a not too big count of generalizations...
// result: a (probably very large!) list of possible generalizations, e.g.
// used for all slotfiller generalizations of two rules...
public static ArrayList<TextRulerRulePattern> getGeneralizationsForRuleItemPatterns(
TextRulerRulePattern pattern1, TextRulerRulePattern pattern2) {
ArrayList<TextRulerRulePattern> result = new ArrayList<TextRulerRulePattern>();
// if (TextRulerToolkit.DEBUG)
// {
// TextRulerToolkit.log("\tgetGeneralizationsForRuleItemPatterns:");
// TextRulerToolkit.log("\tPattern1:"+pattern1);
// TextRulerToolkit.log("\tPattern2:"+pattern2);
// }
if (pattern1.size() == 0 && pattern2.size() == 0) {
return result; // return empty list
} else if (pattern1.size() == pattern2.size()) // both have the same
// pattern item count
{
// generalizing is easy then: simply generalize each pair of items:
result = getGeneralizationsForRuleItemPatternsOfEqualSize(pattern1, pattern2);
} else {
// TextRulerToolkit.logIf(TextRulerToolkit.DEBUG && pattern1.size()
// == 0 || pattern2.size() == 0, "SpecialCaseWithZeroLength");
result = getOptimizedGeneralizationsForRuleItemPatternsOfDistinctSize(pattern1, pattern2);
}
// if (TextRulerToolkit.DEBUG)
// {
// TextRulerToolkit.log("\t\tGeneralizations: "+result.size());
// for (TextRulerRulePattern lggPattern : result)
// TextRulerToolkit.log("\t\t\t"+lggPattern);
// }
return result;
}
}