| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| package org.apache.uima.ruta.textruler.core; |
| |
| import java.util.ArrayList; |
| import java.util.Collections; |
| import java.util.Comparator; |
| import java.util.List; |
| import java.util.Set; |
| |
| import org.apache.uima.analysis_engine.AnalysisEngine; |
| import org.apache.uima.analysis_engine.AnalysisEngineProcessException; |
| import org.apache.uima.cas.CAS; |
| import org.apache.uima.cas.FSIterator; |
| import org.apache.uima.cas.FeatureStructure; |
| import org.apache.uima.cas.Type; |
| import org.apache.uima.cas.TypeSystem; |
| import org.apache.uima.cas.text.AnnotationFS; |
| import org.apache.uima.ruta.textruler.core.TextRulerTarget.MLTargetType; |
| import org.apache.uima.util.CasCopier; |
| |
| /** |
| * |
| * TextRulerExampleDocument stands for one document usually loaded from an XMI file. It uses the |
| * given CasCache for storing its CAS with the XMI filename as the key. |
| * |
| * It holds ArrayLists for positive and negative MLExamples which can be filled on demand for a |
| * given learning target. E.g. single slot algorithms learn rules for each slot separately, so the |
| * work-flow is to clear the current examples and create new for the next slot target. The same is |
| * with single slot boundary algorithms like LP2: It first creates all left boundary examples, |
| * learns from them, clears the examples and creates the right boundary examples and so on. |
| * |
| * This class also provides the functionality extract and created MLExmaples of a given document or |
| * test CAS for a given TextRulerTarget. |
| * |
| * Especially for boundary algorithms you can call createBoundaryAnnotationsForCas to get boundary |
| * annotations at the beginnings and endings of an example slot. |
| * |
| * Caution (this is quite a bit inconvenient at the moment!): If a CAS gets loaded from the |
| * casCache, you have to call createBoundaryAnnotationsForCas again, so your casLoader must be aware |
| * of that (see BasicLP2 for an example) ! |
| * |
| * hint: this could be renamed to MLDocument instead of TextRulerExampleDocument ? |
| */ |
| public class TextRulerExampleDocument { |
| |
| protected String casFileName; |
| |
| protected CasCache casCache; |
| |
| protected List<TextRulerExample> positiveExamples = new ArrayList<TextRulerExample>(); |
| |
| protected List<TextRulerExample> negativeExamples = new ArrayList<TextRulerExample>(); |
| |
| public TextRulerExampleDocument(String casFileName, CasCache casCache) { |
| this.casCache = casCache; |
| this.casFileName = casFileName; |
| } |
| |
| public CAS getCAS() { |
| // ask CACHE |
| return casCache.getCAS(casFileName); |
| } |
| |
| public List<TextRulerExample> getPositiveExamples() { |
| return positiveExamples; |
| } |
| |
| public List<TextRulerExample> getNegativeExamples() { |
| return negativeExamples; |
| } |
| |
| protected void createPositiveExamplesForTarget(TextRulerTarget target) { |
| positiveExamples = createSlotInstancesForCAS(getCAS(), target, true); |
| } |
| |
| public List<TextRulerExample> createSlotInstancesForCAS(CAS aCas, TextRulerTarget target, |
| boolean createFromRawTypeName) { |
| List<TextRulerExample> result = new ArrayList<TextRulerExample>(); |
| |
| if (target.isMultiSlot()) { |
| TypeSystem ts = aCas.getTypeSystem(); |
| int currentSlotIndex = 0; |
| TextRulerAnnotation[] currentAnnotations = new TextRulerAnnotation[target.slotNames.length]; |
| List<Type> slotTypes = new ArrayList<Type>(); |
| for (String s : target.slotNames) |
| slotTypes.add(ts.getType(s)); |
| |
| for (FSIterator<AnnotationFS> it = aCas.getAnnotationIndex().iterator(true); it.isValid(); it |
| .moveToNext()) { |
| AnnotationFS fs = (AnnotationFS) it.get(); |
| Type theType = fs.getType(); |
| if (slotTypes.contains(theType)) { |
| int idx = slotTypes.indexOf(theType); |
| if (idx < currentSlotIndex) // the previous example was not |
| // complete, so we have to write |
| // it down: |
| { |
| result.add(new TextRulerExample(this, currentAnnotations, true, target)); |
| currentAnnotations = new TextRulerAnnotation[target.slotNames.length]; |
| } |
| currentAnnotations[idx] = new TextRulerAnnotation(fs, this); |
| if (idx >= target.slotNames.length - 1) { |
| result.add(new TextRulerExample(this, currentAnnotations, true, target)); |
| currentAnnotations = new TextRulerAnnotation[target.slotNames.length]; |
| currentSlotIndex = 0; |
| } else |
| currentSlotIndex = idx + 1; |
| } |
| } |
| if (currentSlotIndex > 0) { |
| result.add(new TextRulerExample(this, currentAnnotations, true, target)); |
| } |
| |
| } else if (target.isLeftCorrection() || target.isRightCorrection()) { |
| // TODO |
| TextRulerBasicLearner learner = target.getLearner(); |
| Set<String> filterSet = learner.getFilterSet(); |
| CAS testCAS = learner.getTestCAS(); |
| TextRulerStatisticsCollector c = new TextRulerStatisticsCollector(); |
| resetAndFillTestCAS(testCAS, target); |
| CAS docCAS = getCAS(); |
| TypeSystem ts = docCAS.getTypeSystem(); |
| Type tokensRootType = ts.getType(TextRulerToolkit.RUTA_ANY_TYPE_NAME); |
| AnalysisEngine analysisEngine = learner.getAnalysisEngine(); |
| try { |
| analysisEngine.process(testCAS); |
| } catch (AnalysisEngineProcessException e) { |
| // TODO add log here |
| } |
| TextRulerTarget newTarget = new TextRulerTarget(target.slotNames, target.getLearner()); |
| if (target.isLeftCorrection()) { |
| newTarget.type = TextRulerTarget.MLTargetType.SINGLE_LEFT_BOUNDARY; |
| } else { |
| newTarget.type = TextRulerTarget.MLTargetType.SINGLE_RIGHT_BOUNDARY; |
| } |
| createExamplesForTarget(newTarget); |
| learner.compareOriginalDocumentWithTestCAS(this, testCAS, newTarget, c, true); |
| List<TextRulerExample> correctTags = getPositiveExamples(); |
| List<TextRulerExample> wrongTags = new ArrayList<TextRulerExample>( |
| c.getCoveredNegativeExamples()); |
| for (TextRulerExample wrongTag : wrongTags) { |
| // test, if there's a corresponding positive example |
| // somewhere around (within maxDistance) |
| List<AnnotationFS> left = TextRulerToolkit.getAnnotationsBeforePosition(docCAS, wrongTag |
| .getAnnotation().getBegin(), target.getMaxShiftDistance(), TextRulerToolkit |
| .getFilterSetWithSlotNames(target.slotNames, filterSet), tokensRootType); |
| List<AnnotationFS> right = TextRulerToolkit.getAnnotationsAfterPosition(docCAS, wrongTag |
| .getAnnotation().getEnd(), target.getMaxShiftDistance() + 1, TextRulerToolkit |
| .getFilterSetWithSlotNames(target.slotNames, filterSet), tokensRootType); |
| |
| right.remove(0); |
| |
| // TODO stop after the first found match or create one bad |
| // example for each found occurence ??!! |
| // for now: stop after one ! so create only ONE bad |
| // example... |
| int leftDistance = 0; |
| TextRulerExample leftCorrectTag = null; |
| for (int i = left.size() - 1; i >= 0; i--) { |
| leftDistance++; |
| TextRulerAnnotation needle = TextRulerToolkit.convertToTargetAnnotation(left.get(i), |
| this, target, docCAS.getTypeSystem()); |
| // Only checks the beginning of needle |
| leftCorrectTag = TextRulerExampleDocument.exampleListContainsAnnotation(correctTags, |
| needle); |
| if (leftCorrectTag != null) |
| break; |
| } |
| |
| int rightDistance = 0; |
| TextRulerExample rightCorrectTag = null; |
| for (AnnotationFS fs : right) { |
| rightDistance++; |
| TextRulerAnnotation needle = TextRulerToolkit.convertToTargetAnnotation(fs, this, target, |
| docCAS.getTypeSystem()); |
| // Only checks the beginning of needle |
| rightCorrectTag = TextRulerExampleDocument.exampleListContainsAnnotation(correctTags, |
| needle); |
| if (rightCorrectTag != null) |
| break; |
| } |
| |
| TextRulerExample theCorrectTag = null; |
| if (rightDistance < leftDistance && rightCorrectTag != null) |
| theCorrectTag = rightCorrectTag; |
| else if (rightDistance > leftDistance && leftCorrectTag != null) |
| theCorrectTag = leftCorrectTag; |
| else // use the one that would lie in the slot filler: |
| { |
| if (target.type == MLTargetType.SINGLE_LEFT_BOUNDARY && rightCorrectTag != null) |
| theCorrectTag = rightCorrectTag; |
| else |
| theCorrectTag = leftCorrectTag; |
| } |
| |
| if (theCorrectTag != null) { |
| TextRulerToolkit.log("FOUND BAD EXAMPLE FOR SHIFTING !!"); |
| TextRulerShiftExample shiftExample = new TextRulerShiftExample(this, |
| wrongTag.getAnnotation(), theCorrectTag.getAnnotation(), true, target); |
| result.add(shiftExample); |
| } |
| } |
| // GlobalCASSource.releaseCAS(testCAS); |
| } else { |
| List<AnnotationFS> slots = TextRulerToolkit.extractAnnotationsForSlotName( |
| aCas, |
| createFromRawTypeName ? target.getSingleSlotRawTypeName() : target |
| .getSingleSlotTypeName()); // do not use |
| // boundary type |
| // here since we |
| // seek for the |
| // orignial slot |
| // ! |
| for (AnnotationFS a : slots) { |
| result.add(new TextRulerExample(this, TextRulerToolkit.convertToTargetAnnotation(a, this, |
| target, aCas.getTypeSystem()), true, target)); |
| } |
| } |
| return result; |
| } |
| |
| protected void createNegativeExamplesForTarget(TextRulerTarget target) { |
| // the default implementation does not support negative examples, |
| // subclasses can overwrite |
| // this if needed... or we could pass this as an argument to the |
| // constructor.... |
| } |
| |
| public void createExamplesForTarget(TextRulerTarget target) { |
| createPositiveExamplesForTarget(target); |
| createNegativeExamplesForTarget(target); |
| } |
| |
| public void clearCurrentExamples() { |
| positiveExamples.clear(); |
| negativeExamples.clear(); |
| } |
| |
| // pass your test CAS object and the corresponding learning target to get a |
| // filled |
| // test-CAS for testing e.g. rule or rule set.. |
| // caution: testCas gets reset fist! |
| public void resetAndFillTestCAS(CAS testCas, TextRulerTarget target) { |
| testCas.reset(); |
| CAS docCas = getCAS(); |
| |
| CasCopier cc = new CasCopier(docCas, testCas); |
| testCas.setDocumentText(docCas.getDocumentText()); |
| |
| // copy all annotations except the target-annotations: |
| TypeSystem ts = docCas.getTypeSystem(); |
| |
| List<Type> slotTypes = new ArrayList<Type>(); |
| |
| for (String s : target.getSlotTypeNames()) |
| slotTypes.add(ts.getType(s)); |
| |
| if (target.isBoundary()) { |
| // add the base types (without START and END markers) also ! |
| for (String s : target.slotNames) |
| slotTypes.add(ts.getType(s)); |
| } |
| |
| for (AnnotationFS fs : docCas.getAnnotationIndex()) { |
| if (!slotTypes.contains(fs.getType()) |
| && !fs.getType().equals(docCas.getDocumentAnnotation().getType())) { |
| FeatureStructure copyFs = cc.copyFs(fs); |
| testCas.addFsToIndexes(copyFs); |
| } |
| } |
| } |
| |
| public String getCasFileName() { |
| return casFileName; |
| } |
| |
| public static void createBoundaryAnnotationsForCas(CAS aCas, String slotName, |
| Set<String> tokenFilterSet) { |
| List<AnnotationFS> slots = TextRulerToolkit.extractAnnotationsForSlotName(aCas, slotName); |
| TypeSystem ts = aCas.getTypeSystem(); |
| for (AnnotationFS a : slots) { |
| |
| List<AnnotationFS> slotTokens = TextRulerToolkit.getAnnotationsWithinBounds(aCas, |
| a.getBegin(), a.getEnd(), |
| TextRulerToolkit.getFilterSetWithSlotName(slotName, tokenFilterSet), |
| ts.getType(TextRulerToolkit.RUTA_ANY_TYPE_NAME)); |
| if (!slotTokens.isEmpty()) { |
| AnnotationFS first = slotTokens.get(0); |
| AnnotationFS last = slotTokens.get(slotTokens.size() - 1); |
| Type typeLB = ts.getType(slotName + TextRulerToolkit.LEFT_BOUNDARY_EXTENSION); |
| aCas.addFsToIndexes(aCas.createAnnotation(typeLB, first.getBegin(), first.getEnd())); |
| Type typeRB = ts.getType(slotName + TextRulerToolkit.RIGHT_BOUNDARY_EXTENSION); |
| aCas.addFsToIndexes(aCas.createAnnotation(typeRB, last.getBegin(), last.getEnd())); |
| } |
| } |
| } |
| |
| public static void removeBoundaryAnnotationsFromCas(CAS aCas, String slotName) { |
| // this method is not tested yet! |
| TypeSystem ts = aCas.getTypeSystem(); |
| Type startType = ts.getType(slotName + TextRulerToolkit.LEFT_BOUNDARY_EXTENSION); |
| Type endType = ts.getType(slotName + TextRulerToolkit.RIGHT_BOUNDARY_EXTENSION); |
| List<AnnotationFS> removeList = new ArrayList<AnnotationFS>(); |
| for (FSIterator<AnnotationFS> it = aCas.getAnnotationIndex(startType).iterator(true); it |
| .isValid(); it.moveToNext()) { |
| AnnotationFS fs = it.get(); |
| removeList.add(fs); |
| } |
| for (FSIterator<AnnotationFS> it = aCas.getAnnotationIndex(endType).iterator(true); it |
| .isValid(); it.moveToNext()) { |
| AnnotationFS fs = it.get(); |
| removeList.add(fs); |
| } |
| for (AnnotationFS fs : removeList) |
| aCas.removeFsFromIndexes(fs); |
| } |
| |
| public static synchronized TextRulerExample exampleListContainsAnnotation( |
| List<TextRulerExample> list, TextRulerAnnotation ann) { |
| TextRulerExample needle = new TextRulerExample(null, ann, true, null); |
| |
| int index = Collections.binarySearch(list, needle, new Comparator<TextRulerExample>() { |
| public int compare(TextRulerExample o1, TextRulerExample o2) { |
| TextRulerAnnotation afs1 = o1.getAnnotation(); |
| TextRulerAnnotation afs2 = o2.getAnnotation(); |
| if (afs1.getBegin() < afs2.getBegin()) |
| return -1; |
| else if (afs1.getBegin() > afs2.getBegin()) |
| return 1; |
| else |
| return 0; |
| } |
| }); |
| if (index >= 0) |
| return list.get(index); |
| else |
| return null; |
| } |
| |
| } |