Merge pull request #17 from apache/UIMA-6268-internal-indexing
Uima 6268 internal indexing
diff --git a/ruta-core/src/main/java/org/apache/uima/ruta/ReindexUpdateMode.java b/ruta-core/src/main/java/org/apache/uima/ruta/ReindexUpdateMode.java
new file mode 100644
index 0000000..a81647f
--- /dev/null
+++ b/ruta-core/src/main/java/org/apache/uima/ruta/ReindexUpdateMode.java
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.uima.ruta;
+
+/**
+ * The mode that determines how the internal RutaBasics are updated. The update depends on the
+ * relevant types and annotations, which can be specified using various configuration parameters.
+ *
+ */
+public enum ReindexUpdateMode {
+
+ /**
+ * Updates the internal information completely. First all internal indexing information is removed
+ * for all relevant types. Then, all relevant annotations are added anew.
+ */
+ COMPLETE,
+
+ /**
+ * Updates the internal information additively. Each relevant annotation is checked for whether
+ * it is already registered in the internal indexing. If not, then it is added. This mode does not
+ * ensure a valid internal indexing as it can miss modifications made by previous analysis engines
+ * (in between two RutaEngines).
+ */
+ ADDITIVE,
+
+ /**
+ * This mode compares the internal indexing information with the annotation indexes and removes
+ * relevant annotations that are no longer in the annotation indexes. Then, the ADDITIVE mode is
+ * applied. This mode does not ensure a valid internal indexing as it can miss modifications
+ * concerning the offsets of an annotation.
+ *
+ * It is currently not recommended to use this mode since its implementation is slower than the
+ * mode COMPLETE.
+ */
+ SAFE_ADDITIVE,
+
+ /**
+ * This mode does not update the internal indexing information.
+ */
+ NONE;
+
+}
diff --git a/ruta-core/src/main/java/org/apache/uima/ruta/RutaBasicUtils.java b/ruta-core/src/main/java/org/apache/uima/ruta/RutaBasicUtils.java
new file mode 100644
index 0000000..5eb2841
--- /dev/null
+++ b/ruta-core/src/main/java/org/apache/uima/ruta/RutaBasicUtils.java
@@ -0,0 +1,132 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.uima.ruta;
+
+import java.util.List;
+
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.Type;
+import org.apache.uima.cas.TypeSystem;
+import org.apache.uima.cas.text.AnnotationFS;
+import org.apache.uima.cas.text.AnnotationIndex;
+import org.apache.uima.fit.util.CasUtil;
+import org.apache.uima.ruta.type.RutaBasic;
+
+/**
+ * Utility methods for modifying and updating RutaBasics outside of a Ruta script, without a
+ * RutaStream.
+ *
+ */
+public class RutaBasicUtils {
+
+ private RutaBasicUtils() {
+ // nothing here
+ }
+
+ /**
+ * Adding a given annotation to the internal indexing in the covered RutaBasics. This extends the
+ * information for begin, end and part of.
+ *
+ * @param annotation
+ * the annotation that should be added to the internal RutaBasic indexing
+ *
+ * @return true if RutaBasics have been updated. Returns false, if the given annotation is a
+ * RutaBasic, if there are no RutaBasics, or if there are no RutaBasics covered by the
+ * given annotation.
+ */
+ public static boolean addAnnotation(AnnotationFS annotation) {
+ CAS cas = annotation.getCAS();
+ TypeSystem typeSystem = cas.getTypeSystem();
+ Type basicType = typeSystem.getType(RutaBasic.class.getName());
+ Type type = annotation.getType();
+
+ if (typeSystem.subsumes(basicType, type)) {
+ return false;
+ }
+
+ AnnotationIndex<AnnotationFS> basicIndex = cas.getAnnotationIndex(basicType);
+ if (basicIndex.size() == 0) {
+ return false;
+ }
+
+ List<AnnotationFS> coveredBasics = CasUtil.selectCovered(basicType, annotation);
+ if (coveredBasics.size() == 0) {
+ return false;
+ }
+
+ RutaBasic firstBasic = (RutaBasic) coveredBasics.get(0);
+ RutaBasic lastBasic = (RutaBasic) coveredBasics.get(coveredBasics.size() - 1);
+
+ firstBasic.addBegin(annotation, type);
+ lastBasic.addEnd(annotation, type);
+
+ for (AnnotationFS each : coveredBasics) {
+ RutaBasic rutaBasic = (RutaBasic) each;
+ rutaBasic.addPartOf(type);
+ }
+
+ return true;
+ }
+
+ /**
+ * Removing a given annotation from the internal indexing in the covered RutaBasics. This reduces
+ * the information for begin, end and part of.
+ *
+ * @param annotation
+ * the annotation that should be removed from the internal RutaBasic indexing
+ *
+ * @return true if RutaBasics have been updated. Returns false, if the given annotation is a
+ * RutaBasic, if there are no RutaBasics, or if there are no RutaBasics covered by the
+ * given annotation.
+ */
+ public static boolean removeAnnotation(AnnotationFS annotation) {
+ CAS cas = annotation.getCAS();
+ TypeSystem typeSystem = cas.getTypeSystem();
+ Type basicType = typeSystem.getType(RutaBasic.class.getName());
+ Type type = annotation.getType();
+
+ if (typeSystem.subsumes(basicType, type)) {
+ return false;
+ }
+
+ AnnotationIndex<AnnotationFS> basicIndex = cas.getAnnotationIndex(basicType);
+ if (basicIndex.size() == 0) {
+ return false;
+ }
+
+ List<AnnotationFS> coveredBasics = CasUtil.selectCovered(basicType, annotation);
+ if (coveredBasics.size() == 0) {
+ return false;
+ }
+
+ RutaBasic firstBasic = (RutaBasic) coveredBasics.get(0);
+ RutaBasic lastBasic = (RutaBasic) coveredBasics.get(coveredBasics.size() - 1);
+
+ firstBasic.removeBegin(annotation, type);
+ lastBasic.removeEnd(annotation, type);
+
+ for (AnnotationFS each : coveredBasics) {
+ RutaBasic rutaBasic = (RutaBasic) each;
+ rutaBasic.removePartOf(type);
+ }
+
+ return true;
+ }
+
+}
diff --git a/ruta-core/src/main/java/org/apache/uima/ruta/RutaIndexingConfiguration.java b/ruta-core/src/main/java/org/apache/uima/ruta/RutaIndexingConfiguration.java
new file mode 100644
index 0000000..1747f22
--- /dev/null
+++ b/ruta-core/src/main/java/org/apache/uima/ruta/RutaIndexingConfiguration.java
@@ -0,0 +1,117 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.uima.ruta;
+
+public class RutaIndexingConfiguration {
+
+ private String[] indexOnly;
+
+ private String[] indexSkipTypes;
+
+ private boolean indexOnlyMentionedTypes;
+
+ private String[] indexAdditionally;
+
+ private String[] reindexOnly;
+
+ private String[] reindexSkipTypes;
+
+ private boolean reindexOnlyMentionedTypes;
+
+ private String[] reindexAdditionally;
+
+ private ReindexUpdateMode reindexUpdateMode;
+
+ public RutaIndexingConfiguration() {
+ super();
+ }
+
+ public String[] getIndexSkipTypes() {
+ return indexSkipTypes;
+ }
+
+ public void setIndexSkipTypes(String[] indexSkipTypes) {
+ this.indexSkipTypes = indexSkipTypes;
+ }
+
+ public String[] getReindexOnly() {
+ return reindexOnly;
+ }
+
+ public void setReindexOnly(String[] reindexOnly) {
+ this.reindexOnly = reindexOnly;
+ }
+
+ public boolean isReindexOnlyMentionedTypes() {
+ return reindexOnlyMentionedTypes;
+ }
+
+ public void setReindexOnlyMentionedTypes(boolean reindexOnlyMentionedTypes) {
+ this.reindexOnlyMentionedTypes = reindexOnlyMentionedTypes;
+ }
+
+ public String[] getReindexSkipTypes() {
+ return reindexSkipTypes;
+ }
+
+ public void setReindexSkipTypes(String[] reindexSkipTypes) {
+ this.reindexSkipTypes = reindexSkipTypes;
+ }
+
+ public ReindexUpdateMode getReindexUpdateMode() {
+ return reindexUpdateMode;
+ }
+
+ public void setReindexUpdateMode(ReindexUpdateMode reindexUpdateMode) {
+ this.reindexUpdateMode = reindexUpdateMode;
+ }
+
+ public String[] getIndexOnly() {
+ return indexOnly;
+ }
+
+ public void setIndexOnly(String[] indexOnly) {
+ this.indexOnly = indexOnly;
+ }
+
+ public boolean isIndexOnlyMentionedTypes() {
+ return indexOnlyMentionedTypes;
+ }
+
+ public void setIndexOnlyMentionedTypes(boolean indexOnlyMentionedTypes) {
+ this.indexOnlyMentionedTypes = indexOnlyMentionedTypes;
+ }
+
+ public String[] getIndexAdditionally() {
+ return indexAdditionally;
+ }
+
+ public void setIndexAdditionally(String[] indexAdditionally) {
+ this.indexAdditionally = indexAdditionally;
+ }
+
+ public String[] getReindexAdditionally() {
+ return reindexAdditionally;
+ }
+
+ public void setReindexAdditionally(String[] reindexAdditionally) {
+ this.reindexAdditionally = reindexAdditionally;
+ }
+
+}
diff --git a/ruta-core/src/main/java/org/apache/uima/ruta/RutaStream.java b/ruta-core/src/main/java/org/apache/uima/ruta/RutaStream.java
index f9a2d07..53bce8a 100644
--- a/ruta-core/src/main/java/org/apache/uima/ruta/RutaStream.java
+++ b/ruta-core/src/main/java/org/apache/uima/ruta/RutaStream.java
@@ -23,10 +23,10 @@
import static java.util.Collections.emptySet;
import java.util.ArrayList;
-import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
+import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
@@ -35,7 +35,6 @@
import java.util.NoSuchElementException;
import java.util.Set;
import java.util.TreeMap;
-import java.util.TreeSet;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
@@ -44,6 +43,7 @@
import org.apache.uima.cas.CASException;
import org.apache.uima.cas.ConstraintFactory;
import org.apache.uima.cas.DoubleArrayFS;
+import org.apache.uima.cas.FSIndex;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.cas.FSMatchConstraint;
import org.apache.uima.cas.Feature;
@@ -53,6 +53,7 @@
import org.apache.uima.cas.StringArrayFS;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.TypeSystem;
+import org.apache.uima.cas.impl.TypeImpl;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.cas.text.AnnotationIndex;
import org.apache.uima.fit.util.CasUtil;
@@ -83,7 +84,6 @@
import org.apache.uima.ruta.rule.AbstractRule;
import org.apache.uima.ruta.rule.AbstractRuleMatch;
import org.apache.uima.ruta.rule.MatchContext;
-import org.apache.uima.ruta.rule.RuleElement;
import org.apache.uima.ruta.type.RutaAnnotation;
import org.apache.uima.ruta.type.RutaBasic;
import org.apache.uima.ruta.type.RutaOptional;
@@ -213,113 +213,281 @@
currentIt = filter.createFilteredIterator(cas, basicType);
}
+ @Deprecated
public void initalizeBasics(String[] reindexOnly, boolean reindexOnlyMentionedTypes) {
+ RutaIndexingConfiguration config = new RutaIndexingConfiguration();
+ config.setReindexOnly(reindexOnly);
+ config.setReindexOnlyMentionedTypes(reindexOnlyMentionedTypes);
+ config.setReindexUpdateMode(ReindexUpdateMode.ADDITIVE);
+ initalizeBasics(config);
+ }
+
+ public void initalizeBasics(RutaIndexingConfiguration config) {
+
AnnotationIndex<AnnotationFS> basicIndex = cas.getAnnotationIndex(basicType);
+ boolean basicsAvailable = basicIndex.size() != 0;
- Collection<Type> reindexTypeList;
- if (reindexOnlyMentionedTypes) {
- reindexTypeList = removeSubsumedTypes(typeUsage.getUsedTypes(), cas.getTypeSystem());
- } else {
- reindexTypeList = removeSubsumedTypes(Arrays.asList(reindexOnly), cas.getTypeSystem());
+ if (config.getReindexUpdateMode() == ReindexUpdateMode.NONE && basicsAvailable) {
+
+ // there are already some ruta basics and we do not want to update anything, since we know we
+ // do not need to. Only set internal maps
+ initializeInternalAnchorMaps(basicIndex);
+ return;
}
- final List<AnnotationFS> allAnnotations = new LinkedList<>();
- for (Type type : reindexTypeList) {
- AnnotationIndex<AnnotationFS> annotationIndex = null;
+ if (!basicsAvailable) {
+ // indexing
+ createBasics(config);
+ } else {
+ // reindexing
+ updateBasics(basicIndex, config);
+ }
+ }
+
+ private void createBasics(RutaIndexingConfiguration config) {
+ TypeSystem typeSystem = cas.getTypeSystem();
+ Collection<Type> indexTypes;
+ if (config.isIndexOnlyMentionedTypes()) {
+ indexTypes = convertNamesToTypes(typeUsage.getUsedTypes().toArray(new String[0]), typeSystem);
+ } else {
+ indexTypes = convertNamesToTypes(config.getIndexOnly(), typeSystem);
+ }
+ Collection<Type> indexSkipTypes = convertNamesToTypes(config.getIndexSkipTypes(), typeSystem);
+ Collection<Type> indexParentTypes = removeSubsumedTypes(indexTypes, typeSystem);
+ Collection<Type> allIndexTypes = expandToAllSubtypes(indexTypes, indexSkipTypes, typeSystem);
+
+ List<FSIndex<AnnotationFS>> relevantIndexes = getRelevantIndexes(typeSystem, indexParentTypes,
+ indexSkipTypes);
+ createBasics(relevantIndexes, allIndexTypes);
+ }
+
+ private void updateBasics(AnnotationIndex<AnnotationFS> basicIndex,
+ RutaIndexingConfiguration config) {
+ TypeSystem typeSystem = cas.getTypeSystem();
+ Collection<Type> reindexTypes;
+ if (config.isReindexOnlyMentionedTypes()) {
+ reindexTypes = convertNamesToTypes(typeUsage.getUsedTypes().toArray(new String[0]),
+ typeSystem);
+ } else {
+ reindexTypes = convertNamesToTypes(config.getReindexOnly(), typeSystem);
+ }
+ Collection<Type> reindexSkipTypes = convertNamesToTypes(config.getReindexSkipTypes(),
+ typeSystem);
+ Collection<Type> reindexParentTypes = removeSubsumedTypes(reindexTypes, typeSystem);
+ Collection<Type> allReindexTypes = expandToAllSubtypes(reindexTypes, reindexSkipTypes,
+ typeSystem);
+ List<FSIndex<AnnotationFS>> relevantIndexes = getRelevantIndexes(typeSystem, reindexParentTypes,
+ reindexSkipTypes);
+ updateBasics(basicIndex, relevantIndexes, allReindexTypes, config.getReindexUpdateMode());
+
+ }
+
+ private List<FSIndex<AnnotationFS>> getRelevantIndexes(TypeSystem typeSystem,
+ Collection<Type> rootReindexTypeList, Collection<Type> skipTypeList) {
+ List<FSIndex<AnnotationFS>> relevantIndexes = new ArrayList<>();
+
+ for (Type type : rootReindexTypeList) {
+
+ if (skipTypeForIndexing(type, skipTypeList, typeSystem)) {
+ continue;
+ }
+
if (StringUtils.equals(type.getName(), CAS.TYPE_NAME_ANNOTATION)) {
- annotationIndex = cas.getAnnotationIndex();
+ relevantIndexes.add(cas.getAnnotationIndex().withSnapshotIterators());
} else {
- annotationIndex = cas.getAnnotationIndex(type);
+ relevantIndexes.add(cas.getAnnotationIndex(type).withSnapshotIterators());
}
- for (AnnotationFS a : annotationIndex) {
- if (a.getBegin() != a.getEnd() || a.equals(cas.getDocumentAnnotation())) {
- allAnnotations.add(a);
+ }
+ return relevantIndexes;
+ }
+
+ private boolean skipTypeForIndexing(Type type, Collection<Type> skipTypeList,
+ TypeSystem typeSystem) {
+ // collect no ruta basics
+ if (typeSystem.subsumes(basicType, type)) {
+ return true;
+ }
+ if (skipTypeList != null) {
+ if (skipTypeList.contains(type)) {
+ return true;
+ } else {
+ for (Type skipType : skipTypeList) {
+ if (typeSystem.subsumes(skipType, type)) {
+ return true;
+ }
+ }
+ }
+ }
+ return false;
+ }
+
+ private void createBasics(List<FSIndex<AnnotationFS>> relevantIndexes,
+ Collection<Type> allIndexTypes) {
+
+ List<Integer> anchors = getSortedUniqueAnchors(relevantIndexes);
+ createBasicsForAnchors(anchors);
+
+ // add all annotations
+ for (FSIndex<AnnotationFS> index : relevantIndexes) {
+ for (AnnotationFS a : index) {
+ // consider skipped types
+ if (allIndexTypes == null || allIndexTypes.contains(a.getType())) {
+ addAnnotation(a, false, false, null);
}
}
}
- if (basicIndex.size() == 0) {
- TreeSet<Integer> anchors = new TreeSet<>();
- for (AnnotationFS a : allAnnotations) {
- anchors.add(a.getBegin());
- anchors.add(a.getEnd());
- }
- if (anchors.size() == 0) {
- // empty document
- createRutaBasic(0, 0);
- } else if (anchors.size() == 1) {
- Integer first = anchors.pollFirst();
- createRutaBasic(first, first);
- } else {
- while (true) {
- Integer first = anchors.pollFirst();
- if (first == null || anchors.isEmpty()) {
- break;
- }
- Integer second = anchors.first();
- if (first < second) {
- createRutaBasic(first, second);
- }
- }
- }
- for (AnnotationFS a : allAnnotations) {
- addAnnotation(a, false, false, null);
- }
- updateIterators(documentAnnotation);
- } else {
- for (AnnotationFS e : basicIndex) {
- beginAnchors.put(e.getBegin(), (RutaBasic) e);
- endAnchors.put(e.getEnd(), (RutaBasic) e);
- }
+ updateIterators(documentAnnotation);
+ }
- RutaBasic firstBasic = (RutaBasic) basicIndex.iterator().get();
- if (firstBasic.isLowMemoryProfile() != lowMemoryProfile) {
- for (AnnotationFS each : basicIndex) {
- RutaBasic eachBasic = (RutaBasic) each;
- eachBasic.setLowMemoryProfile(lowMemoryProfile);
- }
- }
- // TODO: find a better solution for this:
- for (AnnotationFS a : allAnnotations) {
- Type type = a.getType();
- if (!type.equals(basicType)) {
- RutaBasic beginAnchor = getBeginAnchor(a.getBegin());
- RutaBasic endAnchor = getEndAnchor(a.getEnd());
- boolean shouldBeAdded = false;
- if (beginAnchor == null || endAnchor == null) {
- shouldBeAdded = true;
- } else {
- Collection<AnnotationFS> set = beginAnchor.getBeginAnchors(type);
- if (!set.contains(a)) {
- shouldBeAdded = true;
- }
- }
- if (shouldBeAdded) {
- addAnnotation(a, false, false, null);
- }
+ private void createBasicsForAnchors(List<Integer> anchors) {
+ if (anchors.size() == 0) {
+ // empty document
+ createRutaBasic(0, 0);
+ } else if (anchors.size() == 1) {
+ Integer first = anchors.get(0);
+ createRutaBasic(first, first);
+ } else {
+ for (int i = 0; i < anchors.size() - 1; i++) {
+ Integer first = anchors.get(i);
+ Integer second = anchors.get(i + 1);
+ if (first < second) { // not really needed
+ createRutaBasic(first, second);
}
}
}
}
- private Collection<Type> removeSubsumedTypes(Collection<String> typeNames,
- TypeSystem typeSystem) {
- Collection<Type> allTypes = new HashSet<>();
- for (String each : typeNames) {
- Type type = typeSystem.getType(each);
- if (type != null) {
- allTypes.add(type);
+ private List<Integer> getSortedUniqueAnchors(List<FSIndex<AnnotationFS>> relevantIndexes) {
+ Set<Integer> anchorSet = new HashSet<>();
+ for (FSIndex<AnnotationFS> annotationIndex : relevantIndexes) {
+ for (AnnotationFS a : annotationIndex) {
+ anchorSet.add(a.getBegin());
+ anchorSet.add(a.getEnd());
}
}
- List<Type> rootTypes = new ArrayList<>(allTypes);
- for (Type type1 : allTypes) {
- for (Type type2 : allTypes) {
- if (type1 != type2 && typeSystem.subsumes(type1, type2)) {
- rootTypes.remove(type2);
+ List<Integer> anchors = new ArrayList<>(anchorSet);
+ Collections.sort(anchors);
+ return anchors;
+ }
+
+ private void updateBasics(AnnotationIndex<AnnotationFS> basicIndex,
+ List<FSIndex<AnnotationFS>> relevantIndexes, Collection<Type> allReindexTypes,
+ ReindexUpdateMode indexUpdateMode) {
+
+ initializeInternalAnchorMaps(basicIndex);
+
+ updateRutaBasicMemoryProfile(basicIndex);
+
+ switch (indexUpdateMode) {
+ case COMPLETE:
+ updateBasicsComplete(basicIndex, relevantIndexes, allReindexTypes);
+ break;
+ case ADDITIVE:
+ updateBasicsAdditive(basicIndex, relevantIndexes);
+ break;
+ case SAFE_ADDITIVE:
+ updateBasicsSafeAdditive(basicIndex, relevantIndexes, allReindexTypes);
+ break;
+ case NONE:
+ // do nothing
+ break;
+
+ default:
+ throw new IllegalArgumentException(
+ "The given IndexUpdateMode is not supported: " + indexUpdateMode);
+ }
+
+ }
+
+ private void updateBasicsComplete(AnnotationIndex<AnnotationFS> basicIndex,
+ List<FSIndex<AnnotationFS>> relevantIndexes, Collection<Type> completeReindexTypeList) {
+
+ // cleanup index info for given types
+ for (AnnotationFS each : basicIndex) {
+ RutaBasic rutaBasic = (RutaBasic) each;
+ for (Type type : completeReindexTypeList) {
+ int code = ((TypeImpl) type).getCode();
+ rutaBasic.getPartOf()[code] = 0;
+ rutaBasic.getBeginMap()[code] = null;
+ rutaBasic.getEndMap()[code] = null;
+ }
+ }
+
+ // add all annotations
+ for (FSIndex<AnnotationFS> index : relevantIndexes) {
+ for (AnnotationFS a : index) {
+ // consider skipped types
+ if (completeReindexTypeList.contains(a.getType())) {
+ addAnnotation(a, false, false, null);
}
}
}
- return rootTypes;
+ }
+
+ private void updateBasicsAdditive(AnnotationIndex<AnnotationFS> basicIndex,
+ List<FSIndex<AnnotationFS>> relevantIndexes) {
+
+ // adds annotation only if not already known and included
+ for (FSIndex<AnnotationFS> annotationIndex : relevantIndexes) {
+ for (AnnotationFS a : annotationIndex) {
+ Type type = a.getType();
+ RutaBasic beginAnchor = getBeginAnchor(a.getBegin());
+ RutaBasic endAnchor = getEndAnchor(a.getEnd());
+ boolean shouldBeAdded = false;
+ if (beginAnchor == null || endAnchor == null) {
+ shouldBeAdded = true;
+ } else {
+ Collection<AnnotationFS> set = beginAnchor.getBeginAnchors(type);
+ if (!set.contains(a)) {
+ shouldBeAdded = true;
+ }
+ }
+ if (shouldBeAdded) {
+ addAnnotation(a, false, false, null);
+ }
+ }
+ }
+ }
+
+ private void updateBasicsSafeAdditive(AnnotationIndex<AnnotationFS> basicIndex,
+ List<FSIndex<AnnotationFS>> relevantIndexes, Collection<Type> completeReindexTypeList) {
+
+ // search for removed annotations, and remove them
+ for (AnnotationFS each : basicIndex) {
+ RutaBasic rutaBasic = (RutaBasic) each;
+ for (Type type : completeReindexTypeList) {
+ // it's sufficient to check begin anchors, end should be consistent
+ Collection<AnnotationFS> beginAnchors = rutaBasic.getBeginAnchors(type);
+ Collection<AnnotationFS> toRemove = new ArrayList<>();
+ for (AnnotationFS annotationAtAnchor : beginAnchors) {
+ if (!annotationAtAnchor.getCAS().getAnnotationIndex().contains(annotationAtAnchor)) {
+ // not in index? -> was removed
+ toRemove.add(annotationAtAnchor);
+ }
+ }
+ toRemove.forEach(a -> removeAnnotation(a));
+ }
+ }
+
+ updateBasicsAdditive(basicIndex, relevantIndexes);
+ }
+
+ private void initializeInternalAnchorMaps(AnnotationIndex<AnnotationFS> basicIndex) {
+ for (AnnotationFS e : basicIndex) {
+ beginAnchors.put(e.getBegin(), (RutaBasic) e);
+ endAnchors.put(e.getEnd(), (RutaBasic) e);
+ }
+ }
+
+ private void updateRutaBasicMemoryProfile(AnnotationIndex<AnnotationFS> basicIndex) {
+ RutaBasic firstBasic = (RutaBasic) basicIndex.iterator().get();
+ if (firstBasic.isLowMemoryProfile() != lowMemoryProfile) {
+ for (AnnotationFS each : basicIndex) {
+ RutaBasic eachBasic = (RutaBasic) each;
+ eachBasic.setLowMemoryProfile(lowMemoryProfile);
+ }
+ }
}
private RutaBasic createRutaBasic(int begin, int end) {
@@ -344,7 +512,10 @@
public void addAnnotation(AnnotationFS annotation, boolean addToIndex, boolean updateInternal,
AbstractRuleMatch<? extends AbstractRule> creator) {
Type type = annotation.getType();
- if (type.equals(basicType)) {
+ // no internal indexing for basics themselves or for zero-length annotations, except for the
+ // DocumentAnnotation
+ if (type.equals(basicType) || (annotation.getBegin() >= annotation.getEnd()
+ && !annotation.equals(cas.getDocumentAnnotation()))) {
return;
}
if (indexType(annotation.getType())) {
@@ -1462,6 +1633,64 @@
return null;
}
+ private Collection<Type> removeSubsumedTypes(Collection<Type> types, TypeSystem typeSystem) {
+ List<Type> rootTypes = new ArrayList<>(types);
+ for (Type type1 : types) {
+ for (Type type2 : types) {
+ if (type1 != type2 && typeSystem.subsumes(type1, type2)) {
+ rootTypes.remove(type2);
+ }
+ }
+ }
+ return rootTypes;
+ }
+
+ private Collection<Type> expandToAllSubtypes(Collection<Type> reindexTypeList,
+ Collection<Type> reindexSkipTypes, TypeSystem typeSystem) {
+ if (reindexTypeList.isEmpty()) {
+ return Collections.emptyList();
+ }
+
+ Collection<Type> result = new LinkedHashSet<>();
+ for (Type type : reindexTypeList) {
+
+ if (skipTypeForIndexing(type, reindexSkipTypes, typeSystem)) {
+ continue;
+ }
+
+ result.add(type);
+ List<Type> properlySubsumedTypes = typeSystem.getProperlySubsumedTypes(type);
+ for (Type subType : properlySubsumedTypes) {
+ if (skipTypeForIndexing(subType, reindexSkipTypes, typeSystem)) {
+ continue;
+ }
+ result.add(subType);
+ }
+
+ // if we started with uima.tcas.Annotation, we already collected all
+ if (type.getName().equals(CAS.TYPE_NAME_ANNOTATION)) {
+ return result;
+ }
+ }
+
+ return result;
+ }
+
+ private Collection<Type> convertNamesToTypes(String[] typeNames, TypeSystem typeSystem) {
+ if (typeNames == null) {
+ return Collections.emptyList();
+ }
+
+ Collection<Type> result = new ArrayList<>(typeNames.length);
+ for (String each : typeNames) {
+ Type type = typeSystem.getType(each);
+ if (type != null) {
+ result.add(type);
+ }
+ }
+ return result;
+ }
+
public void setMaxRuleMatches(long maxRuleMatches) {
this.maxRuleMatches = maxRuleMatches;
}
@@ -1477,4 +1706,5 @@
public long getMaxRuleElementMatches() {
return maxRuleElementMatches;
}
+
}
diff --git a/ruta-core/src/main/java/org/apache/uima/ruta/engine/RutaEngine.java b/ruta-core/src/main/java/org/apache/uima/ruta/engine/RutaEngine.java
index 6101979..d676528 100644
--- a/ruta-core/src/main/java/org/apache/uima/ruta/engine/RutaEngine.java
+++ b/ruta-core/src/main/java/org/apache/uima/ruta/engine/RutaEngine.java
@@ -61,8 +61,10 @@
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.ResourceManager;
import org.apache.uima.ruta.FilterManager;
+import org.apache.uima.ruta.ReindexUpdateMode;
import org.apache.uima.ruta.RutaConstants;
import org.apache.uima.ruta.RutaEnvironment;
+import org.apache.uima.ruta.RutaIndexingConfiguration;
import org.apache.uima.ruta.RutaModule;
import org.apache.uima.ruta.RutaScriptFactory;
import org.apache.uima.ruta.RutaStream;
@@ -429,6 +431,18 @@
private String[] varValues;
/**
+ * This parameter specifies the annotation types which should be indexed for ruta's internal
+ * annotations. All annotation types that are relevant need to be listed here. The value of this
+ * parameter only needs to be adapted for performance and memory optimization in pipelines that
+ * contain several ruta analysis engines. Default value is uima.tcas.Annotation.
+ */
+ public static final String PARAM_INDEX_ONLY = "indexOnly";
+
+ @ConfigurationParameter(name = PARAM_INDEX_ONLY, mandatory = false, defaultValue = {
+ "uima.tcas.Annotation" })
+ private String[] indexOnly;
+
+ /**
* This parameter specifies the annotation types which should be reindexed for ruta's internal
* annotations. All annotation types that changed since the last call of a ruta script need to be
* listed here. The value of this parameter needs only be adapted for performance optimization in
@@ -441,16 +455,6 @@
private String[] reindexOnly;
/**
- * If this parameter is activated, then only annotations of types are internally reindexed at
- * beginning that are mentioned with in the rules. This parameter overrides the values of the
- * parameter 'reindexOnly' with the types that are mentioned in the rules.
- */
- public static final String PARAM_REINDEX_ONLY_MENTIONED_TYPES = "reindexOnlyMentionedTypes";
-
- @ConfigurationParameter(name = PARAM_REINDEX_ONLY_MENTIONED_TYPES, mandatory = true, defaultValue = "false")
- private boolean reindexOnlyMentionedTypes;
-
- /**
* If this parameter is activated, then only annotations of types are internally indexed that are
* mentioned with in the rules. This optimization of the internal indexing can improve the speed
* and reduce the memory footprint. However, several features of the rule matching require the
@@ -463,8 +467,38 @@
private boolean indexOnlyMentionedTypes;
/**
+ * If this parameter is activated, then only annotations of types that are mentioned within the
+ * rules are internally reindexed at the beginning. This parameter overrides the values of the
+ * parameter 'reindexOnly' with the types that are mentioned in the rules.
+ */
+ public static final String PARAM_REINDEX_ONLY_MENTIONED_TYPES = "reindexOnlyMentionedTypes";
+
+ @ConfigurationParameter(name = PARAM_REINDEX_ONLY_MENTIONED_TYPES, mandatory = true, defaultValue = "false")
+ private boolean reindexOnlyMentionedTypes;
+
+ /**
+ * This parameter specifies annotation types that should not be indexed at all. These types
+ * normally include annotations that provide no meaningful semantics for text processing, e.g.,
+ * types concerning ruta debug information.
+ */
+ public static final String PARAM_INDEX_SKIP_TYPES = "indexSkipTypes";
+
+ @ConfigurationParameter(name = PARAM_INDEX_SKIP_TYPES, mandatory = true, defaultValue = {})
+ private String[] indexSkipTypes;
+
+ /**
+ * This parameter specifies annotation types that should not be reindexed. These types normally
+ * include annotations that are added once and are not changed in the following pipeline, e.g.,
+ * Tokens or TokenSeed (like CW).
+ */
+ public static final String PARAM_REINDEX_SKIP_TYPES = "reindexSkipTypes";
+
+ @ConfigurationParameter(name = PARAM_REINDEX_SKIP_TYPES, mandatory = true, defaultValue = {})
+ private String[] reindexSkipTypes;
+
+ /**
* This parameter specifies annotation types (resolvable mentions are also supported) that should
- * be index additionally to types mentioned in the rules. This parameter is only used if the
+ * be indexed additionally to types mentioned in the rules. This parameter is only used if the
* parameter 'indexOnlyMentionedTypes' is activated.
*
*/
@@ -474,6 +508,27 @@
private String[] indexAdditionally;
/**
+ * This parameter specifies annotation types that should be reindexed additionally to types
+ * mentioned in the rules. This parameter is only used if the parameter
+ * 'reindexOnlyMentionedTypes' is activated.
+ */
+ public static final String PARAM_REINDEX_ADDITONALLY = "reindexAdditionally";
+
+ @ConfigurationParameter(name = PARAM_REINDEX_ADDITONALLY, mandatory = false, defaultValue = {})
+ private String[] reindexAdditionally;
+
+ /**
+ * This parameter specifies the mode for updating the internal indexing in RutaBasic annotations.
+ * This is a technical parameter for optimizing the runtime performance/speed of RutaEngines.
+ * Available modes are: COMPLETE, ADDITIVE, SAFE_ADDITIVE, NONE. Default value is ADDITIVE.
+ *
+ */
+ public static final String PARAM_REINDEX_UPDATE_MODE = "reindexUpdateMode";
+
+ @ConfigurationParameter(name = PARAM_REINDEX_UPDATE_MODE, mandatory = true, defaultValue = "ADDITIVE")
+ private ReindexUpdateMode reindexUpdateMode;
+
+ /**
* This parameter determines positions as invisible if the internal indexing of the corresponding
* RutaBasic annotation is empty.
*/
@@ -810,11 +865,24 @@
seedTypes = seedAnnotations(cas);
RutaStream stream = new RutaStream(cas, basicType, filter, lowMemoryProfile,
simpleGreedyForComposed, emptyIsInvisible, typeUsageInformation, crowd);
-
- stream.initalizeBasics(reindexOnly, reindexOnlyMentionedTypes);
+ stream.initalizeBasics(createRutaIndexingConfiguration());
return stream;
}
+ /**
+ * Collects the index and reindex parameters of this engine in a single configuration object.
+ *
+ * @return the configuration that is passed to the stream for initializing the basics
+ */
+ private RutaIndexingConfiguration createRutaIndexingConfiguration() {
+ RutaIndexingConfiguration config = new RutaIndexingConfiguration();
+ // parameters controlling the indexing
+ config.setIndexOnly(indexOnly);
+ config.setIndexSkipTypes(indexSkipTypes);
+ config.setIndexOnlyMentionedTypes(indexOnlyMentionedTypes);
+ config.setIndexAdditionally(indexAdditionally);
+ // parameters controlling the reindexing
+ config.setReindexOnly(reindexOnly);
+ config.setReindexSkipTypes(reindexSkipTypes);
+ config.setReindexOnlyMentionedTypes(reindexOnlyMentionedTypes);
+ config.setReindexAdditionally(reindexAdditionally);
+ config.setReindexUpdateMode(reindexUpdateMode);
+ return config;
+ }
+
private List<Type> seedAnnotations(CAS cas) throws AnalysisEngineProcessException {
List<Type> result = new ArrayList<Type>();
if (seeders != null) {
diff --git a/ruta-core/src/test/java/org/apache/uima/ruta/EmptyDocumentTest.java b/ruta-core/src/test/java/org/apache/uima/ruta/EmptyDocumentTest.java
index 0ad98f1..4476a55 100644
--- a/ruta-core/src/test/java/org/apache/uima/ruta/EmptyDocumentTest.java
+++ b/ruta-core/src/test/java/org/apache/uima/ruta/EmptyDocumentTest.java
@@ -38,7 +38,7 @@
String name = RuleInference1Test.class.getSimpleName();
String namespace1 = RuleInference1Test.class.getPackage().getName().replaceAll("\\.", "/");
String namespace2 = EmptyDocumentTest.class.getPackage().getName().replaceAll("\\.", "/");
- CAS cas = RutaTestUtils.process(namespace1 + "/" + name + RutaEngine.SCRIPT_FILE_EXTENSION,
+ CAS cas = RutaTestUtils.process(namespace1 + "/" + name + RutaEngine.SCRIPT_FILE_EXTENSION,
namespace2 + "/" + "EmptyDocumentTest.txt", 50);
AnnotationIndex<AnnotationFS> ai = null;
@@ -60,7 +60,6 @@
CAS cas = RutaTestUtils.getCAS("");
Ruta.apply(cas, "Document{IS(uima.tcas.DocumentAnnotation) -> T1};");
RutaTestUtils.assertAnnotationsEquals(cas, 1, 1, "");
-
}
}
diff --git a/ruta-core/src/test/java/org/apache/uima/ruta/IndexSkipTypesTest.java b/ruta-core/src/test/java/org/apache/uima/ruta/IndexSkipTypesTest.java
new file mode 100644
index 0000000..84a23b2
--- /dev/null
+++ b/ruta-core/src/test/java/org/apache/uima/ruta/IndexSkipTypesTest.java
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.uima.ruta;
+
+import java.util.LinkedHashMap;
+import java.util.Map;
+
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.text.AnnotationFS;
+import org.apache.uima.ruta.engine.Ruta;
+import org.apache.uima.ruta.engine.RutaEngine;
+import org.apache.uima.ruta.engine.RutaTestUtils;
+import org.apache.uima.ruta.type.EvalAnnotation;
+import org.apache.uima.ruta.type.FalseNegative;
+import org.apache.uima.ruta.type.TruePositive;
+import org.junit.Test;
+
+public class IndexSkipTypesTest {
+
+ /**
+ * Applies several scripts with EvalAnnotation configured as skip type for indexing and
+ * reindexing and checks which PARTOF conditions on FalseNegative, FalsePositive and
+ * TruePositive match.
+ */
+ @Test
+ public void test() throws Exception {
+
+ String evalTypeName = EvalAnnotation.class.getName();
+ Map<String, Object> settings = new LinkedHashMap<>();
+ settings.put(RutaEngine.PARAM_REINDEX_UPDATE_MODE, ReindexUpdateMode.COMPLETE);
+ settings.put(RutaEngine.PARAM_INDEX_SKIP_TYPES, new String[] { evalTypeName });
+ settings.put(RutaEngine.PARAM_REINDEX_SKIP_TYPES, new String[] { evalTypeName });
+
+ CAS cas = RutaTestUtils.getCAS("Test this");
+
+ // FalseNegative is added outside of Ruta before the first script is applied
+ String falseNegativeName = FalseNegative.class.getName();
+ AnnotationFS falseNegative = cas
+ .createAnnotation(cas.getTypeSystem().getType(falseNegativeName), 0, 4);
+ cas.addFsToIndexes(falseNegative);
+
+ Ruta.apply(cas, "ANY{PARTOF(CW)->T1};", settings);
+ // FalsePositive is created by a rule
+ Ruta.apply(cas, "CW{->FalsePositive};", settings);
+ // no T2 is expected for the externally added FalseNegative
+ Ruta.apply(cas, "CW{PARTOF(FalseNegative)->T2};", settings);
+ // index anyway if a rule creates the annotation: T3 is expected for FalsePositive
+ Ruta.apply(cas, "CW{PARTOF(FalsePositive)->T3};", settings);
+
+ // TruePositive is added outside of Ruta after the first scripts have been applied
+ String truePositiveName = TruePositive.class.getName();
+ AnnotationFS truePositive = cas
+ .createAnnotation(cas.getTypeSystem().getType(truePositiveName), 0, 4);
+ cas.addFsToIndexes(truePositive);
+
+ Ruta.apply(cas, "CW{PARTOF(TruePositive)-> T4};", settings);
+
+ RutaTestUtils.assertAnnotationsEquals(cas, 1, 1, "Test");
+ RutaTestUtils.assertAnnotationsEquals(cas, 2, 0);
+ RutaTestUtils.assertAnnotationsEquals(cas, 3, 1, "Test");
+ RutaTestUtils.assertAnnotationsEquals(cas, 4, 0);
+
+ }
+
+}
diff --git a/ruta-core/src/test/java/org/apache/uima/ruta/ReindexSkipTypesTest.java b/ruta-core/src/test/java/org/apache/uima/ruta/ReindexSkipTypesTest.java
new file mode 100644
index 0000000..cec3eb4
--- /dev/null
+++ b/ruta-core/src/test/java/org/apache/uima/ruta/ReindexSkipTypesTest.java
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.uima.ruta;
+
+import java.util.LinkedHashMap;
+import java.util.Map;
+
+import org.apache.uima.cas.CAS;
+import org.apache.uima.fit.util.CasUtil;
+import org.apache.uima.ruta.engine.Ruta;
+import org.apache.uima.ruta.engine.RutaEngine;
+import org.apache.uima.ruta.engine.RutaTestUtils;
+import org.apache.uima.ruta.type.CW;
+import org.apache.uima.ruta.type.SW;
+import org.apache.uima.ruta.type.TokenSeed;
+import org.junit.Test;
+
+public class ReindexSkipTypesTest {
+
+ /**
+ * Removes CW and SW annotations from the CAS indexes between two script runs while TokenSeed
+ * is configured as reindex skip type, and checks that PARTOF(CW) and PARTOF(SW) still match
+ * in the second run.
+ */
+ @Test
+ public void test() throws Exception {
+
+ Map<String, Object> settings = new LinkedHashMap<>();
+ settings.put(RutaEngine.PARAM_REINDEX_UPDATE_MODE, ReindexUpdateMode.COMPLETE);
+ settings.put(RutaEngine.PARAM_REINDEX_SKIP_TYPES, new String[] { TokenSeed.class.getName() });
+
+ CAS cas = RutaTestUtils.getCAS("Test this");
+
+ // index and add some annotations based on the internal indexing
+ Ruta.apply(cas, "ANY{PARTOF(CW)->T1};ANY{PARTOF(SW)->T2};", settings);
+
+ // remove CW and SW from the CAS indexes
+ CasUtil.select(cas, cas.getTypeSystem().getType(CW.class.getName()))
+ .forEach(cas::removeFsFromIndexes);
+ CasUtil.select(cas, cas.getTypeSystem().getType(SW.class.getName()))
+ .forEach(cas::removeFsFromIndexes);
+
+ // redo on T1/T2. ANY cannot be used, because it is gone together with CW/SW
+ Ruta.apply(cas, "T1{PARTOF(CW)->T3};T2{PARTOF(SW)->T4};", settings);
+
+ RutaTestUtils.assertAnnotationsEquals(cas, 1, 1, "Test");
+ RutaTestUtils.assertAnnotationsEquals(cas, 2, 1, "this");
+ RutaTestUtils.assertAnnotationsEquals(cas, 3, 1, "Test");
+ RutaTestUtils.assertAnnotationsEquals(cas, 4, 1, "this");
+
+ }
+
+}
diff --git a/ruta-core/src/test/java/org/apache/uima/ruta/ReindexUpdateModeTest.java b/ruta-core/src/test/java/org/apache/uima/ruta/ReindexUpdateModeTest.java
new file mode 100644
index 0000000..86e24e9
--- /dev/null
+++ b/ruta-core/src/test/java/org/apache/uima/ruta/ReindexUpdateModeTest.java
@@ -0,0 +1,429 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.uima.ruta;
+
+import java.util.LinkedHashMap;
+import java.util.Map;
+
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.text.AnnotationFS;
+import org.apache.uima.fit.util.CasUtil;
+import org.apache.uima.ruta.engine.Ruta;
+import org.apache.uima.ruta.engine.RutaEngine;
+import org.apache.uima.ruta.engine.RutaTestUtils;
+import org.junit.Test;
+
+public class ReindexUpdateModeTest {
+
+ /**
+ * Checks that in mode COMPLETE the PARTOF, STARTSWITH and ENDSWITH conditions reflect
+ * annotations that were added to or removed from the CAS indexes outside of Ruta, without
+ * reporting these changes to the internal indexing.
+ */
+ @Test
+ public void testComplete() throws Exception {
+
+ Map<String, Object> params = new LinkedHashMap<>();
+ params.put(RutaEngine.PARAM_REINDEX_UPDATE_MODE, ReindexUpdateMode.COMPLETE);
+
+ CAS cas = RutaTestUtils.getCAS("This is 1 TEST.");
+
+ // initial indexing and create some annotations
+ Ruta.apply(cas, "NUM{-> T1}; CAP{-> T2};", params);
+
+ RutaTestUtils.assertAnnotationsEquals(cas, 1, 1, "1");
+ RutaTestUtils.assertAnnotationsEquals(cas, 2, 1, "TEST");
+
+ // add T3 on CW, remove T2 on CAP
+ AnnotationFS at3 = cas.createAnnotation(RutaTestUtils.getTestType(cas, 3), 0, 4);
+ cas.addFsToIndexes(at3);
+ CasUtil.select(cas, RutaTestUtils.getTestType(cas, 2)).forEach(a -> {
+ cas.removeFsFromIndexes(a);
+ });
+
+ RutaTestUtils.assertAnnotationsEquals(cas, 1, 1, "1");
+ RutaTestUtils.assertAnnotationsEquals(cas, 2, 0);
+ RutaTestUtils.assertAnnotationsEquals(cas, 3, 1, "This");
+
+ // apply rules on modifications, result types: T1x for partof, T2x for startswith, T3x for
+ // endswith
+ Ruta.apply(cas, "ANY{PARTOF(T1)->T11};ANY{PARTOF(T2)->T12};ANY{PARTOF(T3)->T13};", params);
+ Ruta.apply(cas, "ANY{STARTSWITH(T1)->T21};ANY{STARTSWITH(T2)->T22};ANY{STARTSWITH(T3)->T23};",
+ params);
+ Ruta.apply(cas, "ANY{ENDSWITH(T1)->T31};ANY{ENDSWITH(T2)->T32};ANY{ENDSWITH(T3)->T33};",
+ params);
+
+ // partof
+ RutaTestUtils.assertAnnotationsEquals(cas, 11, 1, "1");
+ RutaTestUtils.assertAnnotationsEquals(cas, 12, 0);
+ RutaTestUtils.assertAnnotationsEquals(cas, 13, 1, "This");
+
+ // startswith
+ RutaTestUtils.assertAnnotationsEquals(cas, 21, 1, "1");
+ RutaTestUtils.assertAnnotationsEquals(cas, 22, 0);
+ RutaTestUtils.assertAnnotationsEquals(cas, 23, 1, "This");
+
+ // endswith
+ RutaTestUtils.assertAnnotationsEquals(cas, 31, 1, "1");
+ RutaTestUtils.assertAnnotationsEquals(cas, 32, 0);
+ RutaTestUtils.assertAnnotationsEquals(cas, 33, 1, "This");
+
+ // new round with multiple overlapping annotations
+ Ruta.apply(cas, "PERIOD{-> T4, T4}; SW{-> T5};", params);
+
+ // remove one T4, add another T5
+ AnnotationFS at4 = CasUtil.select(cas, RutaTestUtils.getTestType(cas, 4)).iterator().next();
+ cas.removeFsFromIndexes(at4);
+ AnnotationFS at5 = cas.createAnnotation(RutaTestUtils.getTestType(cas, 5), 5, 7);
+ cas.addFsToIndexes(at5);
+
+ RutaTestUtils.assertAnnotationsEquals(cas, 4, 1, ".");
+ RutaTestUtils.assertAnnotationsEquals(cas, 5, 2, "is", "is");
+
+ // apply rules on modifications, result types: T1x for partof, T2x for startswith, T3x for
+ // endswith
+ Ruta.apply(cas, "ANY{PARTOF(T4)->T14};ANY{PARTOF(T5)->T15};", params);
+ Ruta.apply(cas, "ANY{STARTSWITH(T4)->T24};ANY{STARTSWITH(T5)->T25};", params);
+ Ruta.apply(cas, "ANY{ENDSWITH(T4)->T34};ANY{ENDSWITH(T5)->T35};", params);
+
+ // partof
+ RutaTestUtils.assertAnnotationsEquals(cas, 14, 1, ".");
+ RutaTestUtils.assertAnnotationsEquals(cas, 15, 1, "is");
+
+ // startswith
+ RutaTestUtils.assertAnnotationsEquals(cas, 24, 1, ".");
+ RutaTestUtils.assertAnnotationsEquals(cas, 25, 1, "is");
+
+ // endswith
+ RutaTestUtils.assertAnnotationsEquals(cas, 34, 1, ".");
+ RutaTestUtils.assertAnnotationsEquals(cas, 35, 1, "is");
+
+ // further modify in ruta, remove T4 completely, remove one T5
+ Ruta.apply(cas, "t4:T4{-> UNMARK(t4)};", params);
+ Ruta.apply(cas, "t5:T5{CONTAINS(T5,2,10) -> UNMARK(t5)};", params);
+ // T4 -> Tx6, T5 -> Tx7
+ // params are passed here as well so that this run also uses the update mode under test
+ Ruta.apply(cas, "ANY{PARTOF(T4)->T16};ANY{PARTOF(T5)->T17};", params);
+ Ruta.apply(cas, "ANY{STARTSWITH(T4)->T26};ANY{STARTSWITH(T5)->T27};", params);
+ Ruta.apply(cas, "ANY{ENDSWITH(T4)->T36};ANY{ENDSWITH(T5)->T37};", params);
+
+ // partof
+ RutaTestUtils.assertAnnotationsEquals(cas, 16, 0);
+ RutaTestUtils.assertAnnotationsEquals(cas, 17, 1, "is");
+
+ // startswith
+ RutaTestUtils.assertAnnotationsEquals(cas, 26, 0);
+ RutaTestUtils.assertAnnotationsEquals(cas, 27, 1, "is");
+
+ // endswith
+ RutaTestUtils.assertAnnotationsEquals(cas, 36, 0);
+ RutaTestUtils.assertAnnotationsEquals(cas, 37, 1, "is");
+ }
+
+ /**
+ * Checks mode ADDITIVE: annotations removed outside of Ruta are additionally reported with
+ * RutaBasicUtils.removeAnnotation, whereas annotations added outside of Ruta are not reported
+ * and are expected to be found by the PARTOF, STARTSWITH and ENDSWITH conditions anyway.
+ */
+ @Test
+ public void testAdditiveWithExternalUpdate() throws Exception {
+
+ Map<String, Object> params = new LinkedHashMap<>();
+ params.put(RutaEngine.PARAM_REINDEX_UPDATE_MODE, ReindexUpdateMode.ADDITIVE);
+
+ CAS cas = RutaTestUtils.getCAS("This is 1 TEST.");
+
+ // initial indexing and create some annotations
+ Ruta.apply(cas, "NUM{-> T1}; CAP{-> T2};", params);
+
+ RutaTestUtils.assertAnnotationsEquals(cas, 1, 1, "1");
+ RutaTestUtils.assertAnnotationsEquals(cas, 2, 1, "TEST");
+
+ // add T3 on CW, remove T2 on CAP
+ AnnotationFS at3 = cas.createAnnotation(RutaTestUtils.getTestType(cas, 3), 0, 4);
+ cas.addFsToIndexes(at3);
+ // RutaBasicUtils.addAnnotation(at3) is intentionally not called: the addition is not reported
+ CasUtil.select(cas, RutaTestUtils.getTestType(cas, 2)).forEach(a -> {
+ cas.removeFsFromIndexes(a);
+ RutaBasicUtils.removeAnnotation(a);
+ });
+
+ RutaTestUtils.assertAnnotationsEquals(cas, 1, 1, "1");
+ RutaTestUtils.assertAnnotationsEquals(cas, 2, 0);
+ RutaTestUtils.assertAnnotationsEquals(cas, 3, 1, "This");
+
+ // apply rules on modifications, result types: T1x for partof, T2x for startswith, T3x for
+ // endswith
+ Ruta.apply(cas, "ANY{PARTOF(T1)->T11};ANY{PARTOF(T2)->T12};ANY{PARTOF(T3)->T13};", params);
+ Ruta.apply(cas, "ANY{STARTSWITH(T1)->T21};ANY{STARTSWITH(T2)->T22};ANY{STARTSWITH(T3)->T23};",
+ params);
+ Ruta.apply(cas, "ANY{ENDSWITH(T1)->T31};ANY{ENDSWITH(T2)->T32};ANY{ENDSWITH(T3)->T33};",
+ params);
+
+ // partof
+ RutaTestUtils.assertAnnotationsEquals(cas, 11, 1, "1");
+ RutaTestUtils.assertAnnotationsEquals(cas, 12, 0);
+ RutaTestUtils.assertAnnotationsEquals(cas, 13, 1, "This");
+
+ // startswith
+ RutaTestUtils.assertAnnotationsEquals(cas, 21, 1, "1");
+ RutaTestUtils.assertAnnotationsEquals(cas, 22, 0);
+ RutaTestUtils.assertAnnotationsEquals(cas, 23, 1, "This");
+
+ // endswith
+ RutaTestUtils.assertAnnotationsEquals(cas, 31, 1, "1");
+ RutaTestUtils.assertAnnotationsEquals(cas, 32, 0);
+ RutaTestUtils.assertAnnotationsEquals(cas, 33, 1, "This");
+
+ // new round with multiple overlapping annotations
+ Ruta.apply(cas, "PERIOD{-> T4, T4}; SW{-> T5};", params);
+
+ // remove one T4, add another T5
+ AnnotationFS at4 = CasUtil.select(cas, RutaTestUtils.getTestType(cas, 4)).iterator().next();
+ cas.removeFsFromIndexes(at4);
+ RutaBasicUtils.removeAnnotation(at4);
+ AnnotationFS at5 = cas.createAnnotation(RutaTestUtils.getTestType(cas, 5), 5, 7);
+ cas.addFsToIndexes(at5);
+ // RutaBasicUtils.addAnnotation(at5) is intentionally not called: the addition is not reported
+
+ RutaTestUtils.assertAnnotationsEquals(cas, 4, 1, ".");
+ RutaTestUtils.assertAnnotationsEquals(cas, 5, 2, "is", "is");
+
+ // apply rules on modifications, result types: T1x for partof, T2x for startswith, T3x for
+ // endswith
+ Ruta.apply(cas, "ANY{PARTOF(T4)->T14};ANY{PARTOF(T5)->T15};", params);
+ Ruta.apply(cas, "ANY{STARTSWITH(T4)->T24};ANY{STARTSWITH(T5)->T25};", params);
+ Ruta.apply(cas, "ANY{ENDSWITH(T4)->T34};ANY{ENDSWITH(T5)->T35};", params);
+
+ // partof
+ RutaTestUtils.assertAnnotationsEquals(cas, 14, 1, ".");
+ RutaTestUtils.assertAnnotationsEquals(cas, 15, 1, "is");
+
+ // startswith
+ RutaTestUtils.assertAnnotationsEquals(cas, 24, 1, ".");
+ RutaTestUtils.assertAnnotationsEquals(cas, 25, 1, "is");
+
+ // endswith
+ RutaTestUtils.assertAnnotationsEquals(cas, 34, 1, ".");
+ RutaTestUtils.assertAnnotationsEquals(cas, 35, 1, "is");
+
+ // further modify in ruta, remove T4 completely, remove one T5
+ Ruta.apply(cas, "t4:T4{-> UNMARK(t4)};", params);
+ Ruta.apply(cas, "t5:T5{CONTAINS(T5,2,10) -> UNMARK(t5)};", params);
+ // T4 -> Tx6, T5 -> Tx7
+ // params are passed here as well so that this run also uses the update mode under test
+ Ruta.apply(cas, "ANY{PARTOF(T4)->T16};ANY{PARTOF(T5)->T17};", params);
+ Ruta.apply(cas, "ANY{STARTSWITH(T4)->T26};ANY{STARTSWITH(T5)->T27};", params);
+ Ruta.apply(cas, "ANY{ENDSWITH(T4)->T36};ANY{ENDSWITH(T5)->T37};", params);
+
+ // partof
+ RutaTestUtils.assertAnnotationsEquals(cas, 16, 0);
+ RutaTestUtils.assertAnnotationsEquals(cas, 17, 1, "is");
+
+ // startswith
+ RutaTestUtils.assertAnnotationsEquals(cas, 26, 0);
+ RutaTestUtils.assertAnnotationsEquals(cas, 27, 1, "is");
+
+ // endswith
+ RutaTestUtils.assertAnnotationsEquals(cas, 36, 0);
+ RutaTestUtils.assertAnnotationsEquals(cas, 37, 1, "is");
+ }
+
+ /**
+ * Checks that in mode SAFE_ADDITIVE the PARTOF, STARTSWITH and ENDSWITH conditions reflect
+ * annotations that were added to or removed from the CAS indexes outside of Ruta, without
+ * reporting these changes to the internal indexing.
+ */
+ @Test
+ public void testSafeAdditive() throws Exception {
+
+ Map<String, Object> params = new LinkedHashMap<>();
+ params.put(RutaEngine.PARAM_REINDEX_UPDATE_MODE, ReindexUpdateMode.SAFE_ADDITIVE);
+
+ CAS cas = RutaTestUtils.getCAS("This is 1 TEST.");
+
+ // initial indexing and create some annotations
+ Ruta.apply(cas, "NUM{-> T1}; CAP{-> T2};", params);
+
+ RutaTestUtils.assertAnnotationsEquals(cas, 1, 1, "1");
+ RutaTestUtils.assertAnnotationsEquals(cas, 2, 1, "TEST");
+
+ // add T3 on CW, remove T2 on CAP
+ AnnotationFS at3 = cas.createAnnotation(RutaTestUtils.getTestType(cas, 3), 0, 4);
+ cas.addFsToIndexes(at3);
+ CasUtil.select(cas, RutaTestUtils.getTestType(cas, 2)).forEach(a -> {
+ cas.removeFsFromIndexes(a);
+ });
+
+ RutaTestUtils.assertAnnotationsEquals(cas, 1, 1, "1");
+ RutaTestUtils.assertAnnotationsEquals(cas, 2, 0);
+ RutaTestUtils.assertAnnotationsEquals(cas, 3, 1, "This");
+
+ // apply rules on modifications, result types: T1x for partof, T2x for startswith, T3x for
+ // endswith
+ Ruta.apply(cas, "ANY{PARTOF(T1)->T11};ANY{PARTOF(T2)->T12};ANY{PARTOF(T3)->T13};", params);
+ Ruta.apply(cas, "ANY{STARTSWITH(T1)->T21};ANY{STARTSWITH(T2)->T22};ANY{STARTSWITH(T3)->T23};",
+ params);
+ Ruta.apply(cas, "ANY{ENDSWITH(T1)->T31};ANY{ENDSWITH(T2)->T32};ANY{ENDSWITH(T3)->T33};",
+ params);
+
+ // partof
+ RutaTestUtils.assertAnnotationsEquals(cas, 11, 1, "1");
+ RutaTestUtils.assertAnnotationsEquals(cas, 12, 0);
+ RutaTestUtils.assertAnnotationsEquals(cas, 13, 1, "This");
+
+ // startswith
+ RutaTestUtils.assertAnnotationsEquals(cas, 21, 1, "1");
+ RutaTestUtils.assertAnnotationsEquals(cas, 22, 0);
+ RutaTestUtils.assertAnnotationsEquals(cas, 23, 1, "This");
+
+ // endswith
+ RutaTestUtils.assertAnnotationsEquals(cas, 31, 1, "1");
+ RutaTestUtils.assertAnnotationsEquals(cas, 32, 0);
+ RutaTestUtils.assertAnnotationsEquals(cas, 33, 1, "This");
+
+ // new round with multiple overlapping annotations
+ Ruta.apply(cas, "PERIOD{-> T4, T4}; SW{-> T5};", params);
+
+ // remove one T4, add another T5
+ AnnotationFS at4 = CasUtil.select(cas, RutaTestUtils.getTestType(cas, 4)).iterator().next();
+ cas.removeFsFromIndexes(at4);
+ AnnotationFS at5 = cas.createAnnotation(RutaTestUtils.getTestType(cas, 5), 5, 7);
+ cas.addFsToIndexes(at5);
+
+ RutaTestUtils.assertAnnotationsEquals(cas, 4, 1, ".");
+ RutaTestUtils.assertAnnotationsEquals(cas, 5, 2, "is", "is");
+
+ // apply rules on modifications, result types: T1x for partof, T2x for startswith, T3x for
+ // endswith
+ Ruta.apply(cas, "ANY{PARTOF(T4)->T14};ANY{PARTOF(T5)->T15};", params);
+ Ruta.apply(cas, "ANY{STARTSWITH(T4)->T24};ANY{STARTSWITH(T5)->T25};", params);
+ Ruta.apply(cas, "ANY{ENDSWITH(T4)->T34};ANY{ENDSWITH(T5)->T35};", params);
+
+ // partof
+ RutaTestUtils.assertAnnotationsEquals(cas, 14, 1, ".");
+ RutaTestUtils.assertAnnotationsEquals(cas, 15, 1, "is");
+
+ // startswith
+ RutaTestUtils.assertAnnotationsEquals(cas, 24, 1, ".");
+ RutaTestUtils.assertAnnotationsEquals(cas, 25, 1, "is");
+
+ // endswith
+ RutaTestUtils.assertAnnotationsEquals(cas, 34, 1, ".");
+ RutaTestUtils.assertAnnotationsEquals(cas, 35, 1, "is");
+
+ // further modify in ruta, remove T4 completely, remove one T5
+ Ruta.apply(cas, "t4:T4{-> UNMARK(t4)};", params);
+ Ruta.apply(cas, "t5:T5{CONTAINS(T5,2,10) -> UNMARK(t5)};", params);
+ // T4 -> Tx6, T5 -> Tx7
+ // params are passed here as well so that this run also uses the update mode under test
+ Ruta.apply(cas, "ANY{PARTOF(T4)->T16};ANY{PARTOF(T5)->T17};", params);
+ Ruta.apply(cas, "ANY{STARTSWITH(T4)->T26};ANY{STARTSWITH(T5)->T27};", params);
+ Ruta.apply(cas, "ANY{ENDSWITH(T4)->T36};ANY{ENDSWITH(T5)->T37};", params);
+
+ // partof
+ RutaTestUtils.assertAnnotationsEquals(cas, 16, 0);
+ RutaTestUtils.assertAnnotationsEquals(cas, 17, 1, "is");
+
+ // startswith
+ RutaTestUtils.assertAnnotationsEquals(cas, 26, 0);
+ RutaTestUtils.assertAnnotationsEquals(cas, 27, 1, "is");
+
+ // endswith
+ RutaTestUtils.assertAnnotationsEquals(cas, 36, 0);
+ RutaTestUtils.assertAnnotationsEquals(cas, 37, 1, "is");
+ }
+
+ /**
+ * Checks mode NONE: every annotation added or removed outside of Ruta is additionally reported
+ * with RutaBasicUtils.addAnnotation or RutaBasicUtils.removeAnnotation, and the PARTOF,
+ * STARTSWITH and ENDSWITH conditions are expected to reflect these changes.
+ */
+ @Test
+ public void testNoneWithExternalUpdate() throws Exception {
+
+ Map<String, Object> params = new LinkedHashMap<>();
+ params.put(RutaEngine.PARAM_REINDEX_UPDATE_MODE, ReindexUpdateMode.NONE);
+
+ CAS cas = RutaTestUtils.getCAS("This is 1 TEST.");
+
+ // initial indexing and create some annotations
+ Ruta.apply(cas, "NUM{-> T1}; CAP{-> T2};", params);
+
+ RutaTestUtils.assertAnnotationsEquals(cas, 1, 1, "1");
+ RutaTestUtils.assertAnnotationsEquals(cas, 2, 1, "TEST");
+
+ // add T3 on CW, remove T2 on CAP
+ AnnotationFS at3 = cas.createAnnotation(RutaTestUtils.getTestType(cas, 3), 0, 4);
+ cas.addFsToIndexes(at3);
+ RutaBasicUtils.addAnnotation(at3);
+ CasUtil.select(cas, RutaTestUtils.getTestType(cas, 2)).forEach(a -> {
+ cas.removeFsFromIndexes(a);
+ RutaBasicUtils.removeAnnotation(a);
+ });
+
+ RutaTestUtils.assertAnnotationsEquals(cas, 1, 1, "1");
+ RutaTestUtils.assertAnnotationsEquals(cas, 2, 0);
+ RutaTestUtils.assertAnnotationsEquals(cas, 3, 1, "This");
+
+ // apply rules on modifications, result types: T1x for partof, T2x for startswith, T3x for
+ // endswith
+ Ruta.apply(cas, "ANY{PARTOF(T1)->T11};ANY{PARTOF(T2)->T12};ANY{PARTOF(T3)->T13};", params);
+ Ruta.apply(cas, "ANY{STARTSWITH(T1)->T21};ANY{STARTSWITH(T2)->T22};ANY{STARTSWITH(T3)->T23};",
+ params);
+ Ruta.apply(cas, "ANY{ENDSWITH(T1)->T31};ANY{ENDSWITH(T2)->T32};ANY{ENDSWITH(T3)->T33};",
+ params);
+
+ // partof
+ RutaTestUtils.assertAnnotationsEquals(cas, 11, 1, "1");
+ RutaTestUtils.assertAnnotationsEquals(cas, 12, 0);
+ RutaTestUtils.assertAnnotationsEquals(cas, 13, 1, "This");
+
+ // startswith
+ RutaTestUtils.assertAnnotationsEquals(cas, 21, 1, "1");
+ RutaTestUtils.assertAnnotationsEquals(cas, 22, 0);
+ RutaTestUtils.assertAnnotationsEquals(cas, 23, 1, "This");
+
+ // endswith
+ RutaTestUtils.assertAnnotationsEquals(cas, 31, 1, "1");
+ RutaTestUtils.assertAnnotationsEquals(cas, 32, 0);
+ RutaTestUtils.assertAnnotationsEquals(cas, 33, 1, "This");
+
+ // new round with multiple overlapping annotations
+ Ruta.apply(cas, "PERIOD{-> T4, T4}; SW{-> T5};", params);
+
+ // remove one T4, add another T5
+ AnnotationFS at4 = CasUtil.select(cas, RutaTestUtils.getTestType(cas, 4)).iterator().next();
+ cas.removeFsFromIndexes(at4);
+ RutaBasicUtils.removeAnnotation(at4);
+ AnnotationFS at5 = cas.createAnnotation(RutaTestUtils.getTestType(cas, 5), 5, 7);
+ cas.addFsToIndexes(at5);
+ RutaBasicUtils.addAnnotation(at5);
+
+ RutaTestUtils.assertAnnotationsEquals(cas, 4, 1, ".");
+ RutaTestUtils.assertAnnotationsEquals(cas, 5, 2, "is", "is");
+
+ // apply rules on modifications, result types: T1x for partof, T2x for startswith, T3x for
+ // endswith
+ Ruta.apply(cas, "ANY{PARTOF(T4)->T14};ANY{PARTOF(T5)->T15};", params);
+ Ruta.apply(cas, "ANY{STARTSWITH(T4)->T24};ANY{STARTSWITH(T5)->T25};", params);
+ Ruta.apply(cas, "ANY{ENDSWITH(T4)->T34};ANY{ENDSWITH(T5)->T35};", params);
+
+ // partof
+ RutaTestUtils.assertAnnotationsEquals(cas, 14, 1, ".");
+ RutaTestUtils.assertAnnotationsEquals(cas, 15, 1, "is");
+
+ // startswith
+ RutaTestUtils.assertAnnotationsEquals(cas, 24, 1, ".");
+ RutaTestUtils.assertAnnotationsEquals(cas, 25, 1, "is");
+
+ // endswith
+ RutaTestUtils.assertAnnotationsEquals(cas, 34, 1, ".");
+ RutaTestUtils.assertAnnotationsEquals(cas, 35, 1, "is");
+
+ // further modify in ruta, remove T4 completely, remove one T5
+ Ruta.apply(cas, "t4:T4{-> UNMARK(t4)};", params);
+ Ruta.apply(cas, "t5:T5{CONTAINS(T5,2,10) -> UNMARK(t5)};", params);
+ // T4 -> Tx6, T5 -> Tx7
+ // params are passed here as well; without them this run would use the default mode ADDITIVE
+ Ruta.apply(cas, "ANY{PARTOF(T4)->T16};ANY{PARTOF(T5)->T17};", params);
+ Ruta.apply(cas, "ANY{STARTSWITH(T4)->T26};ANY{STARTSWITH(T5)->T27};", params);
+ Ruta.apply(cas, "ANY{ENDSWITH(T4)->T36};ANY{ENDSWITH(T5)->T37};", params);
+
+ // partof
+ RutaTestUtils.assertAnnotationsEquals(cas, 16, 0);
+ RutaTestUtils.assertAnnotationsEquals(cas, 17, 1, "is");
+
+ // startswith
+ RutaTestUtils.assertAnnotationsEquals(cas, 26, 0);
+ RutaTestUtils.assertAnnotationsEquals(cas, 27, 1, "is");
+
+ // endswith
+ RutaTestUtils.assertAnnotationsEquals(cas, 36, 0);
+ RutaTestUtils.assertAnnotationsEquals(cas, 37, 1, "is");
+ }
+}
diff --git a/ruta-core/src/test/java/org/apache/uima/ruta/ZeroLengthAnnotationTest.java b/ruta-core/src/test/java/org/apache/uima/ruta/ZeroLengthAnnotationTest.java
index 849352e..df3bd27 100644
--- a/ruta-core/src/test/java/org/apache/uima/ruta/ZeroLengthAnnotationTest.java
+++ b/ruta-core/src/test/java/org/apache/uima/ruta/ZeroLengthAnnotationTest.java
@@ -28,50 +28,35 @@
public class ZeroLengthAnnotationTest {
-
@Test
- public void testMatchType() {
+ public void testMatchType() throws Exception {
String document = "Some text.";
String script = "";
script += "W W{-> T1};";
- CAS cas = null;
- try {
- cas = RutaTestUtils.getCAS(document);
- Type type = cas.getTypeSystem().getType("org.apache.uima.ruta.type.W");
- // call for seeding
- Ruta.apply(cas, "");
- addZeroLengthAnnotations(cas, type);
- Ruta.apply(cas, script);
- } catch (Exception e) {
- e.printStackTrace();
- }
+ CAS cas = RutaTestUtils.getCAS(document);
+ Type type = cas.getTypeSystem().getType("org.apache.uima.ruta.type.W");
+ // call for seeding
+ Ruta.apply(cas, "");
+ addZeroLengthAnnotations(cas, type);
+ Ruta.apply(cas, script);
RutaTestUtils.assertAnnotationsEquals(cas, 1, 1, "text");
-
- cas.release();
}
@Test
- public void testOtherType() {
+ public void testOtherType() throws Exception {
String document = "Some text.";
String script = "";
script += "W W{-> T1};";
- CAS cas = null;
- try {
- cas = RutaTestUtils.getCAS(document);
- Type type = cas.getTypeSystem().getType("org.apache.uima.ruta.type.TruePositive");
- addZeroLengthAnnotations(cas, type);
-
- Ruta.apply(cas, script);
- } catch (Exception e) {
- e.printStackTrace();
- }
+ CAS cas = RutaTestUtils.getCAS(document);
+ Type type = cas.getTypeSystem().getType("org.apache.uima.ruta.type.TruePositive");
+ addZeroLengthAnnotations(cas, type);
+
+ Ruta.apply(cas, script);
RutaTestUtils.assertAnnotationsEquals(cas, 1, 1, "text");
-
- cas.release();
}
-
+
private void addZeroLengthAnnotations(CAS cas, Type type) {
AnnotationFS a0 = cas.createAnnotation(type, 0, 0);
AnnotationFS a5 = cas.createAnnotation(type, 5, 5);
@@ -81,5 +66,5 @@
cas.addFsToIndexes(a5);
cas.addFsToIndexes(a10);
}
-
+
}
diff --git a/ruta-core/src/test/java/org/apache/uima/ruta/resource/TreeWordListTest.java b/ruta-core/src/test/java/org/apache/uima/ruta/resource/TreeWordListTest.java
index 2669e11..fa701f0 100644
--- a/ruta-core/src/test/java/org/apache/uima/ruta/resource/TreeWordListTest.java
+++ b/ruta-core/src/test/java/org/apache/uima/ruta/resource/TreeWordListTest.java
@@ -30,6 +30,8 @@
import org.apache.uima.fit.factory.JCasFactory;
import org.apache.uima.jcas.JCas;
import org.apache.uima.ruta.FilterManager;
+import org.apache.uima.ruta.ReindexUpdateMode;
+import org.apache.uima.ruta.RutaIndexingConfiguration;
import org.apache.uima.ruta.RutaStream;
import org.apache.uima.ruta.engine.Ruta;
import org.apache.uima.ruta.engine.RutaTestUtils;
@@ -89,7 +91,12 @@
seeder.seed(text, cas);
InferenceCrowd crowd = new InferenceCrowd(new ArrayList<>());
RutaStream stream = new RutaStream(cas, basicType, filter, false, false, true, null, crowd);
- stream.initalizeBasics(new String[] { CAS.TYPE_NAME_ANNOTATION }, false);
+
+ RutaIndexingConfiguration config = new RutaIndexingConfiguration();
+ config.setIndexOnly(new String[] { CAS.TYPE_NAME_ANNOTATION });
+ config.setReindexOnly(new String[] { CAS.TYPE_NAME_ANNOTATION });
+ config.setReindexUpdateMode(ReindexUpdateMode.ADDITIVE);
+ stream.initalizeBasics(config);
return stream;
}
diff --git a/ruta-docbook/src/docbook/tools.ruta.language.internal_indexing.xml b/ruta-docbook/src/docbook/tools.ruta.language.internal_indexing.xml
new file mode 100644
index 0000000..74a7ebb
--- /dev/null
+++ b/ruta-docbook/src/docbook/tools.ruta.language.internal_indexing.xml
@@ -0,0 +1,149 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE section PUBLIC "-//OASIS//DTD DocBook XML V4.4//EN"
+"http://www.oasis-open.org/docbook/xml/4.4/docbookx.dtd"[
+<!ENTITY imgroot "images/tools/ruta/language/" >
+<!ENTITY % uimaents SYSTEM "../../target/docbook-shared/entities.ent" >
+%uimaents;
+]>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
+ license agreements. See the NOTICE file distributed with this work for additional
+ information regarding copyright ownership. The ASF licenses this file to
+ you under the Apache License, Version 2.0 (the "License"); you may not use
+ this file except in compliance with the License. You may obtain a copy of
+ the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
+ by applicable law or agreed to in writing, software distributed under the
+ License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+ OF ANY KIND, either express or implied. See the License for the specific
+ language governing permissions and limitations under the License. -->
+
+<section id="ugr.tools.ruta.language.internal_indxexing">
+ <title>Internal indexing and reindexing</title>
+ <para>
+ UIMA Ruta, or to be more precise the main analysis engine RutaEngine, creates,
+ stores and updates additional indexing information directly in the CAS.
+ This indexing is not related to the annotation indexes of UIMA itself.
+ The internal indexing provides additional information, which is only utilized
+ by the Ruta rules. This section provides an overview of why and how it is included in
+ UIMA Ruta, and of how Ruta can be configured in order to optimize its performance.
+ </para>
+ <section id="ugr.tools.ruta.language.internal_indxexing.why">
+ <title>Why additional indexing?</title>
+ <para>
+ The internal indexing is utilized for many different parts of functionality within Ruta.
+ The need for the indexing is motivated by two main and important features.
+ </para>
+ <para>
+ Ruta provides different language elements, for example conditions, which are fulfilled
+ depending on some investigation of the CAS annotation indexes. There are several
+ conditions like PARTOF which require many index operations in the worst case. Here, potentially
+ the complete index needs to be iterated in order to validate if a specific annotation
+ is part of another annotation of a specific type. And this check needs to be performed
+ for each considered annotation and for each rule match and for each rule where a PARTOF
+ condition is used. Without additional internal indexing Ruta would be too slow to
+ actually be useful. With this feature, it is just a fast lookup. This situation applies also for many other language elements and
+ conditions like STARTSWITH and ENDSWITH.
+ </para>
+ <para>
+ A second necessity is the coverage-based visibility concept of Ruta.
+ Annotations and any text spans are invisible if their begin or end is covered by some
+ invisible annotation, i.e., an annotation of a type that is configured to be invisible.
+ This is a powerful feature that enables many different engineering approaches and makes
+ rules also more maintainable. For a (reasonably fast) implementation of this feature,
+ it is necessary to know for each position if it is covered by annotations of specific types.
+ </para>
+ <para>
+ The internal indexing comes, however, with some costs. The indexing requires time and memory.
+ The information needs to be collected and/or updated for every Ruta script (RutaEngine)
+ in a pipeline. This may require many operations if many annotations are available.
+ Furthermore, the storage of this information at potentially all text positions
+ requires a lot of memory. Nevertheless, the advantages outweigh the disadvantages considerably.
+ </para>
+ </section>
+ <section id="ugr.tools.ruta.language.internal_indxexing.how">
+ <title>How is it stored, created and updated?</title>
+ <para>
+ The internal indexing refers to three types of information that is additionally stored:
+ </para>
+ <orderedlist numeration="arabic">
+ <listitem>
+ <para>
+ All annotations of all relevant types that begin at a position.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ All annotations of all relevant types that end at a position.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ All types of annotations that cover a position.
+ </para>
+ </listitem>
+ </orderedlist>
+ <para>
+ The information is stored in additional annotations of the type RutaBasic,
+ which provides by implementation, and not by features, additional fields for
+ these three kinds of information. RutaBasics provide a complete disjoint
+ partitioning of the document. They begin and end at every position where an
+ annotation starts or ends. This also includes, for example, one RutaBasic for each
+ SPACE annotation, registering which annotations start and end at these offsets.
+ They are automatically created and also extended if new smaller annotations are added.
+ Their initial creation is called <quote>indexing</quote> and their updating
+ if RutaBasics are available, but other Java analysis engines potentially added or
+ removed annotations, is called <quote>reindexing</quote>.
+ </para>
+ <para>
+ There are several configuration
+ parameters (see parameters with INDEX and REINDEX in their name) that can influence what types and annotations are indexed and reindexed.
+ In the default configuration, all annotations are indexed, but only new annotations
+ are reindexed (ReindexUpdateMode ADDITIVE). This means that if an analysis engine in between
+ two RutaEngines removes some annotations, the second RutaEngine will not be up to date.
+ A rule which relies on the internal indexing will match differently for these annotations,
+ e.g., a PARTOF condition is still fulfilled although the annotation is not present in the
+ UIMA indexes anymore. This problem can be avoided (if necessary) either by switching to the more costly
+ ReindexUpdateMode COMPLETE, or by updating the internal indexing directly in the Java analysis
+ engine by using the class RutaBasicUtils.
+ </para>
+ </section>
+ <section id="ugr.tools.ruta.language.internal_indxexing.optimize">
+ <title>How to optimize the performance?</title>
+ <para>
+ There are many different options and possibilities to optimize the runtime performance and
+ memory footprint of Ruta scripts by configuring the RutaEngine. The most useful configuration,
+ however, depends on the actual situation: How much information is available about the pipeline
+ and the types of annotations and their update operations? In the following, a selection
+ of optimizations is discussed.
+ </para>
+ <para>
+ If there is a RutaEngine in a pipeline, and either the previous analysis engine was also
+ a RutaEngine or it is known that the analysis engines before (until the last RutaEngine) did not
+ modify any (relevant) annotations, then the ReindexUpdateMode NONE can be applied, which simply
+ skips the internal reindexing. This can improve the runtime performance.
+ </para>
+ <para>
+ The configuration parameter indexOnly can be restricted to relevant types.
+ The parameter indexSkipTypes can be utilized to specify types of annotations that are not relevant.
+ These types can include more technical annotations for metadata, logging or debug information.
+ Thus, the set of types that need to be considered for internal indexing can be restricted, which
+ makes the indexing faster and requires less memory.
+ </para>
+ <para>For a reindexing/updating step the corresponding reindex parameters need to be considered.
+ Even relevant annotations do not need to be reindexed/updated all the time.
+ The updating can, for example, be restricted to
+ types that have been potentially modified by previous Java analysis engines according to their capabilities.
+ Additionally, some types are rather final considering their offsets. They are only created once
+ and are not modified by later analysis engines. These types commonly include
+ Tokens and similar annotations. They do not need to be reindexed, which can be configured using the
+ reindexSkipTypes parameter.</para>
+ <para>
+ An extension to this is the parameter indexOnlyMentionedTypes/reindexOnlyMentionedTypes.
+ Here, the relevant types are collected using the
+ actual script: the types that are actually used in the rules and thus their internal indexing needs
+ to be up to date. This mainly can increase the indexing speed. This feature is illustrated with an example:
+ Considering a larger pipeline with many annotations of different types, and also with many
+ modifications since the last RutaEngine, a script with one rule does not require much reindexing,
+ only the types that are used in this rule.
+ </para>
+ </section>
+</section>
\ No newline at end of file
diff --git a/ruta-docbook/src/docbook/tools.ruta.language.xml b/ruta-docbook/src/docbook/tools.ruta.language.xml
index f92fd28..1fa0d5c 100644
--- a/ruta-docbook/src/docbook/tools.ruta.language.xml
+++ b/ruta-docbook/src/docbook/tools.ruta.language.xml
@@ -1361,4 +1361,7 @@
</para>
</section>
</section>
+
+ <xi:include xmlns:xi="http://www.w3.org/2001/XInclude" href="tools.ruta.language.internal_indexing.xml" />
+
</chapter>
diff --git a/ruta-docbook/src/docbook/tools.ruta.overview.xml b/ruta-docbook/src/docbook/tools.ruta.overview.xml
index 56a12f2..3107a97 100644
--- a/ruta-docbook/src/docbook/tools.ruta.overview.xml
+++ b/ruta-docbook/src/docbook/tools.ruta.overview.xml
@@ -855,19 +855,19 @@
</row>
<row>
<entry>
- <link linkend='ugr.tools.ruta.ae.basic.parameter.reindexOnly'>reindexOnly</link>
+ <link linkend='ugr.tools.ruta.ae.basic.parameter.indexOnly'>indexOnly</link>
</entry>
- <entry>Option to select annotation types that should be reindex internally in ruta.
+ <entry>Option to select annotation types that should be indexed internally in ruta.
</entry>
<entry>Multi String</entry>
</row>
<row>
<entry>
- <link linkend='ugr.tools.ruta.ae.basic.parameter.reindexOnlyMentionedTypes'>reindexOnlyMentionedTypes</link>
+ <link linkend='ugr.tools.ruta.ae.basic.parameter.indexSkipTypes'>indexSkipTypes</link>
</entry>
- <entry>Option to reindex only mentioned types internally in ruta.
+ <entry>Option to skip annotation types in the internal indexing.
</entry>
- <entry>Single Boolean</entry>
+ <entry>Multi String</entry>
</row>
<row>
<entry>
@@ -881,12 +881,52 @@
<entry>
<link linkend='ugr.tools.ruta.ae.basic.parameter.indexAdditionally'>indexAdditionally</link>
</entry>
- <entry>Option to index types additional to the mentioned ones internally in ruta.
+ <entry>Option to index types additionally to the mentioned ones internally in ruta.
</entry>
<entry>Multi String</entry>
</row>
<row>
<entry>
+ <link linkend='ugr.tools.ruta.ae.basic.parameter.reindexOnly'>reindexOnly</link>
+ </entry>
+ <entry>Option to select annotation types that should be reindexed internally in ruta.
+ </entry>
+ <entry>Multi String</entry>
+ </row>
+ <row>
+ <entry>
+ <link linkend='ugr.tools.ruta.ae.basic.parameter.reindexSkipTypes'>reindexSkipTypes</link>
+ </entry>
+ <entry>Option to skip annotation types in the internal reindexing.
+ </entry>
+ <entry>Multi String</entry>
+ </row>
+ <row>
+ <entry>
+ <link linkend='ugr.tools.ruta.ae.basic.parameter.reindexOnlyMentionedTypes'>reindexOnlyMentionedTypes</link>
+ </entry>
+ <entry>Option to reindex only mentioned types internally in ruta.
+ </entry>
+ <entry>Single Boolean</entry>
+ </row>
+ <row>
+ <entry>
+ <link linkend='ugr.tools.ruta.ae.basic.parameter.reindexAdditionally'>reindexAdditionally</link>
+ </entry>
+ <entry>Option to reindex types additionally to the mentioned ones internally in ruta.
+ </entry>
+ <entry>Multi String</entry>
+ </row>
+ <row>
+ <entry>
+ <link linkend='ugr.tools.ruta.ae.basic.parameter.indexUpdateMode'>indexUpdateMode</link>
+ </entry>
+ <entry>Mode how internal indexing should be applied.
+ </entry>
+ <entry>Single String</entry>
+ </row>
+ <row>
+ <entry>
<link linkend='ugr.tools.ruta.ae.basic.parameter.emptyIsInvisible'>emptyIsInvisible</link>
</entry>
<entry>Option to define empty text positions as invisible.
@@ -1167,22 +1207,21 @@
The default value is set to false.
</para>
</section>
- <section id="ugr.tools.ruta.ae.basic.parameter.reindexOnly">
- <title>reindexOnly</title>
+ <section id="ugr.tools.ruta.ae.basic.parameter.indexOnly">
+ <title>indexOnly</title>
<para>
- This parameter specifies the annotation types which should be reindex for ruta's internal annotations
- All annotation types that changed since the last call of a ruta script need to be listed here.
- The value of this parameter needs only be adapted for performance optimization in pipelines that
- contains several ruta analysis engines.
- Default value is uima.tcas.Annotation
+ This parameter specifies the annotation types which should be indexed for ruta's internal
+ annotations. All annotation types that are relevant need to be listed here. The value of this
+ parameter only needs to be adapted for performance and memory optimization in pipelines that
+ contain several ruta analysis engines. Default value is uima.tcas.Annotation.
</para>
</section>
- <section id="ugr.tools.ruta.ae.basic.parameter.reindexOnlyMentionedTypes">
- <title>reindexOnlyMentionedTypes</title>
+ <section id="ugr.tools.ruta.ae.basic.parameter.indexSkipTypes">
+ <title>indexSkipTypes</title>
<para>
- If this parameter is activated, then only annotations of types are internally reindexed at
- beginning that are mentioned with in the rules. This parameter overrides the values of the parameter
- 'reindexOnly' with the types that are mentioned in the rules. Default value is false.
+ This parameter specifies annotation types that should not be indexed at all. These types
+ normally include annotations that provide no meaningful semantics for text processing, e.g.,
+ types concerning ruta debug information.
</para>
</section>
<section id="ugr.tools.ruta.ae.basic.parameter.indexOnlyMentionedTypes">
@@ -1198,11 +1237,54 @@
<section id="ugr.tools.ruta.ae.basic.parameter.indexAdditionally">
<title>indexAdditionally</title>
<para>
- This parameter specifies annotation types (resolvable mentions are also supported) that should
+ This parameter specifies annotation types that should
be index additionally to types mentioned in the rules. This parameter is only used if the
parameter 'indexOnlyMentionedTypes' is activated.
</para>
</section>
+ <section id="ugr.tools.ruta.ae.basic.parameter.reindexOnly">
+ <title>reindexOnly</title>
+ <para>
+ This parameter specifies the annotation types which should be reindexed for ruta's internal annotations.
+ All annotation types that changed since the last call of a ruta script need to be listed here.
+ The value of this parameter only needs to be adapted for performance optimization in pipelines that
+ contain several ruta analysis engines.
+ Default value is uima.tcas.Annotation.
+ </para>
+ </section>
+ <section id="ugr.tools.ruta.ae.basic.parameter.reindexSkipTypes">
+ <title>reindexSkipTypes</title>
+ <para>
+ This parameter specifies annotation types that should not be reindexed. These types normally
+ include annotations that are added once and are not changed in the following pipeline, e.g.,
+ Tokens or TokenSeed (like CW).
+ </para>
+ </section>
+ <section id="ugr.tools.ruta.ae.basic.parameter.reindexOnlyMentionedTypes">
+ <title>reindexOnlyMentionedTypes</title>
+ <para>
+ If this parameter is activated, then only annotations of types that are mentioned within the
+ rules are internally reindexed at the beginning. This parameter overrides the values of the parameter
+ 'reindexOnly' with the types that are mentioned in the rules. Default value is false.
+ </para>
+ </section>
+ <section id="ugr.tools.ruta.ae.basic.parameter.reindexAdditionally">
+ <title>reindexAdditionally</title>
+ <para>
+ This parameter specifies annotation types that should be reindexed additionally to types
+ mentioned in the rules. This parameter is only used if the parameter
+ 'reindexOnlyMentionedTypes' is activated.
+ </para>
+ </section>
+ <section id="ugr.tools.ruta.ae.basic.parameter.indexUpdateMode">
+ <title>indexUpdateMode</title>
+ <para>
+ This parameter specifies the mode for updating the internal indexing in RutaBasic annotations.
+ This is a technical parameter for optimizing the runtime performance/speed of RutaEngines.
+ Available modes are: COMPLETE, ADDITIVE, SAFE_ADDITIVE, NONE.
+ Default value is ADDITIVE.
+ </para>
+ </section>
<section id="ugr.tools.ruta.ae.basic.parameter.emptyIsInvisible">
<title>emptyIsInvisible</title>
<para>