Merge pull request #18 from apache/UIMA-6271-validate-internal-ruta-indexing

UIMA-6271: Ruta: option to validate internal indexing in RutaEngine
diff --git a/ruta-core/pom.xml b/ruta-core/pom.xml
index 76072d2..cc4e9b9 100644
--- a/ruta-core/pom.xml
+++ b/ruta-core/pom.xml
@@ -150,6 +150,24 @@
       <artifactId>junit</artifactId>

       <scope>test</scope>

     </dependency>

+    

+    <dependency>

+      <groupId>org.apache.uima</groupId>

+      <artifactId>uimafit-junit</artifactId>

+      <version>${uimafit-version}</version>

+      <scope>test</scope>

+      <!-- Exclude aop stuff, which is not needed by uimafit and only introduces a non-asl license -->

+      <exclusions>

+        <exclusion>

+          <groupId>org.springframework</groupId>

+          <artifactId>spring-aop</artifactId>

+        </exclusion>

+        <exclusion>

+          <groupId>aopalliance</groupId>

+          <artifactId>aopalliance</artifactId>

+        </exclusion>

+      </exclusions>

+    </dependency>

 

     <dependency>

       <groupId>org.slf4j</groupId>

diff --git a/ruta-core/src/main/java/org/apache/uima/ruta/RutaBasicUtils.java b/ruta-core/src/main/java/org/apache/uima/ruta/RutaBasicUtils.java
index 5eb2841..f371d49 100644
--- a/ruta-core/src/main/java/org/apache/uima/ruta/RutaBasicUtils.java
+++ b/ruta-core/src/main/java/org/apache/uima/ruta/RutaBasicUtils.java
@@ -18,14 +18,22 @@
  */
 package org.apache.uima.ruta;
 
+import java.util.Collection;
+import java.util.LinkedHashMap;
 import java.util.List;
+import java.util.Map;
 
+import org.apache.commons.lang3.StringUtils;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
 import org.apache.uima.cas.CAS;
 import org.apache.uima.cas.Type;
 import org.apache.uima.cas.TypeSystem;
 import org.apache.uima.cas.text.AnnotationFS;
 import org.apache.uima.cas.text.AnnotationIndex;
 import org.apache.uima.fit.util.CasUtil;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
 import org.apache.uima.ruta.type.RutaBasic;
 
 /**
@@ -129,4 +137,112 @@
     return true;
   }
 
+  /**
+   * Validates the internal indexing, i.e. the information stored in the RutaBasics, and throws
+   * an exception if an invalid state is discovered.
+   * 
+   * @param jcas
+   *          the JCas that should be validated
+   * @param ignoreTypeNames
+   *          the names of types that should not be validated
+   * @throws AnalysisEngineProcessException
+   *           if some problem was detected
+   */
+  public static void validateInternalIndexing(JCas jcas, Collection<String> ignoreTypeNames)
+          throws AnalysisEngineProcessException {
+
+    Map<Integer, RutaBasic> beginMap = new LinkedHashMap<>();
+    Map<Integer, RutaBasic> endMap = new LinkedHashMap<>();
+
+    Collection<RutaBasic> basics = JCasUtil.select(jcas, RutaBasic.class);
+
+    if (basics.isEmpty()) {
+      throw new AnalysisEngineProcessException(
+              new IllegalStateException("No RutaBasics available!"));
+    }
+    for (RutaBasic rutaBasic : basics) {
+
+      int begin = rutaBasic.getBegin();
+      int end = rutaBasic.getEnd();
+      // Every begin offset and every end offset may belong to at most one RutaBasic.
+      if (beginMap.get(begin) != null || endMap.get(end) != null) {
+        throw new AnalysisEngineProcessException(new IllegalStateException(
+                "RutaBasic must be disjunct! Problem at offset " + begin));
+      }
+
+      beginMap.put(begin, rutaBasic);
+      endMap.put(end, rutaBasic);
+    }
+
+    for (Annotation annotation : JCasUtil.select(jcas, Annotation.class)) {
+
+      Type type = annotation.getType();
+      if (ignoreType(type, ignoreTypeNames, jcas)) {
+        continue;
+      }
+
+      int begin = annotation.getBegin();
+      int end = annotation.getEnd();
+      // The annotation boundaries must coincide with RutaBasic boundaries.
+      RutaBasic beginBasic = beginMap.get(begin);
+      RutaBasic endBasic = endMap.get(end);
+      if (beginBasic == null) {
+        throw new AnalysisEngineProcessException(new IllegalStateException(
+                "No RutaBasic for begin of annotation at offset " + begin));
+      }
+      if (endBasic == null) {
+        throw new AnalysisEngineProcessException(
+                new IllegalStateException("No RutaBasic for end of annotation at offset " + end));
+      }
+      // The boundary RutaBasics must list the annotation as begin and as end anchor.
+      Collection<AnnotationFS> beginAnchors = beginBasic.getBeginAnchors(type);
+      if (beginAnchors == null || !beginAnchors.contains(annotation)) {
+        throw new AnalysisEngineProcessException(new IllegalStateException("Annotation of type '"
+                + type.getName() + "' not registered as begin at offset " + begin));
+      }
+      Collection<AnnotationFS> endAnchors = endBasic.getEndAnchors(type);
+      if (endAnchors == null || !endAnchors.contains(annotation)) {
+        throw new AnalysisEngineProcessException(new IllegalStateException("Annotation of type '"
+                + type.getName() + "' not registered as end at offset " + end));
+      }
+      // Every RutaBasic covered by the annotation must be marked as part of its type.
+      List<RutaBasic> coveredBasics = JCasUtil.selectCovered(RutaBasic.class, annotation);
+      for (RutaBasic coveredBasic : coveredBasics) {
+        if (!coveredBasic.isPartOf(type)) {
+          throw new AnalysisEngineProcessException(
+                  new IllegalStateException("Annotation of type '" + type.getName()
+                          + "' not registered as partof at offset [" + begin + "," + end + "]"));
+        }
+      }
+    }
+  }
+
+  private static boolean ignoreType(Type type, Collection<String> ignoreTypeNames, JCas jcas) {
+
+    // Without a type there is nothing to match against, so the annotation is not ignored.
+    if (type == null) {
+      return false;
+    }
+
+    // RutaBasic annotations are always skipped, independent of the given names.
+    boolean isBasic = StringUtils.equals(type.getName(), RutaBasic.class.getName());
+    if (isBasic) {
+      return true;
+    }
+
+    if (ignoreTypeNames == null) {
+      return false;
+    }
+
+    // Names unknown to the type system are skipped; a known one must subsume the given type.
+    TypeSystem ts = jcas.getTypeSystem();
+    for (String name : ignoreTypeNames) {
+      Type candidate = ts.getType(name);
+      if (candidate != null && ts.subsumes(candidate, type)) {
+        return true;
+      }
+    }
+    return false;
+  }
+
 }
diff --git a/ruta-core/src/main/java/org/apache/uima/ruta/engine/RutaEngine.java b/ruta-core/src/main/java/org/apache/uima/ruta/engine/RutaEngine.java
index d676528..093e35a 100644
--- a/ruta-core/src/main/java/org/apache/uima/ruta/engine/RutaEngine.java
+++ b/ruta-core/src/main/java/org/apache/uima/ruta/engine/RutaEngine.java
@@ -62,6 +62,7 @@
 import org.apache.uima.resource.ResourceManager;

 import org.apache.uima.ruta.FilterManager;

 import org.apache.uima.ruta.ReindexUpdateMode;

+import org.apache.uima.ruta.RutaBasicUtils;

 import org.apache.uima.ruta.RutaConstants;

 import org.apache.uima.ruta.RutaEnvironment;

 import org.apache.uima.ruta.RutaIndexingConfiguration;

@@ -529,6 +530,17 @@
   private ReindexUpdateMode reindexUpdateMode;

 

   /**

+   * Option to validate the internal indexing in RutaBasic with the current CAS after the indexing

+   * and reindexing is performed. Annotations that are not correctly indexed in RutaBasics cause

+   * Exceptions. Annotations of types listed in parameter 'indexSkipTypes' and 'reindexSkipTypes'

+   * are ignored. Default value is false.

+   */

+  public static final String PARAM_VALIDATE_INTERNAL_INDEXING = "validateInternalIndexing";

+

+  @ConfigurationParameter(name = PARAM_VALIDATE_INTERNAL_INDEXING, mandatory = true, defaultValue = "false")

+  private boolean validateInternalIndexing;

+

+  /**

    * This parameter determines positions as invisible if the internal indexing of the corresponding

    * RutaBasic annotation is empty.

    */

@@ -663,6 +675,14 @@
     stream.setGreedyRule(greedyRule);

     stream.setMaxRuleMatches(maxRuleMatches);

     stream.setMaxRuleElementMatches(maxRuleElementMatches);

+

+    if (validateInternalIndexing) {

+      Collection<String> ignoreTypeNames = new ArrayList<>();

+      ignoreTypeNames.addAll(Arrays.asList(indexSkipTypes));

+      ignoreTypeNames.addAll(Arrays.asList(reindexSkipTypes));

+      RutaBasicUtils.validateInternalIndexing(jcas, ignoreTypeNames);

+    }

+

     try {

       script.apply(stream, crowd);

     } catch (Throwable e) {

diff --git a/ruta-core/src/test/java/org/apache/uima/ruta/RutaBasicUtilsTest.java b/ruta-core/src/test/java/org/apache/uima/ruta/RutaBasicUtilsTest.java
new file mode 100644
index 0000000..89e11e2
--- /dev/null
+++ b/ruta-core/src/test/java/org/apache/uima/ruta/RutaBasicUtilsTest.java
@@ -0,0 +1,116 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.uima.ruta;
+
+import java.util.Arrays;
+
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.cas.CAS;
+import org.apache.uima.fit.testing.junit.ManagedJCas;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.ruta.engine.Ruta;
+import org.apache.uima.ruta.type.CW;
+import org.apache.uima.ruta.type.RutaBasic;
+import org.junit.Rule;
+import org.junit.Test;
+
+public class RutaBasicUtilsTest {
+  // Rule-managed JCas that each test below obtains via get().
+  @Rule public ManagedJCas managedJCas = new ManagedJCas();
+
+  @Test(expected = AnalysisEngineProcessException.class)
+  public void testBreakOnNoBasics() throws AnalysisEngineProcessException {
+    JCas jCas = managedJCas.get();
+    RutaBasicUtils.validateInternalIndexing(jCas, null);
+  }
+
+  @Test(expected = AnalysisEngineProcessException.class)
+  public void testBreakOnDuplicateBasics() throws AnalysisEngineProcessException {
+    JCas jCas = managedJCas.get();
+    new RutaBasic(jCas, 0, 1).addToIndexes();
+    new RutaBasic(jCas, 0, 1).addToIndexes();
+    RutaBasicUtils.validateInternalIndexing(jCas, null);
+  }
+
+  @Test(expected = AnalysisEngineProcessException.class)
+  public void testBreakOnMissingBasicAtBegin() throws AnalysisEngineProcessException {
+    JCas jCas = managedJCas.get();
+    new RutaBasic(jCas, 1, 2).addToIndexes();
+    new CW(jCas, 0, 2).addToIndexes();
+    RutaBasicUtils.validateInternalIndexing(jCas, null);
+  }
+
+  @Test(expected = AnalysisEngineProcessException.class)
+  public void testBreakOnMissingBasicAtEnd() throws AnalysisEngineProcessException {
+    JCas jCas = managedJCas.get();
+    new RutaBasic(jCas, 0, 1).addToIndexes();
+    new CW(jCas, 0, 2).addToIndexes();
+    RutaBasicUtils.validateInternalIndexing(jCas, null);
+  }
+
+  @Test(expected = AnalysisEngineProcessException.class)
+  public void testBreakOnMissingAnnotationAtBegin() throws AnalysisEngineProcessException {
+    JCas jCas = managedJCas.get();
+    CW word = new CW(jCas, 0, 1);
+    word.addToIndexes();
+    RutaBasic basic = new RutaBasic(jCas, 0, 1);
+    basic.addEnd(word, word.getType());
+    basic.addToIndexes();
+    RutaBasicUtils.validateInternalIndexing(jCas, null);
+  }
+
+  @Test(expected = AnalysisEngineProcessException.class)
+  public void testBreakOnMissingAnnotationAtEnd() throws AnalysisEngineProcessException {
+    JCas jCas = managedJCas.get();
+    CW word = new CW(jCas, 0, 1);
+    word.addToIndexes();
+    RutaBasic basic = new RutaBasic(jCas, 0, 1);
+    basic.addBegin(word, word.getType());
+    basic.addToIndexes();
+    RutaBasicUtils.validateInternalIndexing(jCas, null);
+  }
+
+  @Test(expected = AnalysisEngineProcessException.class)
+  public void testBreakOnMissingPartof() throws AnalysisEngineProcessException {
+    JCas jCas = managedJCas.get();
+    CW word = new CW(jCas, 0, 1);
+    word.addToIndexes();
+    RutaBasic basic = new RutaBasic(jCas, 0, 1);
+    basic.addBegin(word, word.getType());
+    basic.addEnd(word, word.getType());
+    basic.addToIndexes();
+    RutaBasicUtils.validateInternalIndexing(jCas, null);
+  }
+
+  @Test
+  public void testIgnoreTypeNames() throws AnalysisEngineProcessException {
+    JCas jCas = managedJCas.get();
+    new RutaBasic(jCas, 0, 1).addToIndexes();
+    new CW(jCas, 0, 1).addToIndexes();
+    RutaBasicUtils.validateInternalIndexing(jCas, Arrays.asList(CAS.TYPE_NAME_ANNOTATION));
+  }
+
+  @Test
+  public void testAllGood() throws Exception {
+    JCas jCas = managedJCas.get();
+    jCas.setDocumentText("This is 1 TEST.");
+    Ruta.apply(jCas.getCas(), "CW{-> TruePositive};");
+    RutaBasicUtils.validateInternalIndexing(jCas, null);
+  }
+}
diff --git a/ruta-docbook/src/docbook/tools.ruta.overview.xml b/ruta-docbook/src/docbook/tools.ruta.overview.xml
index 3107a97..1bce6ab 100644
--- a/ruta-docbook/src/docbook/tools.ruta.overview.xml
+++ b/ruta-docbook/src/docbook/tools.ruta.overview.xml
@@ -927,6 +927,14 @@
                 </row>

                 <row>

                   <entry>

+                    <link linkend='ugr.tools.ruta.ae.basic.parameter.validateInternalIndexing'>validateInternalIndexing</link>

+                  </entry>

+                  <entry>Option to validate the internal indexing.

+                  </entry>

+                  <entry>Single Boolean</entry>

+                </row>

+                <row>

+                  <entry>

                     <link linkend='ugr.tools.ruta.ae.basic.parameter.emptyIsInvisible'>emptyIsInvisible</link>

                   </entry>

                   <entry>Option to define empty text positions as invisible.

@@ -1285,6 +1293,16 @@
            Default value is ADDITIVE.

           </para>

         </section>

+        <section id="ugr.tools.ruta.ae.basic.parameter.validateInternalIndexing">

+          <title>validateInternalIndexing</title>

+          <para>

+            Option to validate the internal indexing in RutaBasic with the current CAS after the indexing

+            and reindexing is performed. Annotations that are not correctly indexed in RutaBasics cause

+            Exceptions. Annotations of types listed in parameter 'indexSkipTypes' and 'reindexSkipTypes'

+            are ignored. Default value is false.

+          </para>

+        </section>

+

         <section id="ugr.tools.ruta.ae.basic.parameter.emptyIsInvisible">

           <title>emptyIsInvisible</title>

           <para>

diff --git a/ruta-parent/pom.xml b/ruta-parent/pom.xml
index 0b0c451..5f70f13 100644
--- a/ruta-parent/pom.xml
+++ b/ruta-parent/pom.xml
@@ -131,7 +131,7 @@
       Creative Commons Attribution 3.0 License.

     </postNoticeText>

     <uimaVersion>2.10.4</uimaVersion>

-    <uimafit-version>2.4.0</uimafit-version>

+    <uimafit-version>2.5.1-SNAPSHOT</uimafit-version>

     <spring-version>4.3.22.RELEASE</spring-version>

     <!--

       BACKWARD_COMPATIBLE_IMPLEMENTER - patch version (=.=.+)