Merge pull request #28 from apache/UIMA-6319-TextSeeder-creates-MARKUP-annotations

UIMA-6319: TextSeeder creates MARKUP annotations
diff --git a/ruta-core/src/main/java/org/apache/uima/ruta/seed/DefaultSeeder.java b/ruta-core/src/main/java/org/apache/uima/ruta/seed/DefaultSeeder.java
index 97fd53d..d294da1 100644
--- a/ruta-core/src/main/java/org/apache/uima/ruta/seed/DefaultSeeder.java
+++ b/ruta-core/src/main/java/org/apache/uima/ruta/seed/DefaultSeeder.java
@@ -40,6 +40,9 @@
   private final Pattern markupPattern = Pattern.compile(

           "</?\\w[\\w-]*((\\s+[\\w-]+(\\s*=\\s*(?:\".*?\"|'.*?'|[^'\">\\s]+))?)+\\s*|\\s*)/?>");

 

+  private final Pattern xmlCommentPattern = Pattern.compile("<!--[\\s\\S]*?-->"); // multi-line, lazy

+

+  @Override

   public Type seed(String text, CAS cas) {

     Type result = super.seed(text, cas);

     JCas jCas = null;

@@ -52,20 +55,26 @@
     // FIXME: lexer rules for html markup won't work. Therefore, those rules where removed in the

     // grammar and the functionality is included directly with regex

     if (text != null) {

-      Matcher matcher = markupPattern.matcher(text);

       Collection<AnnotationFS> toRemove = new LinkedList<AnnotationFS>();

-      while (matcher.find()) {

-        int begin = matcher.start();

-        int end = matcher.end();

-        MARKUP markup = new MARKUP(jCas, begin, end);

-        markup.addToIndexes();

-        List<AnnotationFS> selectCovered = CasUtil.selectCovered(result, markup);

-        toRemove.addAll(selectCovered);

-      }

+      addMarkupAnnotations(text, result, xmlCommentPattern, jCas, toRemove);

+      addMarkupAnnotations(text, result, markupPattern, jCas, toRemove);

       for (AnnotationFS each : toRemove) {

         cas.removeFsFromIndexes(each);

       }

     }

     return result;

   }

+

+  private void addMarkupAnnotations(String text, Type result, Pattern pattern, JCas jCas,

+          Collection<AnnotationFS> toRemove) {

+    // Index one MARKUP per regex hit and queue the 'result' annotations it covers in toRemove.

+    Matcher hits = pattern.matcher(text);

+    while (hits.find()) {

+      // The MARKUP spans exactly the matched region of the text.

+      MARKUP span = new MARKUP(jCas, hits.start(), hits.end());

+      span.addToIndexes();

+      // Only collected here; this method does not remove anything from the indexes itself.

+      toRemove.addAll(CasUtil.selectCovered(result, span));

+    }

+  }

 }

diff --git a/ruta-core/src/main/jflex/org/apache/uima/ruta/seed/SeedLexer.flex b/ruta-core/src/main/jflex/org/apache/uima/ruta/seed/SeedLexer.flex
index 8ebaad8..d4b1a0e 100644
--- a/ruta-core/src/main/jflex/org/apache/uima/ruta/seed/SeedLexer.flex
+++ b/ruta-core/src/main/jflex/org/apache/uima/ruta/seed/SeedLexer.flex
@@ -19,8 +19,6 @@
 

 

 package org.apache.uima.ruta.seed;

-import java.util.*;

-import java.util.regex.*;

 

 import org.apache.uima.cas.text.AnnotationFS;

 import org.apache.uima.jcas.JCas;

@@ -32,7 +30,6 @@
 import org.apache.uima.ruta.type.COMMA;

 import org.apache.uima.ruta.type.CW;

 import org.apache.uima.ruta.type.EXCLAMATION;

-import org.apache.uima.ruta.type.MARKUP;

 import org.apache.uima.ruta.type.NBSP;

 import org.apache.uima.ruta.type.NUM;

 import org.apache.uima.ruta.type.PERIOD;

@@ -68,23 +65,6 @@
 

 <YYINITIAL> {

     

-    \<[/][!][^>]*> {

-                MARKUP t = new MARKUP(cas);

-                t.setBegin(yychar);

-                t.setEnd(yychar + yytext().length());

-                

-                return t;

-    }

-                    

-    \<[!][^>]*> {

-                MARKUP t = new MARKUP(cas);

-                t.setBegin(yychar);

-                t.setEnd(yychar + yytext().length());

-                

-                return t;

-    }

-    

-                                       

     \u00A0|\u202F|\uFEFF|\u2007|\u180E|&nbsp;|&NBSP; {

                 NBSP t = new NBSP(cas);

                 t.setBegin(yychar);

diff --git a/ruta-core/src/test/java/org/apache/uima/ruta/engine/RutaModifierTest.java b/ruta-core/src/test/java/org/apache/uima/ruta/engine/RutaModifierTest.java
index e3e0af6..a4f229f 100644
--- a/ruta-core/src/test/java/org/apache/uima/ruta/engine/RutaModifierTest.java
+++ b/ruta-core/src/test/java/org/apache/uima/ruta/engine/RutaModifierTest.java
@@ -42,13 +42,13 @@
     String namespace = this.getClass().getPackage().getName().replaceAll("\\.", "/");

     URL url = HtmlAnnotator.class.getClassLoader().getResource("Modifier.xml");

     if (url == null) {

-      url = HtmlAnnotator.class.getClassLoader().getResource(

-              "org/apache/uima/ruta/engine/Modifier.xml");

+      url = HtmlAnnotator.class.getClassLoader()

+              .getResource("org/apache/uima/ruta/engine/Modifier.xml");

     }

     XMLInputSource in = new XMLInputSource(url);

     ResourceSpecifier specifier = UIMAFramework.getXMLParser().parseResourceSpecifier(in);

     AnalysisEngineDescription aed = (AnalysisEngineDescription) specifier;

-    

+

     TypeSystemDescription basicTypeSystem = aed.getAnalysisEngineMetaData().getTypeSystem();

     for (int i = 1; i <= 20; i++) {

       basicTypeSystem.addType(RutaTestUtils.TYPE + i, "Type for Testing", "uima.tcas.Annotation");

@@ -62,26 +62,19 @@
     String viewName = "modified_for_testing";

     ae.setConfigParameterValue(RutaModifier.PARAM_OUTPUT_VIEW, viewName);

     ae.reconfigure();

-    

+

     String scriptName = this.getClass().getSimpleName();

-    CAS cas = null;

-    try {

-      cas = RutaTestUtils.process(namespace + "/" + scriptName + RutaEngine.SCRIPT_FILE_EXTENSION, namespace + "/test.html", 50);

-    } catch (Exception e) {

-      e.printStackTrace();

-      assert (false);

-    }

+    CAS cas = RutaTestUtils.process(namespace + "/" + scriptName + RutaEngine.SCRIPT_FILE_EXTENSION,

+            namespace + "/test.html", 50);

     ae.process(cas);

-    

+

     CAS modifiedView = cas.getView(viewName);

     String text = modifiedView.getDocumentText();

-    

-    assertEquals("start of bodynormal BOLDend of body" , text);

-    

-    

+

+    assertEquals("start of bodynormal BOLDend of body", text);

+

     cas.release();

     ae.destroy();

   }

 

-  

 }

diff --git a/ruta-core/src/test/java/org/apache/uima/ruta/seed/DefaultSeederTest.java b/ruta-core/src/test/java/org/apache/uima/ruta/seed/DefaultSeederTest.java
index e39a59c..4e93c3e 100644
--- a/ruta-core/src/test/java/org/apache/uima/ruta/seed/DefaultSeederTest.java
+++ b/ruta-core/src/test/java/org/apache/uima/ruta/seed/DefaultSeederTest.java
@@ -22,6 +22,8 @@
 import static org.junit.Assert.assertEquals;

 

 import java.net.URL;

+import java.util.LinkedHashMap;

+import java.util.Map;

 

 import org.apache.uima.UIMAFramework;

 import org.apache.uima.analysis_engine.AnalysisEngine;

@@ -173,4 +175,24 @@
     RutaTestUtils.assertAnnotationsEquals(cas, 1, 1, "
");

   }

 

+  @Test

+  public void testMultiLineXmlComment() throws Exception {

+    // The XML comment contains line breaks and must still become one MARKUP annotation.

+    String document = "Text text <!-- some \n\r more text --> text text.";

+    String comment = "<!-- some \n\r more text -->";

+    String script = "ALL{-> T1};\n" + "ADDRETAINTYPE(MARKUP);\n" + "ALL{-> T2};\n"

+            + "MARKUP{-> T3};\n";

+

+    Map<String, Object> params = new LinkedHashMap<>();

+    params.put(RutaEngine.PARAM_SEEDERS, new String[] { DefaultSeeder.class.getName() });

+    CAS cas = RutaTestUtils.getCAS(document);

+    Ruta.apply(cas, script, params);

+

+    // T1 is assigned before ADDRETAINTYPE(MARKUP): the comment is expected to be skipped.

+    RutaTestUtils.assertAnnotationsEquals(cas, 1, 5, "Text", "text", "text", "text", ".");

+    // T2 is assigned afterwards: the comment is expected as a single match; T3 is the MARKUP.

+    RutaTestUtils.assertAnnotationsEquals(cas, 2, 6, "Text", "text", comment, "text", "text", ".");

+    RutaTestUtils.assertAnnotationsEquals(cas, 3, 1, comment);

+  }

+

 }

diff --git a/ruta-core/src/test/java/org/apache/uima/ruta/seed/TextSeederTest.java b/ruta-core/src/test/java/org/apache/uima/ruta/seed/TextSeederTest.java
new file mode 100644
index 0000000..ee20d58
--- /dev/null
+++ b/ruta-core/src/test/java/org/apache/uima/ruta/seed/TextSeederTest.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima.ruta.seed;
+
+import java.util.LinkedHashMap;
+import java.util.Map;
+
+import org.apache.uima.cas.CAS;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.ruta.engine.Ruta;
+import org.apache.uima.ruta.engine.RutaEngine;
+import org.apache.uima.ruta.engine.RutaTestUtils;
+import org.apache.uima.ruta.type.MARKUP;
+import org.junit.Assert;
+import org.junit.Test;
+
+/** Checks that seeding with {@link TextSeeder} leaves an XML comment as plain tokens. */
+public class TextSeederTest {
+
+  // With TextSeeder as the only seeder, no MARKUP annotation is expected for the comment.
+  @Test
+  public void testNoMarkupForXmlComment() throws Exception {
+    String document = "Text text <!-- some more text --> text text.";
+    String script = "ALL{-> T1};\n" + "ADDRETAINTYPE(MARKUP);\n" + "ALL{-> T2};\n"
+            + "MARKUP{-> T3};\n";
+    String[] tokens = { "Text", "text", "<", "!", "-", "-", "some", "more", "text", "-", "-", ">",
+        "text", "text", "." };
+
+    Map<String, Object> params = new LinkedHashMap<>();
+    params.put(RutaEngine.PARAM_SEEDERS, new String[] { TextSeeder.class.getName() });
+    CAS cas = RutaTestUtils.getCAS(document);
+    Ruta.apply(cas, script, params);
+
+    // T1 and T2 are expected on the same 15 tokens, before and after ADDRETAINTYPE(MARKUP).
+    RutaTestUtils.assertAnnotationsEquals(cas, 1, 15, tokens);
+    RutaTestUtils.assertAnnotationsEquals(cas, 2, 15, tokens);
+    // Neither a T3 annotation nor any MARKUP annotation may exist in the CAS.
+    RutaTestUtils.assertAnnotationsEquals(cas, 3, 0);
+    Assert.assertTrue(JCasUtil.select(cas.getJCas(), MARKUP.class).isEmpty());
+  }
+
+}