Merge pull request #28 from apache/UIMA-6319-TextSeeder-creates-MARKUP-annotations
UIMA-6319: TextSeeder creates MARKUP annotations
diff --git a/ruta-core/src/main/java/org/apache/uima/ruta/seed/DefaultSeeder.java b/ruta-core/src/main/java/org/apache/uima/ruta/seed/DefaultSeeder.java
index 97fd53d..d294da1 100644
--- a/ruta-core/src/main/java/org/apache/uima/ruta/seed/DefaultSeeder.java
+++ b/ruta-core/src/main/java/org/apache/uima/ruta/seed/DefaultSeeder.java
@@ -40,6 +40,9 @@
private final Pattern markupPattern = Pattern.compile(
"</?\\w[\\w-]*((\\s+[\\w-]+(\\s*=\\s*(?:\".*?\"|'.*?'|[^'\">\\s]+))?)+\\s*|\\s*)/?>");
+ private final Pattern xmlCommentPattern = Pattern.compile("<!--[\\s\\S\n]*?-->");
+
+ @Override
public Type seed(String text, CAS cas) {
Type result = super.seed(text, cas);
JCas jCas = null;
@@ -52,20 +55,26 @@
// FIXME: lexer rules for html markup won't work. Therefore, those rules where removed in the
// grammar and the functionality is included directly with regex
if (text != null) {
- Matcher matcher = markupPattern.matcher(text);
Collection<AnnotationFS> toRemove = new LinkedList<AnnotationFS>();
- while (matcher.find()) {
- int begin = matcher.start();
- int end = matcher.end();
- MARKUP markup = new MARKUP(jCas, begin, end);
- markup.addToIndexes();
- List<AnnotationFS> selectCovered = CasUtil.selectCovered(result, markup);
- toRemove.addAll(selectCovered);
- }
+ addMarkupAnnotations(text, result, xmlCommentPattern, jCas, toRemove);
+ addMarkupAnnotations(text, result, markupPattern, jCas, toRemove);
for (AnnotationFS each : toRemove) {
cas.removeFsFromIndexes(each);
}
}
return result;
}
+
+ private void addMarkupAnnotations(String text, Type result, Pattern pattern, JCas jCas,
+ Collection<AnnotationFS> toRemove) {
+ Matcher matcher = pattern.matcher(text);
+ while (matcher.find()) {
+ int begin = matcher.start();
+ int end = matcher.end();
+ MARKUP markup = new MARKUP(jCas, begin, end);
+ markup.addToIndexes();
+ List<AnnotationFS> selectCovered = CasUtil.selectCovered(result, markup);
+ toRemove.addAll(selectCovered);
+ }
+ }
}
diff --git a/ruta-core/src/main/jflex/org/apache/uima/ruta/seed/SeedLexer.flex b/ruta-core/src/main/jflex/org/apache/uima/ruta/seed/SeedLexer.flex
index 8ebaad8..d4b1a0e 100644
--- a/ruta-core/src/main/jflex/org/apache/uima/ruta/seed/SeedLexer.flex
+++ b/ruta-core/src/main/jflex/org/apache/uima/ruta/seed/SeedLexer.flex
@@ -19,8 +19,6 @@
package org.apache.uima.ruta.seed;
-import java.util.*;
-import java.util.regex.*;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.jcas.JCas;
@@ -32,7 +30,6 @@
import org.apache.uima.ruta.type.COMMA;
import org.apache.uima.ruta.type.CW;
import org.apache.uima.ruta.type.EXCLAMATION;
-import org.apache.uima.ruta.type.MARKUP;
import org.apache.uima.ruta.type.NBSP;
import org.apache.uima.ruta.type.NUM;
import org.apache.uima.ruta.type.PERIOD;
@@ -68,23 +65,6 @@
<YYINITIAL> {
- \<[/][!][^>]*> {
- MARKUP t = new MARKUP(cas);
- t.setBegin(yychar);
- t.setEnd(yychar + yytext().length());
-
- return t;
- }
-
- \<[!][^>]*> {
- MARKUP t = new MARKUP(cas);
- t.setBegin(yychar);
- t.setEnd(yychar + yytext().length());
-
- return t;
- }
-
-
\u00A0|\u202F|\uFEFF|\u2007|\u180E| |&NBSP; {
NBSP t = new NBSP(cas);
t.setBegin(yychar);
diff --git a/ruta-core/src/test/java/org/apache/uima/ruta/engine/RutaModifierTest.java b/ruta-core/src/test/java/org/apache/uima/ruta/engine/RutaModifierTest.java
index e3e0af6..a4f229f 100644
--- a/ruta-core/src/test/java/org/apache/uima/ruta/engine/RutaModifierTest.java
+++ b/ruta-core/src/test/java/org/apache/uima/ruta/engine/RutaModifierTest.java
@@ -42,13 +42,13 @@
String namespace = this.getClass().getPackage().getName().replaceAll("\\.", "/");
URL url = HtmlAnnotator.class.getClassLoader().getResource("Modifier.xml");
if (url == null) {
- url = HtmlAnnotator.class.getClassLoader().getResource(
- "org/apache/uima/ruta/engine/Modifier.xml");
+ url = HtmlAnnotator.class.getClassLoader()
+ .getResource("org/apache/uima/ruta/engine/Modifier.xml");
}
XMLInputSource in = new XMLInputSource(url);
ResourceSpecifier specifier = UIMAFramework.getXMLParser().parseResourceSpecifier(in);
AnalysisEngineDescription aed = (AnalysisEngineDescription) specifier;
-
+
TypeSystemDescription basicTypeSystem = aed.getAnalysisEngineMetaData().getTypeSystem();
for (int i = 1; i <= 20; i++) {
basicTypeSystem.addType(RutaTestUtils.TYPE + i, "Type for Testing", "uima.tcas.Annotation");
@@ -62,26 +62,19 @@
String viewName = "modified_for_testing";
ae.setConfigParameterValue(RutaModifier.PARAM_OUTPUT_VIEW, viewName);
ae.reconfigure();
-
+
String scriptName = this.getClass().getSimpleName();
- CAS cas = null;
- try {
- cas = RutaTestUtils.process(namespace + "/" + scriptName + RutaEngine.SCRIPT_FILE_EXTENSION, namespace + "/test.html", 50);
- } catch (Exception e) {
- e.printStackTrace();
- assert (false);
- }
+ CAS cas = RutaTestUtils.process(namespace + "/" + scriptName + RutaEngine.SCRIPT_FILE_EXTENSION,
+ namespace + "/test.html", 50);
ae.process(cas);
-
+
CAS modifiedView = cas.getView(viewName);
String text = modifiedView.getDocumentText();
-
- assertEquals("start of bodynormal BOLDend of body" , text);
-
-
+
+ assertEquals("start of bodynormal BOLDend of body", text);
+
cas.release();
ae.destroy();
}
-
}
diff --git a/ruta-core/src/test/java/org/apache/uima/ruta/seed/DefaultSeederTest.java b/ruta-core/src/test/java/org/apache/uima/ruta/seed/DefaultSeederTest.java
index e39a59c..4e93c3e 100644
--- a/ruta-core/src/test/java/org/apache/uima/ruta/seed/DefaultSeederTest.java
+++ b/ruta-core/src/test/java/org/apache/uima/ruta/seed/DefaultSeederTest.java
@@ -22,6 +22,8 @@
import static org.junit.Assert.assertEquals;
import java.net.URL;
+import java.util.LinkedHashMap;
+import java.util.Map;
import org.apache.uima.UIMAFramework;
import org.apache.uima.analysis_engine.AnalysisEngine;
@@ -173,4 +175,24 @@
RutaTestUtils.assertAnnotationsEquals(cas, 1, 1, "
");
}
+ @Test
+ public void testMultiLineXmlComment() throws Exception {
+
+ String document = "Text text <!-- some \n\r more text --> text text.";
+ String script = "ALL{-> T1};\n";
+ script += "ADDRETAINTYPE(MARKUP);\n";
+ script += "ALL{-> T2};\n";
+ script += "MARKUP{-> T3};\n";
+
+ CAS cas = RutaTestUtils.getCAS(document);
+ Map<String, Object> params = new LinkedHashMap<>();
+ params.put(RutaEngine.PARAM_SEEDERS, new String[] { DefaultSeeder.class.getName() });
+ Ruta.apply(cas, script, params);
+
+ RutaTestUtils.assertAnnotationsEquals(cas, 1, 5, "Text", "text", "text", "text", ".");
+ RutaTestUtils.assertAnnotationsEquals(cas, 2, 6, "Text", "text", "<!-- some \n\r more text -->",
+ "text", "text", ".");
+ RutaTestUtils.assertAnnotationsEquals(cas, 3, 1, "<!-- some \n\r more text -->");
+ }
+
}
diff --git a/ruta-core/src/test/java/org/apache/uima/ruta/seed/TextSeederTest.java b/ruta-core/src/test/java/org/apache/uima/ruta/seed/TextSeederTest.java
new file mode 100644
index 0000000..ee20d58
--- /dev/null
+++ b/ruta-core/src/test/java/org/apache/uima/ruta/seed/TextSeederTest.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima.ruta.seed;
+
+import java.util.LinkedHashMap;
+import java.util.Map;
+
+import org.apache.uima.cas.CAS;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.ruta.engine.Ruta;
+import org.apache.uima.ruta.engine.RutaEngine;
+import org.apache.uima.ruta.engine.RutaTestUtils;
+import org.apache.uima.ruta.type.MARKUP;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class TextSeederTest {
+
+ @Test
+ public void testNoMarkupForXmlComment() throws Exception {
+
+ String document = "Text text <!-- some more text --> text text.";
+ String script = "ALL{-> T1};\n";
+ script += "ADDRETAINTYPE(MARKUP);\n";
+ script += "ALL{-> T2};\n";
+ script += "MARKUP{-> T3};\n";
+
+ CAS cas = RutaTestUtils.getCAS(document);
+ Map<String, Object> params = new LinkedHashMap<>();
+ params.put(RutaEngine.PARAM_SEEDERS, new String[] { TextSeeder.class.getName() });
+ Ruta.apply(cas, script, params);
+
+ RutaTestUtils.assertAnnotationsEquals(cas, 1, 15, "Text", "text", "<", "!", "-", "-", "some",
+ "more", "text", "-", "-", ">", "text", "text", ".");
+ RutaTestUtils.assertAnnotationsEquals(cas, 2, 15, "Text", "text", "<", "!", "-", "-", "some",
+ "more", "text", "-", "-", ">", "text", "text", ".");
+ RutaTestUtils.assertAnnotationsEquals(cas, 3, 0);
+
+ Assert.assertTrue(JCasUtil.select(cas.getJCas(), MARKUP.class).isEmpty());
+ }
+
+}