trunk/ruta-core/src/test/java/org/apache/uima/ruta/engine/HtmlConverterXmlTest.java - uima-ruta - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  * http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */

 package org.apache.uima.ruta.engine;

 import static org.junit.Assert.assertEquals;

 import java.net.URL;

 import org.apache.uima.UIMAFramework;
 import org.apache.uima.analysis_engine.AnalysisEngine;
 import org.apache.uima.cas.CAS;
 import org.apache.uima.cas.FSIterator;
 import org.apache.uima.cas.Feature;
 import org.apache.uima.cas.Type;
 import org.apache.uima.cas.text.AnnotationFS;
 import org.apache.uima.cas.text.AnnotationIndex;
 import org.apache.uima.resource.ResourceSpecifier;
 import org.apache.uima.util.XMLInputSource;
 import org.junit.Test;

 /**
  * Tests that run {@link HtmlAnnotator} followed by {@link HtmlConverter} on a small XML document
  * and check the text and TAG annotations of the converter's output view.
  */
 public class HtmlConverterXmlTest {

   /** XML document processed by both tests; {@code Child2} is an empty, self-closing element. */
   private static final String XML_INPUT = "<Parent>\n"
           + "<Child1>Some content</Child1>\n"
           + "<Child2 attribute=“someValue” />\n"
           + "<Child3>More content.</Child3>\n"
           + "</Parent>\n";

   /**
    * Configures the converter with gap-inducing tags and the gap text "$", then checks the
    * converted document text and the covered text of the four TAG annotations.
    */
   @Test
   public void test() throws Exception {
     AnalysisEngine annotator = null;
     AnalysisEngine converter = null;
     CAS cas = null;
     try {
       annotator = createAnnotator();
       converter = createConverter();
       converter.setConfigParameterValue(HtmlConverter.PARAM_GAP_INDUCING_TAGS, new String[] {
           "child1", "child2", "child3" });
       converter.setConfigParameterValue(HtmlConverter.PARAM_GAP_TEXT, "$");
       converter.reconfigure();

       cas = annotator.newCAS();
       Type tagType = cas.getTypeSystem().getType(HtmlAnnotator.NAMESPACE + "TAG");
       CAS plainTextCas = convert(cas, annotator, converter);

       assertEquals("$Some content$$More content.", plainTextCas.getDocumentText());

       AnnotationIndex<AnnotationFS> tags = plainTextCas.getAnnotationIndex(tagType);
       FSIterator<AnnotationFS> iterator = tags.iterator();
       assertEquals(4, tags.size());
       assertEquals("$Some content$$More content.", iterator.next().getCoveredText());
       assertEquals("$Some content", iterator.next().getCoveredText());
       assertEquals("$", iterator.next().getCoveredText());
       assertEquals("$More content.", iterator.next().getCoveredText());
     } finally {
       // Runs even when an assertion fails, so the CAS and engines are always given back.
       release(cas, annotator, converter);
     }
   }

   /**
    * Enables {@link HtmlConverter#PARAM_EXPAND_OFFSETS} and checks the converted text, the covered
    * text of the four TAG annotations and their "expandedOffsets" feature values.
    */
   @Test
   public void testExpandOffsets() throws Exception {
     AnalysisEngine annotator = null;
     AnalysisEngine converter = null;
     CAS cas = null;
     try {
       annotator = createAnnotator();
       converter = createConverter();
       converter.setConfigParameterValue(HtmlConverter.PARAM_EXPAND_OFFSETS, true);
       converter.reconfigure();

       cas = annotator.newCAS();
       Type tagType = cas.getTypeSystem().getType(HtmlAnnotator.NAMESPACE + "TAG");
       Feature expandedFeature = tagType.getFeatureByBaseName("expandedOffsets");
       CAS plainTextCas = convert(cas, annotator, converter);

       assertEquals("Some contentMore content.", plainTextCas.getDocumentText());

       AnnotationIndex<AnnotationFS> tags = plainTextCas.getAnnotationIndex(tagType);
       FSIterator<AnnotationFS> iterator = tags.iterator();
       assertEquals(4, tags.size());

       AnnotationFS next = iterator.next();
       assertEquals(false, next.getBooleanValue(expandedFeature));
       assertEquals("Some contentMore content.", next.getCoveredText());

       next = iterator.next();
       assertEquals(false, next.getBooleanValue(expandedFeature));
       assertEquals("Some content", next.getCoveredText());

       next = iterator.next();
       boolean b1 = next.getBooleanValue(expandedFeature);
       assertEquals("More content.", next.getCoveredText());

       next = iterator.next();
       boolean b2 = next.getBooleanValue(expandedFeature);
       assertEquals("More content.", next.getCoveredText());

       // for one of these two annotations (with same offsets) the feature must be set to true
       assertEquals(true, b1 || b2);
     } finally {
       // Runs even when an assertion fails, so the CAS and engines are always given back.
       release(cas, annotator, converter);
     }
   }

   /** Creates the HtmlAnnotator engine with {@code PARAM_ONLY_CONTENT} set to false. */
   private static AnalysisEngine createAnnotator() throws Exception {
     AnalysisEngine annotator = createEngine("HtmlAnnotator.xml");
     annotator.setConfigParameterValue(HtmlAnnotator.PARAM_ONLY_CONTENT, false);
     annotator.reconfigure();
     return annotator;
   }

   /**
    * Creates the HtmlConverter engine with the parameters both tests share. The caller adds its
    * own parameters and must call {@code reconfigure()} afterwards.
    */
   private static AnalysisEngine createConverter() throws Exception {
     AnalysisEngine converter = createEngine("HtmlConverter.xml");
     converter.setConfigParameterValue(HtmlConverter.PARAM_SKIP_WHITESPACES, false);
     converter.setConfigParameterValue(HtmlConverter.PARAM_PROCESS_ALL, true);
     return converter;
   }

   /**
    * Produces an analysis engine from a descriptor looked up on the classpath, first at the root
    * and then under {@code org/apache/uima/ruta/engine/}.
    *
    * @param descriptorName file name of the descriptor, e.g. {@code "HtmlAnnotator.xml"}
    * @throws IllegalStateException if the descriptor is found at neither location
    */
   private static AnalysisEngine createEngine(String descriptorName) throws Exception {
     ClassLoader loader = HtmlAnnotator.class.getClassLoader();
     URL url = loader.getResource(descriptorName);
     if (url == null) {
       url = loader.getResource("org/apache/uima/ruta/engine/" + descriptorName);
     }
     if (url == null) {
       throw new IllegalStateException("Descriptor not found on classpath: " + descriptorName);
     }
     ResourceSpecifier specifier = UIMAFramework.getXMLParser().parseResourceSpecifier(
             new XMLInputSource(url));
     return UIMAFramework.produceAnalysisEngine(specifier);
   }

   /**
    * Sets {@link #XML_INPUT} as document text, runs the annotator and then the converter on the
    * CAS, and returns the view named {@link HtmlConverter#DEFAULT_MODIFIED_VIEW}.
    */
   private static CAS convert(CAS cas, AnalysisEngine annotator, AnalysisEngine converter)
           throws Exception {
     cas.setDocumentText(XML_INPUT);
     annotator.process(cas);
     converter.process(cas);
     return cas.getView(HtmlConverter.DEFAULT_MODIFIED_VIEW);
   }

   /** Releases the CAS and destroys the engines, skipping anything that was never created. */
   private static void release(CAS cas, AnalysisEngine... engines) {
     if (cas != null) {
       cas.release();
     }
     for (AnalysisEngine engine : engines) {
       if (engine != null) {
         engine.destroy();
       }
     }
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/

	package org.apache.uima.ruta.engine;

	import static org.junit.Assert.assertEquals;

	import java.net.URL;

	import org.apache.uima.UIMAFramework;
	import org.apache.uima.analysis_engine.AnalysisEngine;
	import org.apache.uima.cas.CAS;
	import org.apache.uima.cas.FSIterator;
	import org.apache.uima.cas.Feature;
	import org.apache.uima.cas.Type;
	import org.apache.uima.cas.text.AnnotationFS;
	import org.apache.uima.cas.text.AnnotationIndex;
	import org.apache.uima.resource.ResourceSpecifier;
	import org.apache.uima.util.XMLInputSource;
	import org.junit.Test;

	public class HtmlConverterXmlTest {

	@Test
	public void test() throws Exception {
	String html = "<Parent>\n";
	html += "<Child1>Some content</Child1>\n";
	html += "<Child2 attribute=“someValue” />\n";
	html += "<Child3>More content.</Child3>\n";
	html += "</Parent>\n";

	URL urlA = HtmlAnnotator.class.getClassLoader().getResource("HtmlAnnotator.xml");
	if (urlA == null) {
	urlA = HtmlAnnotator.class.getClassLoader().getResource(
	"org/apache/uima/ruta/engine/HtmlAnnotator.xml");
	}

	URL urlC = HtmlAnnotator.class.getClassLoader().getResource("HtmlConverter.xml");
	if (urlC == null) {
	urlC = HtmlAnnotator.class.getClassLoader().getResource(
	"org/apache/uima/ruta/engine/HtmlConverter.xml");
	}

	XMLInputSource inA = new XMLInputSource(urlA);
	ResourceSpecifier specifierA = UIMAFramework.getXMLParser().parseResourceSpecifier(inA);
	AnalysisEngine aeA = UIMAFramework.produceAnalysisEngine(specifierA);
	aeA.setConfigParameterValue(HtmlAnnotator.PARAM_ONLY_CONTENT, false);
	aeA.reconfigure();

	XMLInputSource inC = new XMLInputSource(urlC);
	ResourceSpecifier specifierC = UIMAFramework.getXMLParser().parseResourceSpecifier(inC);
	AnalysisEngine aeC = UIMAFramework.produceAnalysisEngine(specifierC);
	aeC.setConfigParameterValue(HtmlConverter.PARAM_SKIP_WHITESPACES, false);
	aeC.setConfigParameterValue(HtmlConverter.PARAM_PROCESS_ALL, true);
	aeC.setConfigParameterValue(HtmlConverter.PARAM_GAP_INDUCING_TAGS, new String[] { "child1",
	"child2", "child3" });
	aeC.setConfigParameterValue(HtmlConverter.PARAM_GAP_TEXT, "$");
	aeC.reconfigure();

	CAS cas = aeA.newCAS();
	Type tagType = cas.getTypeSystem().getType(HtmlAnnotator.NAMESPACE + "TAG");
	AnnotationIndex<AnnotationFS> ai = null;
	FSIterator<AnnotationFS> iterator = null;

	cas.setDocumentText(html);
	aeA.process(cas);
	aeC.process(cas);

	CAS plainTextCas = cas.getView(HtmlConverter.DEFAULT_MODIFIED_VIEW);

	assertEquals("$Some content$$More content.", plainTextCas.getDocumentText());

	ai = plainTextCas.getAnnotationIndex(tagType);
	iterator = ai.iterator();
	assertEquals(4, ai.size());
	assertEquals("$Some content$$More content.", iterator.next().getCoveredText());
	assertEquals("$Some content", iterator.next().getCoveredText());
	assertEquals("$", iterator.next().getCoveredText());
	assertEquals("$More content.", iterator.next().getCoveredText());

	cas.release();
	}

	@Test
	public void testExpandOffsets() throws Exception {
	String html = "<Parent>\n";
	html += "<Child1>Some content</Child1>\n";
	html += "<Child2 attribute=“someValue” />\n";
	html += "<Child3>More content.</Child3>\n";
	html += "</Parent>\n";

	URL urlA = HtmlAnnotator.class.getClassLoader().getResource("HtmlAnnotator.xml");
	if (urlA == null) {
	urlA = HtmlAnnotator.class.getClassLoader().getResource(
	"org/apache/uima/ruta/engine/HtmlAnnotator.xml");
	}

	URL urlC = HtmlAnnotator.class.getClassLoader().getResource("HtmlConverter.xml");
	if (urlC == null) {
	urlC = HtmlAnnotator.class.getClassLoader().getResource(
	"org/apache/uima/ruta/engine/HtmlConverter.xml");
	}

	XMLInputSource inA = new XMLInputSource(urlA);
	ResourceSpecifier specifierA = UIMAFramework.getXMLParser().parseResourceSpecifier(inA);
	AnalysisEngine aeA = UIMAFramework.produceAnalysisEngine(specifierA);
	aeA.setConfigParameterValue(HtmlAnnotator.PARAM_ONLY_CONTENT, false);
	aeA.reconfigure();

	XMLInputSource inC = new XMLInputSource(urlC);
	ResourceSpecifier specifierC = UIMAFramework.getXMLParser().parseResourceSpecifier(inC);
	AnalysisEngine aeC = UIMAFramework.produceAnalysisEngine(specifierC);
	aeC.setConfigParameterValue(HtmlConverter.PARAM_SKIP_WHITESPACES, false);
	aeC.setConfigParameterValue(HtmlConverter.PARAM_PROCESS_ALL, true);
	aeC.setConfigParameterValue(HtmlConverter.PARAM_EXPAND_OFFSETS, true);
	aeC.reconfigure();

	CAS cas = aeA.newCAS();
	Type tagType = cas.getTypeSystem().getType(HtmlAnnotator.NAMESPACE + "TAG");
	Feature expandedFeature = tagType.getFeatureByBaseName("expandedOffsets");
	AnnotationIndex<AnnotationFS> ai = null;
	FSIterator<AnnotationFS> iterator = null;

	cas.setDocumentText(html);
	aeA.process(cas);
	aeC.process(cas);

	CAS plainTextCas = cas.getView(HtmlConverter.DEFAULT_MODIFIED_VIEW);

	assertEquals("Some contentMore content.", plainTextCas.getDocumentText());

	ai = plainTextCas.getAnnotationIndex(tagType);
	iterator = ai.iterator();
	assertEquals(4, ai.size());
	AnnotationFS next = null;
	next = iterator.next();
	assertEquals(false, next.getBooleanValue(expandedFeature));
	assertEquals("Some contentMore content.", next.getCoveredText());
	next = iterator.next();
	assertEquals(false, next.getBooleanValue(expandedFeature));
	assertEquals("Some content", next.getCoveredText());
	next = iterator.next();
	boolean b1 = next.getBooleanValue(expandedFeature);
	assertEquals("More content.", next.getCoveredText());
	next = iterator.next();
	boolean b2 = next.getBooleanValue(expandedFeature);
	assertEquals("More content.", next.getCoveredText());
	// for one of these two annotation (with same offsets) the feature must be set to true
	assertEquals(true, b1 \|\| b2);

	cas.release();
	}
	}