blob: 70093112dd4da1cceea94dbdbe7277673580dbb0 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.uima.cas_data.impl;
import static org.assertj.core.api.Assertions.assertThat;
import static org.junit.Assert.assertTrue;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.io.StringWriter;
import java.util.Iterator;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.apache.uima.UIMAFramework;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASRuntimeException;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.TypeSystem;
import org.apache.uima.cas.impl.XCASDeserializer;
import org.apache.uima.cas.impl.XCASSerializer;
import org.apache.uima.cas_data.CasData;
import org.apache.uima.cas_data.FeatureStructure;
import org.apache.uima.cas_data.FeatureValue;
import org.apache.uima.cas_data.PrimitiveValue;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.metadata.FsIndexDescription;
import org.apache.uima.resource.metadata.TypeSystemDescription;
import org.apache.uima.resource.metadata.impl.TypePriorities_impl;
import org.apache.uima.test.junit_extension.JUnitExtension;
import org.apache.uima.util.CasCreationUtils;
import org.apache.uima.util.Level;
import org.apache.uima.util.XMLInputSource;
import org.apache.uima.util.XMLSerializer;
import org.junit.jupiter.api.Test;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
/**
* Tests XCasToCasDataSaxHandler. Also Tests CasDataToXCas.
*
*/
public class XCasToCasDataSaxHandlerTest {
@Test
public void testParse() throws Exception {
try {
CasData casData = new CasDataImpl();
XCasToCasDataSaxHandler handler = new XCasToCasDataSaxHandler(casData);
SAXParserFactory fact = SAXParserFactory.newInstance();
SAXParser parser = fact.newSAXParser();
XMLReader xmlReader = parser.getXMLReader();
xmlReader.setContentHandler(handler);
xmlReader.parse(new InputSource(getClass().getResourceAsStream("xcastest.xml")));
// System.out.println(casData);
Iterator<FeatureStructure> fsIter = casData.getFeatureStructures();
boolean foundCrawlUrl = false;
while (fsIter.hasNext()) {
FeatureStructure fs = fsIter.next();
if ("Crawl_colon_URL".equals(fs.getType())) {
// System.out.println("[" + fs.getFeatureValue("value") + "]");
assertThat(fs.getFeatureValue("value").toString()).isEqualTo(
"http://www.nolimitmedia.com/index.php?act=group&gro=1&gron=Flash&PHPSESSID=5dcc31fb425c4a204b70d9eab92531a5");
foundCrawlUrl = true;
}
}
assertTrue(foundCrawlUrl);
} catch (Exception e) {
JUnitExtension.handleException(e);
}
}
@Test
public void testConversions() throws Exception {
try {
// complex CAS obtained by deserialization
File typeSystemFile = JUnitExtension.getFile("ExampleCas/testTypeSystem.xml");
TypeSystemDescription typeSystem = UIMAFramework.getXMLParser()
.parseTypeSystemDescription(new XMLInputSource(typeSystemFile));
CAS cas = CasCreationUtils.createCas(typeSystem, new TypePriorities_impl(),
new FsIndexDescription[0]);
InputStream serCasStream = new FileInputStream(JUnitExtension.getFile("ExampleCas/cas.xml"));
XCASDeserializer deser = new XCASDeserializer(cas.getTypeSystem());
ContentHandler deserHandler = deser.getXCASHandler(cas);
SAXParserFactory fact = SAXParserFactory.newInstance();
SAXParser parser = fact.newSAXParser();
XMLReader xmlReader = parser.getXMLReader();
xmlReader.setContentHandler(deserHandler);
xmlReader.parse(new InputSource(serCasStream));
serCasStream.close();
_testConversions(cas);
// a CAS with multiple Sofas
InputStream translatorAeStream = new FileInputStream(
JUnitExtension.getFile("CpeSofaTest/TransAnnotator.xml"));
AnalysisEngineDescription translatorAeDesc = UIMAFramework.getXMLParser()
.parseAnalysisEngineDescription(new XMLInputSource(translatorAeStream, null));
AnalysisEngine transAnnotator = UIMAFramework.produceAnalysisEngine(translatorAeDesc);
CAS cas2 = transAnnotator.newCAS();
CAS englishView = cas2.createView("EnglishDocument");
englishView.setSofaDataString("this beer is good", "text/plain");
transAnnotator.process(cas2);
_testConversions(cas2);
} catch (Exception e) {
JUnitExtension.handleException(e);
}
}
private void _testConversions(CAS aCAS) throws IOException, ParserConfigurationException,
SAXException, ResourceInitializationException, CASRuntimeException {
// generate XCAS events and pipe them to XCasToCasDataSaxHandler
CasData casData = new CasDataImpl();
XCasToCasDataSaxHandler handler = new XCasToCasDataSaxHandler(casData);
XCASSerializer xcasSer = new XCASSerializer(aCAS.getTypeSystem());
xcasSer.serialize(aCAS, handler);
assertThat(casData).isNotNull();
assertValidCasData(casData, aCAS.getTypeSystem());
// System.out.println(casData);
// now generate XCAS from the CasData
CasDataToXCas generator = new CasDataToXCas();
StringWriter sw = new StringWriter();
XMLSerializer xmlSer = new XMLSerializer(sw, false);
generator.setContentHandler(xmlSer.getContentHandler());
generator.generateXCas(casData);
String xml = sw.getBuffer().toString();
// workaround for XML serializatioj problem on Sun Java 1.4
if (!builtInXmlSerializationSupportsCRs()) {
xml = xml.replaceAll("&#10;", "&#13;&#10;");
}
UIMAFramework.getLogger(XCasToCasDataSaxHandlerTest.class).log(Level.FINE, xml);
// deserialize back into CAS for comparison
// CASMgr tcasMgr = CASFactory.createCAS(aCAS.getTypeSystem());
// tcasMgr.initCASIndexes();
// tcasMgr.getIndexRepositoryMgr().commit();
CAS cas2 = CasCreationUtils.createCas(null, aCAS.getTypeSystem(), null);
XCASDeserializer deser = new XCASDeserializer(cas2.getTypeSystem());
ContentHandler deserHandler = deser.getXCASHandler(cas2);
SAXParserFactory fact = SAXParserFactory.newInstance();
SAXParser parser = fact.newSAXParser();
XMLReader xmlReader = parser.getXMLReader();
xmlReader.setContentHandler(deserHandler);
xmlReader.parse(new InputSource(new StringReader(xml)));
// CASes should be identical
CasComparer.assertEquals(aCAS, cas2);
}
/**
* @param casData
* @param system
*/
private void assertValidCasData(CasData casData, TypeSystem typeSystem) {
Type annotType = typeSystem.getType(CAS.TYPE_NAME_ANNOTATION);
Type arrayType = typeSystem.getType(CAS.TYPE_NAME_ARRAY_BASE);
Iterator<FeatureStructure> fsIter = casData.getFeatureStructures();
while (fsIter.hasNext()) {
org.apache.uima.cas_data.FeatureStructure fs = fsIter.next();
String typeName = fs.getType();
// don't do tests on the "fake" document text FS
if (XCASSerializer.DEFAULT_DOC_TYPE_NAME.equals(typeName)) {
continue;
}
Type type = typeSystem.getType(typeName);
assertThat(type).isNotNull();
if (typeSystem.subsumes(annotType, type)) {
// annotation type - check for presence of begin/end
FeatureValue beginVal = fs.getFeatureValue("begin");
assertThat(beginVal instanceof PrimitiveValue).isTrue();
assertThat(((PrimitiveValue) beginVal).toInt() >= 0).isTrue();
FeatureValue endVal = fs.getFeatureValue("end");
assertThat(endVal instanceof PrimitiveValue).isTrue();
assertThat(((PrimitiveValue) endVal).toInt() >= 0).isTrue();
}
}
}
/**
* Checks the Java vendor and version and returns true if running a version of Java whose built-in
* XSLT support can properly serialize carriage return characters, and false if not. It seems to
* be the case that Sun JVMs prior to 1.5 do not properly serialize carriage return characters. We
* have to modify our test case to account for this.
*
* @return true if XML serialization of CRs behave properly in the current JRE
*/
private boolean builtInXmlSerializationSupportsCRs() {
String javaVendor = System.getProperty("java.vendor");
if (javaVendor.startsWith("Sun")) {
String javaVersion = System.getProperty("java.version");
if (javaVersion.startsWith("1.3") || javaVersion.startsWith("1.4")) {
return false;
}
}
return true;
}
}