blob: 8f8c50253a52b634595f75579893b2be833e77c3 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.uima.util;
import java.io.ByteArrayOutputStream;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
import org.apache.uima.UIMARuntimeException;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.cas.FSMatchConstraint;
import org.apache.uima.cas.Feature;
import org.apache.uima.cas.FeatureStructure;
import org.apache.uima.cas.impl.BooleanArrayFSImpl;
import org.apache.uima.cas.impl.ByteArrayFSImpl;
import org.apache.uima.cas.impl.DoubleArrayFSImpl;
import org.apache.uima.cas.impl.FloatArrayFSImpl;
import org.apache.uima.cas.impl.IntArrayFSImpl;
import org.apache.uima.cas.impl.LongArrayFSImpl;
import org.apache.uima.cas.impl.ShortArrayFSImpl;
import org.apache.uima.cas.impl.StringArrayFSImpl;
import org.apache.uima.cas.text.AnnotationFS;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
/**
 * Generates an inline XML representation of a CAS. The document text becomes the character
 * content of an artificial {@code Document} root element, each annotation becomes an element
 * named after its type, and each feature becomes an attribute of that element.
 * <p>
 * Primitive features are written as their string value and arrays of primitives as
 * {@code [v1,v2,...]}. Features whose values are other FeatureStructures are not expanded: only
 * the short type name of the value is written, followed by the covered text in brackets when the
 * value is an annotation. String values longer than 64 characters are truncated and suffixed with
 * {@code "..."}.
 * <p>
 * Annotations that partially overlap (cross) the currently open annotation cannot be expressed as
 * well-formed nested XML and are silently left out of the output.
 */
public class CasToInlineXml {

  /**
   * Charset used to decode the serialized bytes back into a String. No other encoding is
   * configured on the XMLSerializer in this class, so it is expected to emit the XML default,
   * UTF-8.
   */
  private static final String OUTPUT_ENCODING = "UTF-8";

  /** Longest feature / covered-text value written to an attribute before it is truncated. */
  private static final int MAX_ATTR_VALUE_LENGTH = 64;

  /**
   * This destroy method does nothing.
   * 
   * @see org.apache.uima.resource.Resource#destroy()
   */
  public void destroy() {
  }

  /**
   * Formats a CAS as a String. Equivalent to {@link #generateXML(CAS)}.
   * 
   * @param aCAS
   *          CAS to format
   * @return the inline XML document
   * @throws CASException
   *           declared by {@link #generateXML(CAS, FSMatchConstraint)}
   */
  public String format(CAS aCAS) throws CASException {
    return generateXML(aCAS, null);
  }

  /**
   * Formats a CAS as a String. Only FeatureStructures matching the given filter will be output.
   * Equivalent to {@link #generateXML(CAS, FSMatchConstraint)}.
   * 
   * @param aCAS
   *          CAS to format
   * @param aFilter
   *          constraint selecting the annotations to include; null includes all annotations
   * @return the inline XML document
   * @throws CASException
   *           declared by {@link #generateXML(CAS, FSMatchConstraint)}
   */
  public String format(CAS aCAS, FSMatchConstraint aFilter) throws CASException {
    return generateXML(aCAS, aFilter);
  }

  /**
   * Generates inline XML from a CAS, including all annotations.
   * 
   * @param aCAS
   *          CAS to generate from
   * @return the inline XML document
   * @throws CASException
   *           declared by {@link #generateXML(CAS, FSMatchConstraint)}
   */
  public String generateXML(CAS aCAS) throws CASException {
    return generateXML(aCAS, null);
  }

  /**
   * Generates inline XML from a CAS.
   * 
   * @param aCAS
   *          CAS to generate from. A CAS without document text is treated as having an empty
   *          document.
   * @param aFilter
   *          constraint that determines which annotations are included in the output. If null (or
   *          omitted), all annotations are included.
   * @return the inline XML document
   * @throws CASException
   *           declared for callers of this API. NOTE(review): no call in this method visibly
   *           raises it; confirm against the CAS interface before removing.
   * @throws UIMARuntimeException
   *           if the underlying SAX serializer fails or the output bytes cannot be decoded
   */
  public String generateXML(CAS aCAS, FSMatchConstraint aFilter) throws CASException {
    ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
    XMLSerializer sax2xml = new XMLSerializer(byteArrayOutputStream);

    // get document text; a CAS with no text set is handled as an empty document instead of
    // failing with a NullPointerException
    String docText = aCAS.getDocumentText();
    char[] docCharArray = (docText == null) ? new char[0] : docText.toCharArray();
    replaceInvalidXmlChars(docCharArray);

    // get iterator over annotations sorted by increasing start position and
    // decreasing end position
    FSIterator<AnnotationFS> iterator = aCAS.getAnnotationIndex().iterator();

    // filter the iterator if desired
    if (aFilter != null) {
      iterator = aCAS.createFilteredIterator(iterator, aFilter);
    }

    // This is basically a recursive algorithm that has had the recursion
    // removed through the use of an explicit stack. We iterate over the
    // annotations, and if an annotation contains other annotations, we
    // push the parent annotation on the stack, process the children, and
    // then come back to the parent later.
    // An ArrayList is used (rather than ArrayDeque) because the bottom entry of the stack is the
    // null placeholder for the Document element, and ArrayDeque rejects null elements.
    List<AnnotationFS> stack = new ArrayList<AnnotationFS>();
    int pos = 0;

    try {
      ContentHandler handler = sax2xml.getContentHandler();
      handler.startDocument();
      // write an artificial start tag
      handler.startElement("", "Document", "Document", new AttributesImpl());
      // null is used as a placeholder for this artificial Document annotation
      AnnotationFS curAnnot = null;

      while (iterator.isValid()) {
        AnnotationFS nextAnnot = iterator.get();
        if (curAnnot == null || nextAnnot.getBegin() < curAnnot.getEnd()) {
          // nextAnnot's start point is within the span of curAnnot
          if (curAnnot == null || nextAnnot.getEnd() <= curAnnot.getEnd()) {
            // nextAnnot is contained within curAnnot (annotations that cross curAnnot's end
            // fail this check and are skipped)
            try {
              // write text between current pos and beginning of nextAnnot
              handler.characters(docCharArray, pos, nextAnnot.getBegin() - pos);
              pos = nextAnnot.getBegin();
              String typeName = nextAnnot.getType().getName();
              handler.startElement("", typeName, typeName,
                      getFeatureAttributes(nextAnnot, aCAS));
              // push parent annotation on stack
              stack.add(curAnnot);
              // descend into nextAnnot
              curAnnot = nextAnnot;
            } catch (IndexOutOfBoundsException e) {
              // best effort: report the bad annotation and carry on with the next one
              reportInvalidRange(nextAnnot, docCharArray.length);
            }
          }
          iterator.moveToNext();
        } else {
          // nextAnnot begins after curAnnot ends: finish curAnnot and return to its parent.
          // The iterator is deliberately not advanced so nextAnnot is re-examined against the
          // parent on the next pass.
          pos = closeAnnotation(handler, docCharArray, pos, curAnnot);
          curAnnot = stack.remove(stack.size() - 1);
        }
      }

      // finished writing all start tags, now close everything that is still open. Every non-null
      // curAnnot was preceded by a push, so the stack is non-empty whenever curAnnot != null and
      // unwinding stops at the null Document placeholder.
      while (curAnnot != null) {
        pos = closeAnnotation(handler, docCharArray, pos, curAnnot);
        curAnnot = stack.remove(stack.size() - 1);
      }

      // write any document text that follows the last annotation
      if (pos < docCharArray.length) {
        handler.characters(docCharArray, pos, docCharArray.length - pos);
      }
      handler.endElement("", "Document", "Document");
      handler.endDocument();

      // return XML string, decoded with an explicit charset rather than the platform default
      return byteArrayOutputStream.toString(OUTPUT_ENCODING);
    } catch (SAXException e) {
      throw new UIMARuntimeException(e);
    } catch (UnsupportedEncodingException e) {
      throw new UIMARuntimeException(e);
    }
  }

  /**
   * Writes the document text from the current position up to the end of the given annotation and
   * then the annotation's end tag.
   * 
   * @param aHandler
   *          SAX handler receiving the output
   * @param aDocChars
   *          sanitized document text
   * @param aPos
   *          offset of the first character not yet written
   * @param aAnnot
   *          annotation to close
   * @return the new write position: the annotation's end offset, or {@code aPos} unchanged if the
   *         text could not be written because the range was invalid
   * @throws SAXException
   *           if the handler fails
   */
  private static int closeAnnotation(ContentHandler aHandler, char[] aDocChars, int aPos,
          AnnotationFS aAnnot) throws SAXException {
    int newPos = aPos;
    try {
      aHandler.characters(aDocChars, aPos, aAnnot.getEnd() - aPos);
      newPos = aAnnot.getEnd();
    } catch (IndexOutOfBoundsException e) {
      reportInvalidRange(aAnnot, aDocChars.length);
    }
    // the end tag is written even when the text was not, so the output stays well-formed
    String typeName = aAnnot.getType().getName();
    aHandler.endElement("", typeName, typeName);
    return newPos;
  }

  /**
   * Reports on standard error an annotation whose offsets do not fit the document.
   * 
   * @param aAnnot
   *          offending annotation
   * @param aDocLength
   *          length of the document text in chars
   */
  private static void reportInvalidRange(AnnotationFS aAnnot, int aDocLength) {
    System.err.println("Invalid annotation range: " + aAnnot.getBegin() + "," + aAnnot.getEnd()
            + " in document of length " + aDocLength);
  }

  /**
   * Builds the XML attributes for one feature structure: one attribute per feature of its type,
   * named by the feature's short name.
   * 
   * @param aFS
   *          feature structure whose features are rendered
   * @param aCAS
   *          not used by this method
   * @return the attributes, one per feature; missing values are rendered as the text
   *         {@code "null"}
   */
  private final Attributes getFeatureAttributes(FeatureStructure aFS, CAS aCAS) {
    AttributesImpl attrs = new AttributesImpl();
    for (Feature feat : aFS.getType().getFeatures()) {
      String featName = feat.getShortName();
      // how we get the feature value depends on the feature's range type
      String attrVal;
      if (feat.getRange().isPrimitive()) {
        String str = aFS.getFeatureValueAsString(feat);
        attrVal = (str == null) ? "null" : truncate(str);
      } else if (feat.getRange().isArray() && feat.getRange().getComponentType().isPrimitive()) {
        attrVal = formatArray(getPrimitiveArrayValues(aFS, feat));
      } else {
        attrVal = describeFeatureStructure(aFS.getFeatureValue(feat));
      }
      attrs.addAttribute("", featName, featName, "CDATA", attrVal);
    }
    return attrs;
  }

  /**
   * Reads the value of a feature whose range is an array of primitives and converts it to strings.
   * 
   * @param aFS
   *          feature structure holding the feature
   * @param aFeat
   *          array-valued feature to read
   * @return the array elements as strings, or null if the feature is unset or its range is not one
   *         of the built-in primitive array types handled here
   */
  private static String[] getPrimitiveArrayValues(FeatureStructure aFS, Feature aFeat) {
    // TODO: there should be a better way to get any array value as a string array
    FeatureStructure value = aFS.getFeatureValue(aFeat);
    if (value == null) {
      return null;
    }
    String rangeTypeName = aFeat.getRange().getName();
    if (CAS.TYPE_NAME_STRING_ARRAY.equals(rangeTypeName)) {
      return ((StringArrayFSImpl) value).toArray();
    } else if (CAS.TYPE_NAME_INTEGER_ARRAY.equals(rangeTypeName)) {
      return ((IntArrayFSImpl) value).toStringArray();
    } else if (CAS.TYPE_NAME_FLOAT_ARRAY.equals(rangeTypeName)) {
      return ((FloatArrayFSImpl) value).toStringArray();
    } else if (CAS.TYPE_NAME_BOOLEAN_ARRAY.equals(rangeTypeName)) {
      return ((BooleanArrayFSImpl) value).toStringArray();
    } else if (CAS.TYPE_NAME_BYTE_ARRAY.equals(rangeTypeName)) {
      return ((ByteArrayFSImpl) value).toStringArray();
    } else if (CAS.TYPE_NAME_SHORT_ARRAY.equals(rangeTypeName)) {
      return ((ShortArrayFSImpl) value).toStringArray();
    } else if (CAS.TYPE_NAME_LONG_ARRAY.equals(rangeTypeName)) {
      return ((LongArrayFSImpl) value).toStringArray();
    } else if (CAS.TYPE_NAME_DOUBLE_ARRAY.equals(rangeTypeName)) {
      return ((DoubleArrayFSImpl) value).toStringArray();
    }
    return null;
  }

  /**
   * Renders array elements as {@code [v1,v2,...]}.
   * 
   * @param aVals
   *          elements to render, may be null
   * @return the bracketed, comma-separated list; {@code "[]"} for an empty array; the text
   *         {@code "null"} for a null array
   */
  private static String formatArray(String[] aVals) {
    if (aVals == null) {
      return "null";
    }
    StringBuilder buf = new StringBuilder();
    buf.append('[');
    for (int i = 0; i < aVals.length; i++) {
      if (i > 0) {
        buf.append(',');
      }
      buf.append(aVals[i]);
    }
    buf.append(']');
    return buf.toString();
  }

  /**
   * Renders a FeatureStructure-valued feature: the short type name of the value, followed by the
   * (possibly truncated) covered text in brackets if the value is an annotation.
   * 
   * @param aFsVal
   *          feature value, may be null
   * @return the rendered value, or the text {@code "null"} if the value is null
   */
  private static String describeFeatureStructure(FeatureStructure aFsVal) {
    if (aFsVal == null) {
      return "null";
    }
    StringBuilder buf = new StringBuilder();
    buf.append(aFsVal.getType().getShortName());
    if (aFsVal instanceof AnnotationFS) {
      String covered = ((AnnotationFS) aFsVal).getCoveredText();
      buf.append(" [");
      // covered text can be absent (e.g. no document text); render it like other missing values
      buf.append(covered == null ? "null" : truncate(covered));
      buf.append(']');
    }
    return buf.toString();
  }

  /**
   * Shortens a value for use in an attribute.
   * 
   * @param aValue
   *          non-null value
   * @return the value unchanged if it has at most 64 chars, otherwise its first 64 chars followed
   *         by {@code "..."}
   */
  private static String truncate(String aValue) {
    if (aValue.length() > MAX_ATTR_VALUE_LENGTH) {
      return aValue.substring(0, MAX_ATTR_VALUE_LENGTH) + "...";
    }
    return aValue;
  }

  /**
   * Replaces, in place, every char that is not allowed in XML 1.0 content with a space, so that
   * character offsets of annotations remain valid.
   * <p>
   * Every UTF-16 surrogate code unit (0xD800-0xDFFF) is replaced, including both halves of a
   * well-formed surrogate pair, so supplementary characters turn into two spaces.
   * NOTE(review): confirm whether that is intended or whether valid pairs should be preserved.
   * 
   * @param aChars
   *          characters to sanitize; modified in place
   */
  private void replaceInvalidXmlChars(char[] aChars) {
    for (int i = 0; i < aChars.length; i++) {
      if (isInvalidXmlChar(aChars[i])) {
        aChars[i] = ' ';
      }
    }
  }

  /**
   * Tests a single UTF-16 code unit against the set of chars replaced by
   * {@link #replaceInvalidXmlChars(char[])}.
   * 
   * @param aChar
   *          code unit to test
   * @return true for control characters below 0x20 other than tab, line feed and carriage return,
   *         for any surrogate code unit, and for 0xFFFE and 0xFFFF
   */
  private static boolean isInvalidXmlChar(char aChar) {
    if (aChar < 0x20) {
      return aChar != 0x09 && aChar != 0x0A && aChar != 0x0D;
    }
    return (aChar > 0xD7FF && aChar < 0xE000) || aChar == 0xFFFE || aChar == 0xFFFF;
  }
}