// blob: f671cb318bab081a16d8ad6d26b82ae547d21ef4 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.uima.cas.impl;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Deque;
import java.util.IdentityHashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.uima.UimaContext;
import org.apache.uima.UimaSerializable;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.Feature;
import org.apache.uima.cas.TypeSystem;
import org.apache.uima.internal.util.IntVector;
import org.apache.uima.internal.util.Pair;
import org.apache.uima.internal.util.StringUtils;
import org.apache.uima.jcas.cas.BooleanArray;
import org.apache.uima.jcas.cas.ByteArray;
import org.apache.uima.jcas.cas.DoubleArray;
import org.apache.uima.jcas.cas.FSArray;
import org.apache.uima.jcas.cas.FloatArray;
import org.apache.uima.jcas.cas.IntegerArray;
import org.apache.uima.jcas.cas.LongArray;
import org.apache.uima.jcas.cas.ShortArray;
import org.apache.uima.jcas.cas.Sofa;
import org.apache.uima.jcas.cas.StringArray;
import org.apache.uima.jcas.cas.TOP;
import org.apache.uima.util.XMLSerializer;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
/**
* XCAS serializer. Create a serializer from a type system, then encode individual CASes by writing
* to a SAX content handler. This class is thread safe.
*/
public class XCASSerializer {
// Child count handed to the most recent startElement call of a document serializer;
// exposed through getNumChildren().
private int numChildren;
/**
 * Returns the child count that was recorded when the most recent element was started.
 *
 * @return the last recorded number of children
 */
public int getNumChildren() {
  return this.numChildren;
}
/**
* Use an inner class to hold the data for serializing a CAS. Each call to serialize() creates its
* own instance.
*
*
*/
private class XCASDocSerializer {
// Where the output goes: the SAX content handler that receives the XCAS events.
private ContentHandler ch;
// The CAS we're serializing.
private CASImpl cas;
/** Any FS reference we've touched goes in here. The value is the number of the index repository
 * the FS was found in, or one of the markers NOT_INDEXED / MULTIPLY_INDEXED. Keys are compared
 * by identity. */
final private Map<TOP, Integer> queued = new IdentityHashMap<>();
/** Marker value in {@code queued}: the FS was reached through a reference only. */
private static final int NOT_INDEXED = -1;
/** Marker value in {@code queued}: the FS is in more than one index repository. */
private static final int MULTIPLY_INDEXED = -2;
/** Probe value passed to isQueued by enqueue when only the presence of the key matters. */
private static final int INVALID_INDEX = -3;
/** Any FS indexed in more than one IR goes in here, the value is the associated duplicate key.
 * The key is used to index into dupVectors. */
final private Map<TOP, Integer> duplicates = new IdentityHashMap<>();
/** A key identifying a particular FS indexed in multiple indexes.
 * Starts at 0, incremented by 1 for each new FS discovered to be indexed in more than one IR. */
int numDuplicates;
/** List of IntVectors holding lists of repo numbers.
 * Indexed by the key above, for FSs that are in multiple index repos. */
final List<IntVector> dupVectors = new ArrayList<>();
// The next two are parallel: entry i of indexReps is the index repo of entry i of indexedFSs.
/** List of FSs that are in an index somewhere. */
final private List<TOP> indexedFSs = new ArrayList<>();
/** Specific IndexRepository for indexed FSs. */
final private IntVector indexReps = new IntVector();
/** The current queue for FSs to write out. */
final private Deque<TOP> queue = new ArrayDeque<>();
// Shared attribute-less instance used for array element tags.
private final AttributesImpl emptyAttrs = new AttributesImpl();
// Scratch attribute list; cleared and refilled for every element that is written.
private AttributesImpl workAttrs = new AttributesImpl();
// SAX attribute type used for every attribute that is written.
private static final String cdataType = "CDATA";
// For debug statistics: incremented once per encodeFS call.
private int fsCount = 0;
// Out-Of-TypeSystem Data to be included in produced XCAS. (APL)
private OutOfTypeSystemData mOutOfTypeSystemData;
// We write to a SAXDocStack, a simplified interface to a
// ContentHandler.
private XCASDocSerializer(ContentHandler ch, CASImpl cas) {
  // No FS has been found in more than one index repository yet.
  this.numDuplicates = 0;
  this.cas = cas;
  this.ch = ch;
}
/**
* Add a feature structure to the queue.
*
* @param fs
* The feature structure.
* @return <code>false</code> iff we've seen this feature structure before.
*/
private boolean enqueue(TOP fs) {
  // INVALID_INDEX is only a probe value here: the answer tells a known FS from an unknown one.
  final int status = isQueued(fs, INVALID_INDEX);
  if (status == KEY_ONLY_MATCH) {
    return false;
  }
  final int typeCode = fs._getTypeCode();
  // Reached through a reference; whether it is also indexed is not known at this point.
  queued.put(fs, NOT_INDEXED);
  queue.push(fs);
  // Follow outgoing references so that everything reachable ends up queued as well.
  switch (classifyType(fs._getTypeImpl())) {
    case LowLevelCAS.TYPE_CLASS_FS:
      if (mOutOfTypeSystemData != null) {
        enqueueOutOfTypeSystemFeatures(fs);
      }
      enqueueFeatures(fs, typeCode);
      break;
    case LowLevelCAS.TYPE_CLASS_FSARRAY:
      enqueueFSArray((FSArray) fs);
      break;
    default:
      // Other type classes carry no FS references.
      break;
  }
  return true;
}
/**
* Same as enqueue, but for indexed FSs.
*
* @param fs
* The feature structure to enqueue.
* @param indexRep
* The number of the index repository the FS was found in.
*/
private void enqueueIndexed(TOP fs, int indexRep) {
  final int status = isQueued(fs, indexRep);

  if (status == KEY_AND_VALUE_MATCH) {
    // Already recorded for exactly this index repository.
    return;
  }

  if (status == KEY_NOT_FOUND) {
    // Most common case: first time this FS is seen at all.
    queued.put(fs, indexRep);
    indexedFSs.add(fs);
    indexReps.add(indexRep);
    return;
  }

  if (status != KEY_ONLY_MATCH) {
    return;
  }

  // The FS is known, but under a different value.
  final int previous = queued.get(fs);

  if (previous == NOT_INDEXED) {
    // Was added from a previously found reference: record the given index.
    queued.put(fs, indexRep);
    return;
  }

  if (previous == MULTIPLY_INDEXED) {
    // Already known to be in several repositories: extend its list.
    final int dupKey = duplicates.get(fs);
    dupVectors.get(dupKey).add(indexRep);
    return;
  }

  // First time we notice this FS is indexed in more than one repository.
  duplicates.put(fs, numDuplicates);
  dupVectors.add(new IntVector());
  final IntVector repos = dupVectors.get(numDuplicates);
  repos.add(previous);
  repos.add(indexRep);
  numDuplicates++;
  queued.put(fs, MULTIPLY_INDEXED);
}
// Result codes of isQueued(fs, value).
// The FS is recorded in 'queued' with exactly the probed value.
private static final int KEY_AND_VALUE_MATCH = 1;
// The FS is recorded in 'queued', but with a different value.
private static final int KEY_ONLY_MATCH = -1;
// The FS is not recorded in 'queued' at all.
private static final int KEY_NOT_FOUND = 0;
/**
* Bad name; check if we've seen this (address, value) before.
*
* @param fs
* The Feature Structure.
* @param value
* The index repository
* @return KEY_AND_VALUE_MATCH iff we've seen (address, value) before. KEY_NOT_FOUND iff the
* address has not been seen before. KEY_ONLY_MATCH iff the address has been seen before
* with a different value.
*/
private int isQueued(TOP fs, int value) {
  final Integer recorded = this.queued.get(fs);
  if (recorded == null) {
    return KEY_NOT_FOUND;
  }
  if (recorded.intValue() == value) {
    return KEY_AND_VALUE_MATCH;
  }
  return KEY_ONLY_MATCH;
}
/*
* Version of serialize which also includes OutOfTypeSystemData (obtained from previous
* deserialization) in the produced XCAS.
*
*/
private void serialize(boolean encodeDoc, OutOfTypeSystemData outOfTypeSystemData)
    throws IOException, SAXException {
  // Note: encodeDoc is accepted but not consulted anywhere in this method.
  mOutOfTypeSystemData = outOfTypeSystemData;

  // Phase 1: collect everything that has to be written.
  enqueueIndexed();
  enqueueFeaturesOfIndexed();

  int ootsCount = 0;
  if (outOfTypeSystemData != null) {
    // Give each out-of-typesystem FS a fresh id above the last id used by the CAS,
    // remembering the old-to-new mapping so references can be rewritten later.
    int nextId = cas.getLastUsedFsId() + 1;
    for (FSData ootsFs : outOfTypeSystemData.fsList) {
      final String newId = Integer.toString(nextId);
      nextId++;
      outOfTypeSystemData.idMap.put(ootsFs.id, newId);
      ootsFs.id = newId;
    }
    ootsCount = outOfTypeSystemData.fsList.size();
    enqueueOutOfTypeSystemData(outOfTypeSystemData);
  }

  // Phase 2: write the root element and its children.
  final int elementCount = ootsCount + indexedFSs.size() + queue.size();
  final AttributesImpl rootAttrs = new AttributesImpl();
  rootAttrs.addAttribute("", VERSION_ATTR, VERSION_ATTR, cdataType, CURRENT_VERSION);
  startElement(casTagName, rootAttrs, elementCount);

  encodeIndexed(); // one element per entry of indexedFSs
  encodeQueued(); // one element per entry of queue
  if (outOfTypeSystemData != null) {
    // one element per entry of outOfTypeSystemData.fsList
    serializeOutOfTypeSystemData(outOfTypeSystemData);
  }

  endElement(casTagName);
}
/**
 * Sends the given text to the content handler as character data.
 *
 * @param text
 *          the text to emit
 * @throws SAXException
 *           passed through from the content handler
 */
private void addText(String text) throws SAXException {
  final char[] chars = text.toCharArray();
  ch.characters(chars, 0, text.length());
}
/**
 * Returns the given string with every char rejected by isValidXmlChar replaced by U+FFFD. The
 * argument itself is returned when it contains no such char.
 *
 * @param aString
 *          the string to check
 * @return the argument, or a cleaned copy of it
 */
private String replaceInvalidXmlChars(String aString) {
  // Look for the first offending char; most strings have none and are returned untouched.
  final int length = aString.length();
  int firstBad = 0;
  while (firstBad < length && isValidXmlChar(aString.charAt(firstBad))) {
    firstBad++;
  }
  if (firstBad == length) {
    return aString;
  }

  // Everything before firstBad is already known to be valid.
  final char[] cleaned = aString.toCharArray();
  for (int pos = firstBad; pos < cleaned.length; pos++) {
    if (!isValidXmlChar(cleaned[pos])) {
      // Unicode replacement character
      cleaned[pos] = 0xFFFD;
    }
  }
  return new String(cleaned);
}
/**
 * Tests whether a single char is accepted for output: tab, line feed, carriage return, or any
 * char from 0x20 up to and including 0xFFFD.
 *
 * @param c
 *          the char to test
 * @return true if the char is accepted
 */
private boolean isValidXmlChar(char c) {
  if (c == 0x09 || c == 0x0A || c == 0x0D) {
    return true;
  }
  return c >= 0x20 && c < 0xFFFE;
}
/**
 * Appends one CDATA attribute with an empty namespace URI to the given attribute list.
 *
 * @param attrs
 *          the attribute list to extend
 * @param attrName
 *          used as both local and qualified name
 * @param attrValue
 *          the attribute value
 */
private void addAttribute(AttributesImpl attrs, String attrName, String attrValue) {
  // Only the "sofaString" attribute is scrubbed of chars that are not valid in XML.
  final boolean isSofaString = CAS.FEATURE_BASE_NAME_SOFASTRING.equals(attrName);
  final String valueToWrite = isSofaString ? replaceInvalidXmlChars(attrValue) : attrValue;
  attrs.addAttribute("", attrName, attrName, cdataType, valueToWrite);
}
/**
 * Records the announced child count and opens an element on the content handler.
 *
 * @param tag
 *          used as both local and qualified name
 * @param attrs
 *          the attributes of the element
 * @param num
 *          number of children the caller is about to write; stored in numChildren
 * @throws SAXException
 *           passed through from the content handler
 */
private void startElement(String tag, Attributes attrs, int num) throws SAXException {
  // Store the count before the handler is called.
  numChildren = num;
  // Saxon requirement? Can't set just one of localName & qName to "".
  final String localName = tag;
  final String qName = tag;
  ch.startElement("", localName, qName, attrs);
}
/**
 * Closes an element on the content handler.
 *
 * @param tag
 *          used as both local and qualified name, mirroring startElement
 * @throws SAXException
 *           passed through from the content handler
 */
private void endElement(String tag) throws SAXException {
  // Pass the tag as local name AND qualified name, exactly as startElement does: some
  // handlers (see the Saxon note on startElement) reject events where only one of them is "".
  ch.endElement("", tag, tag);
}
/*
* Encode the indexed FS in the queue.
*/
private void encodeIndexed() throws IOException, SAXException {
  final int count = indexedFSs.size();
  for (int pos = 0; pos < count; pos++) {
    final TOP fs = indexedFSs.get(pos);
    final int state = queued.get(fs);
    final IntVector repos;
    if (state == MULTIPLY_INDEXED) {
      // All repositories of this FS were collected in its duplicate vector.
      final int dupKey = duplicates.get(fs);
      repos = dupVectors.get(dupKey);
    } else {
      // Exactly one repository: the one recorded in parallel to the FS.
      repos = new IntVector(1);
      repos.add(indexReps.get(pos));
    }
    encodeFS(fs, repos);
  }
}
/**
* Push the indexed FSs onto the queue.
*/
private void enqueueIndexed() {
  // Sofas come from the base index repository and are registered under repository number 0.
  final Collection<Sofa> sofas = cas.getBaseIndexRepositoryImpl().<Sofa>getIndexedFSs(Sofa.class);
  final int sofaCount = sofas.size();
  if (sofaCount > 0) {
    final Sofa[] orderedSofas = sofas.toArray(new Sofa[sofaCount]);
    // XCAS requires sofas in order of id
    Arrays.sort(orderedSofas, (left, right) -> Integer.compare(left._id, right._id));
    enqueueArray(orderedSofas, 0);
  }

  // Then the indexed FSs of every view, registered under the view's number (1-based).
  final int numViews = cas.getViewCount();
  for (int sofaNum = 1; sofaNum <= numViews; sofaNum++) {
    final FSIndexRepositoryImpl viewIR =
        (FSIndexRepositoryImpl) cas.getBaseCAS().getSofaIndexRepository(sofaNum);
    if (viewIR == null) {
      continue;
    }
    final Collection<TOP> fssInView = viewIR.getIndexedFSs();
    if (fssInView.isEmpty()) {
      continue;
    }
    enqueueCollection(fssInView, sofaNum);
  }
}
/**
 * Registers every FS of the array as indexed in the given repository.
 *
 * @param fss
 *          the feature structures of one view (including view 0, the base view)
 * @param sofaNum
 *          the repository number to record for them
 */
private void enqueueArray(TOP[] fss, int sofaNum) {
  for (int i = 0; i < fss.length; i++) {
    enqueueIndexed(fss[i], sofaNum);
  }
}
/**
 * Registers every FS of the collection as indexed in the given repository.
 *
 * @param fss
 *          the feature structures of one view
 * @param sofaNum
 *          the repository number to record for them
 */
private void enqueueCollection(Collection<TOP> fss, int sofaNum) {
  final Iterator<TOP> each = fss.iterator();
  while (each.hasNext()) {
    enqueueIndexed(each.next(), sofaNum);
  }
}
/**
 * Walks the indexed FSs and enqueues everything they reference: reference-valued features
 * (including out-of-typesystem ones, when present) and FS array elements.
 */
private void enqueueFeaturesOfIndexed() {
  // The bound is taken once, up front, as in an index-based scan of a fixed prefix.
  final int count = indexedFSs.size();
  for (int pos = 0; pos < count; pos++) {
    final TOP fs = indexedFSs.get(pos);
    final int typeCode = fs._getTypeCode();
    switch (classifyType(fs._getTypeImpl())) {
      case LowLevelCAS.TYPE_CLASS_FS:
        if (mOutOfTypeSystemData != null) {
          enqueueOutOfTypeSystemFeatures(fs);
        }
        enqueueFeatures(fs, typeCode);
        break;
      case LowLevelCAS.TYPE_CLASS_FSARRAY:
        enqueueFSArray((FSArray) fs);
        break;
      default:
        // Other type classes carry no FS references.
        break;
    }
  }
}
/*
* Encode all other enqueued (non-indexed) FSs.
*
*/
private void encodeQueued() throws IOException, SAXException {
  // Iterates head to tail; enqueue() pushes at the head, so the newest entry is written first.
  final Iterator<TOP> pending = queue.iterator();
  while (pending.hasNext()) {
    // null: these FSs carry no index information
    encodeFS(pending.next(), null);
  }
}
/**
* Encode an individual FS.
*
* @param fs
* The feature structure to be encoded.
* @param indexRep
* The index repository numbers the FS is indexed in, or null if it is not indexed.
* @throws IOException passthru
* @throws SAXException passthru
*/
private void encodeFS(TOP fs, IntVector indexRep) throws IOException, SAXException {
  ++fsCount;
  workAttrs.clear();

  // Index information: the repository numbers, separated by single blanks.
  if (indexRep != null) {
    final StringBuilder repoList = new StringBuilder();
    repoList.append(indexRep.get(0));
    for (int k = 1; k < indexRep.size(); k++) {
      repoList.append(' ').append(indexRep.get(k));
    }
    addAttribute(workAttrs, INDEXED_ATTR_NAME, repoList.toString());
  }

  // The id is written for every FS; otherwise a full traversal would be needed to find out
  // which FSs are actually referenced.
  addAttribute(workAttrs, ID_ATTR_NAME, Integer.toString(fs._id));

  final int typeClass = classifyType(fs._getTypeImpl());
  final String typeName = getTypeName(fs);

  // Regular FS: all data goes into attributes of an empty element.
  if (typeClass == LowLevelCAS.TYPE_CLASS_FS) {
    encodeFeatures(fs, workAttrs);
    if (mOutOfTypeSystemData != null) {
      encodeOutOfTypeSystemFeatures(fs, workAttrs); // APL
    }
    final String elementName = getXCasElementName(typeName);
    startElement(elementName, workAttrs, 0);
    endElement(elementName);
    return;
  }

  // FS array: elements are references and have their own encoder.
  if (typeClass == LowLevelCAS.TYPE_CLASS_FSARRAY) {
    encodeFSArray((FSArray) fs, workAttrs);
    return;
  }

  // Remaining array kinds: turn the content into strings and share one encoder.
  String[] values = null;
  switch (typeClass) {
    case LowLevelCAS.TYPE_CLASS_INTARRAY:
      values = ((IntegerArray) fs).toStringArray();
      break;
    case LowLevelCAS.TYPE_CLASS_FLOATARRAY:
      values = ((FloatArray) fs).toStringArray();
      break;
    case LowLevelCAS.TYPE_CLASS_STRINGARRAY:
      values = ((StringArray) fs).toArray();
      break;
    case LowLevelCAS.TYPE_CLASS_BOOLEANARRAY:
      values = ((BooleanArray) fs).toStringArray();
      break;
    case LowLevelCAS.TYPE_CLASS_BYTEARRAY:
      values = ((ByteArray) fs).toStringArray();
      break;
    case LowLevelCAS.TYPE_CLASS_SHORTARRAY:
      values = ((ShortArray) fs).toStringArray();
      break;
    case LowLevelCAS.TYPE_CLASS_LONGARRAY:
      values = ((LongArray) fs).toStringArray();
      break;
    case LowLevelCAS.TYPE_CLASS_DOUBLEARRAY:
      values = ((DoubleArray) fs).toStringArray();
      break;
    default:
      // Internal error.
      throw new RuntimeException("Internal error: classifying FS type.");
  }
  encodePrimitiveTypeArrayFS(values, typeName, workAttrs);
}
/**
 * Writes an array whose content has already been converted to strings: one element named after
 * the type, carrying a size attribute, with one child element per array entry.
 *
 * @param data
 *          the array content; a null entry is written as empty text
 * @param typeName
 *          the element name to use
 * @param attrs
 *          attributes collected so far; the size attribute is appended to them
 * @throws SAXException
 *           passed through from the content handler
 */
private void encodePrimitiveTypeArrayFS(String[] data, String typeName, AttributesImpl attrs)
    throws SAXException {
  final int length = data.length;
  addAttribute(attrs, ARRAY_SIZE_ATTR, Integer.toString(length));
  startElement(typeName, attrs, length);
  for (String item : data) {
    startElement(ARRAY_ELEMENT_TAG, emptyAttrs, 1);
    if (item == null) {
      addText("");
    } else {
      addText(item);
    }
    endElement(ARRAY_ELEMENT_TAG);
  }
  endElement(typeName);
}
/**
 * Writes an FS array: one element carrying a size attribute, with one child element per array
 * entry. A child holds the id of the referenced FS as text, or is empty when there is none.
 *
 * @param fs
 *          the array to write
 * @param attrs
 *          attributes collected so far; the size attribute is appended to them
 * @throws SAXException
 *           passed through from the content handler
 */
private void encodeFSArray(FSArray fs, AttributesImpl attrs) throws SAXException {
  final String declaredName = fs._getTypeImpl().getName();
  final int size = fs.size();
  addAttribute(attrs, ARRAY_SIZE_ATTR, Integer.toString(size));

  // Any type name carrying the array suffix is written under the generic FS array name.
  final String typeName = declaredName.endsWith(TypeSystemImpl.ARRAY_TYPE_SUFFIX)
      ? CASImpl.TYPE_NAME_FS_ARRAY
      : declaredName;

  startElement(typeName, attrs, size);
  for (int i = 0; i < size; i++) {
    final TOP element = fs.get(i);
    String val = null;
    if (element != null) {
      val = Integer.toString(element._id);
    } else if (mOutOfTypeSystemData != null) {
      // This array element may have been a reference to an OOTS FS.
      final List<ArrayElement> ootsElems = mOutOfTypeSystemData.arrayElements.get(fs);
      if (ootsElems != null) {
        // TODO: iteration could be slow for large arrays
        for (ArrayElement ootsElem : ootsElems) {
          if (ootsElem.index == i) {
            val = mOutOfTypeSystemData.idMap.get(ootsElem.value);
            break;
          }
        }
      }
    }

    final boolean hasText = val != null;
    startElement(ARRAY_ELEMENT_TAG, emptyAttrs, hasText ? 1 : 0);
    if (hasText) {
      addText(val);
    }
    endElement(ARRAY_ELEMENT_TAG);
  }
  endElement(typeName);
}
/**
 * Enqueues every non-null element of an FS array.
 *
 * @param fs
 *          the array whose elements are to be enqueued
 */
private void enqueueFSArray(FSArray fs) {
  final TOP[] elements = fs._getTheArray();
  for (int i = 0; i < elements.length; i++) {
    final TOP element = elements[i];
    if (element == null) {
      continue;
    }
    enqueue(element);
  }
}
/*
* Encode features of a regular (non-array) FS.
*/
private void encodeFeatures(TOP fs, AttributesImpl attrs) {
  for (FeatureImpl feat : fs._getTypeImpl().getFeatureImpls()) {
    final String text;
    if (feat.getRangeImpl().isRefType) {
      // Reference: written as the id of the target, or skipped when unset.
      final TOP target = fs.getFeatureValue(feat);
      text = (target == null) ? null : Integer.toString(target._id);
    } else {
      text = fs.getFeatureValueAsString(feat);
    }
    if (text == null) {
      continue;
    }
    // Attribute names come from the cache built in the constructor (with _ref_ prefixes).
    addAttribute(attrs, featureNames[feat.getCode()], text);
  }
}
/**
 * Enqueues every FS that the given FS references through one of its features.
 *
 * @param fs
 *          the FS whose references are followed
 * @param heapValue
 *          not used by this method
 */
private void enqueueFeatures(TOP fs, int heapValue) {
  final TypeImpl type = fs._getTypeImpl();
  // Give self-serializing FSs the chance to store their state in features first.
  if (fs instanceof UimaSerializable) {
    final UimaSerializable serializable = (UimaSerializable) fs;
    serializable._save_to_cas_data();
  }
  for (FeatureImpl feat : type.getFeatureImpls()) {
    if (!feat.getRangeImpl().isRefType) {
      continue;
    }
    final TOP target = fs.getFeatureValue(feat);
    if (target != null) {
      enqueue(target);
    }
  }
}
/*
* Encode Out-Of-TypeSystem Features.
*/
private void encodeOutOfTypeSystemFeatures(TOP fs, AttributesImpl attrs) {
  List<Pair<String, Object>> attrList = mOutOfTypeSystemData.extraFeatureValues.get(fs);
  if (attrList == null) {
    return;
  }
  for (Pair<String, Object> p : attrList) {
    String sv;
    if (p.u instanceof String) {
      sv = (String) p.u;
    } else if (p.u instanceof TOP) {
      // Reference to an in-typesystem FS (such targets are enqueued by
      // enqueueOutOfTypeSystemFeatures): write its id, as serializeOutOfTypeSystemData does,
      // instead of dropping the reference.
      sv = Integer.toString(((TOP) p.u)._id);
    } else {
      sv = "";
    }
    // remap ID if necessary
    if (p.t.startsWith(REF_PREFIX)) {
      if (sv.startsWith("a")) { // reference to OOTS FS
        // - remap; the new id is also written back into the pair
        p.u = sv = mOutOfTypeSystemData.idMap.get(sv);
      }
    }
    addAttribute(attrs, p.t, sv);
  }
}
/*
* Enqueue the in-typesystem FSs referenced by Out-Of-TypeSystem features.
*/
private void enqueueOutOfTypeSystemFeatures(TOP fs) {
  List<Pair<String, Object>> attrList = mOutOfTypeSystemData.extraFeatureValues.get(fs);
  if (attrList == null) {
    return;
  }
  for (Pair<String, Object> p : attrList) {
    // Only reference-valued features (name starts with the _ref_ prefix) are of interest.
    if (!p.t.startsWith(REF_PREFIX)) {
      continue;
    }
    // A value held as a TOP is a reference to an in-typesystem FS, which must be enqueued.
    // Other values need no action here.
    if (p.u instanceof TOP) {
      enqueue((TOP) p.u);
    }
  }
}
/**
 * Returns the name of the type of the given FS.
 *
 * @param fs
 *          the feature structure
 * @return the name reported by the FS's type
 */
private final String getTypeName(TOP fs) {
return fs.getType().getName();
}
/**
* classify the type, without distinguishing list types
* @param ti the type
* @return the classification
*/
private final int classifyType(TypeImpl ti) {
  // Pure delegation to the type system's classification.
  final int typeClass = TypeSystemImpl.getTypeClass(ti);
  return typeClass;
}
/*
* Produces XCAS from Out-Of-Typesystem data. (APL)
*/
private void enqueueOutOfTypeSystemData(OutOfTypeSystemData aData) {
  for (FSData ootsFs : aData.fsList) {
    for (Entry<String, Object> featVal : ootsFs.featVals.entrySet()) {
      // Only reference-valued features (name starts with the _ref_ prefix) are of interest.
      if (!featVal.getKey().startsWith(REF_PREFIX)) {
        continue;
      }
      // A value held as a TOP is a reference to an in-typesystem FS, which must be enqueued.
      final Object target = featVal.getValue();
      if (target instanceof TOP) {
        enqueue((TOP) target);
      }
    }
  }
}
/**
 * Writes one empty element per out-of-typesystem FS, with all of its data as attributes.
 *
 * @param aData
 *          the out-of-typesystem data whose FS list is written
 * @throws SAXException
 *           passed through from the content handler
 */
private void serializeOutOfTypeSystemData(OutOfTypeSystemData aData) throws SAXException {
  for (FSData ootsFs : aData.fsList) {
    workAttrs.clear();

    // Index information, if any.
    if (ootsFs.indexRep != null) {
      addAttribute(workAttrs, INDEXED_ATTR_NAME, ootsFs.indexRep);
    }
    // The id of the FS as currently stored on it.
    addAttribute(workAttrs, ID_ATTR_NAME, ootsFs.id);

    // All other attributes.
    for (Entry<String, Object> featVal : ootsFs.featVals.entrySet()) {
      final String name = featVal.getKey();
      Object value = featVal.getValue();

      // An "a" prefix marks a reference from one OOTS FS to another OOTS FS; those ids
      // have to be remapped to the ids actually used in the XCAS.
      final boolean ootsReference = name.startsWith(REF_PREFIX)
          && value instanceof String
          && ((String) value).startsWith("a");
      if (ootsReference) {
        value = mOutOfTypeSystemData.idMap.get(value);
      }

      final String text;
      if (value instanceof TOP) {
        // reference to an in-typesystem FS: write its id
        text = Integer.toString(((TOP) value)._id);
      } else {
        text = (String) value;
      }
      addAttribute(workAttrs, name, text);
    }

    // send events
    final String elementName = getXCasElementName(ootsFs.type);
    startElement(elementName, workAttrs, 0);
    endElement(elementName);
  }
}
}
/**
* Gets the XCAS element name for a CAS type name. The element name is usually the same as the
* type name, but the characters : and - are translated to the sequences _colon_ and _dash_,
* respectively.
*
* @param aTagName
* CAS type name
* @return XCAS element name for this type name
*/
private String getXCasElementName(String aTagName) {
  // String.replace substitutes every literal occurrence and hands back the receiver itself
  // when nothing matches, so no separate fast path is needed for names without ':' or '-'.
  return aTagName.replace(":", "_colon_").replace("-", "_dash_");
}
/** Tag name of the root element. */
public static final String casTagName = "CAS";
/** Name of the version attribute written on the root element. */
public static final String VERSION_ATTR = "version";
/** Value written for the version attribute. */
public static final String CURRENT_VERSION = "2";
/** Default for the document type name, see {@link #setDocumentTypeName(String)}. */
public static final String DEFAULT_DOC_TYPE_NAME = "uima.tcas.Document";
/** Default for the document text feature, see {@link #setDocumentTextFeature(String)}. */
public static final String DEFAULT_DOC_TEXT_FEAT = "text";
/** Name of the attribute listing the index repositories an FS is indexed in. */
public static final String INDEXED_ATTR_NAME = "_indexed";
/** Prefix of attribute names whose value is a reference to another FS. */
public static final String REF_PREFIX = "_ref_";
/** Name of the attribute holding the id of an FS. */
public static final String ID_ATTR_NAME = "_id";
/** Not referenced within this class. */
public static final String CONTENT_ATTR_NAME = "_content";
/** Name of the attribute holding the length of an array. */
public static final String ARRAY_SIZE_ATTR = "size";
/** Tag name of the element written for each array entry. */
public static final String ARRAY_ELEMENT_TAG = "i";
/** Not referenced within this class. */
public static final String TRUE_VALUE = "true";
// The type system this serializer was created for.
private TypeSystemImpl ts;
// Create own cache of feature names because of _ref_ prefixes. Indexed by feature code.
private String[] featureNames;
// name of tag to contain document text
private String docTypeName = DEFAULT_DOC_TYPE_NAME;
// value of _content attribute for document text element
private String docTextFeature = DEFAULT_DOC_TEXT_FEAT;
/**
 * Creates a serializer for the given type system and builds the cache of attribute names, in
 * which the names of non-primitive features carry the {@link #REF_PREFIX}.
 *
 * @param ts
 *          the type system; must be a TypeSystemImpl
 * @param uimaContext
 *          not used by this constructor
 */
public XCASSerializer(TypeSystem ts, UimaContext uimaContext) {
  this.ts = (TypeSystemImpl) ts;
  // One slot more than the number of features; the cache is indexed by feature code.
  this.featureNames = new String[this.ts.getNumberOfFeatures() + 1];
  for (Iterator<Feature> features = this.ts.getFeatures(); features.hasNext();) {
    final FeatureImpl feature = (FeatureImpl) features.next();
    final boolean primitive = feature.getRange().isPrimitive();
    final String shortName = feature.getShortName();
    this.featureNames[feature.getCode()] = primitive ? shortName : REF_PREFIX + shortName;
  }
}
/**
 * Creates a serializer for the given type system; passes null as the UIMA context.
 *
 * @param ts
 *          the type system
 */
public XCASSerializer(TypeSystem ts) {
this(ts, null);
}
/**
* Write the CAS data to a SAX content handler.
*
* @param cas
* The CAS to be serialized.
* @param contentHandler
* The SAX content handler the data is written to.
* @throws IOException passed thru
* @throws SAXException passed thru
*/
public void serialize(CAS cas, ContentHandler contentHandler) throws IOException, SAXException {
// Delegates with encodeDoc set to true.
serialize(cas, contentHandler, true);
}
/**
* Write the CAS data to a SAX content handler.
*
* @param cas
* The CAS to be serialized.
* @param contentHandler
* The SAX content handler the data is written to.
* @param encodeDoc
* If set to false, no uima.tcas.Document structure will be created, and the document
* text will not be serialized.
* @throws IOException passed thru
* @throws SAXException passed thru
*/
public void serialize(CAS cas, ContentHandler contentHandler, boolean encodeDoc)
throws IOException, SAXException {
// Delegates without any out-of-typesystem data.
serialize(cas, contentHandler, encodeDoc, null);
}
/**
* Write the CAS data to a SAX content handler.
*
* @param cas
* The CAS to be serialized.
* @param contentHandler
* The SAX content handler the data is written to.
* @param encodeDoc
* If set to false, no uima.tcas.Document structure will be created, and the document
* text will not be serialized.
* @param outOfTypeSystemData
* data not part of the CAS type system, which should be inserted into the XCAS output
*
* @throws IOException passed thru
* @throws SAXException passed thru
*/
public void serialize(CAS cas, ContentHandler contentHandler, boolean encodeDoc,
    OutOfTypeSystemData outOfTypeSystemData) throws IOException, SAXException {
  contentHandler.startDocument();
  // Every call works on its own document serializer, created for the base CAS.
  final CASImpl baseCas = ((CASImpl) cas).getBaseCAS();
  final XCASDocSerializer docSerializer = new XCASDocSerializer(contentHandler, baseCas);
  docSerializer.serialize(encodeDoc, outOfTypeSystemData);
  contentHandler.endDocument();
}
/**
* Gets the name of the type representing the document. This will become the name of the XML
* element that will hold the document text.
*
* @return the document type name
*/
public String getDocumentTypeName() {
  // DEFAULT_DOC_TYPE_NAME unless changed through setDocumentTypeName
  return this.docTypeName;
}
/**
* Sets the name of the type representing the document. This will become the name of the XML
* element that will hold the document text. If not set, defaults to
* {@link #DEFAULT_DOC_TYPE_NAME}.
*
* @param aDocTypeName
* the document type name
*/
public void setDocumentTypeName(String aDocTypeName) {
  // Stored as given; no validation is performed.
  this.docTypeName = aDocTypeName;
}
/**
* Gets the name of the feature holding the document text. This will become the value of the
* _content attribute on the document element.
*
* @return the document text feature
*/
public String getDocumentTextFeature() {
  // DEFAULT_DOC_TEXT_FEAT unless changed through setDocumentTextFeature
  return this.docTextFeature;
}
/**
* Sets the name of the feature holding the document text. This will become the value of the
* _content attribute on the document element. If not set, defaults to
* {@link #DEFAULT_DOC_TEXT_FEAT}. If set to null, no _content attribute will be emitted.
*
* @param aDocTextFeature
* the document text feature
*/
public void setDocumentTextFeature(String aDocTextFeature) {
  // Stored as given; null is accepted.
  this.docTextFeature = aDocTextFeature;
}
/**
* Serializes an XCAS to a stream.
*
* @param aCAS
* CAS to serialize.
* @param aStream
* output stream to which to write the XCAS XML document
*
* @throws SAXException
* if a problem occurs during XCAS serialization
* @throws IOException
* if an I/O failure occurs
*/
public static void serialize(CAS aCAS, OutputStream aStream) throws SAXException, IOException {
  // Unformatted output is the default.
  final boolean formatted = false;
  serialize(aCAS, aStream, formatted);
}
/**
* Serializes an XCAS to a stream.
*
* @param aCAS
* CAS to serialize.
* @param aStream
* output stream to which to write the XCAS XML document
* @param isFormattedOutput
* if true the XCAS will be serialized formatted
*
* @throws SAXException
* if a problem occurs during XCAS serialization
* @throws IOException
* if an I/O failure occurs
*/
public static void serialize(CAS aCAS, OutputStream aStream, boolean isFormattedOutput)
    throws SAXException, IOException {
  // A fresh serializer is built for the type system of the CAS on every call.
  final XCASSerializer serializer = new XCASSerializer(aCAS.getTypeSystem());
  // The XMLSerializer turns the SAX events into XML written to the stream.
  final XMLSerializer xmlWriter = new XMLSerializer(aStream, isFormattedOutput);
  final ContentHandler handler = xmlWriter.getContentHandler();
  serializer.serialize(aCAS, handler);
}
}