blob: be377a94ba413943417fa938f52c44c4af5eb55c [file] [log] [blame]
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.uima.cas_data.impl;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.impl.XCASParsingException;
import org.apache.uima.cas.impl.XCASSerializer;
import org.apache.uima.cas_data.CasData;
import org.apache.uima.cas_data.PrimitiveArrayFS;
import org.apache.uima.cas_data.ReferenceArrayFS;
import org.apache.uima.internal.util.StringUtils;
import org.xml.sax.Attributes;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import org.xml.sax.helpers.DefaultHandler;
/**
 * A SAX ContentHandler that reads XCAS and creates a CasData.
 */
public class XCasToCasDataSaxHandler extends DefaultHandler {
// ///////////////////////////////////////////////////////////////////////
// Internal states for the parser. The current one is kept in the "state"
// field below; the SAX callbacks in this class branch on it.
// Expect the start of the XML document.
private static final int DOC_STATE = 0;
// At the top level. Expect a FS.
private static final int FS_STATE = 1;
// Inside a FS. Expect features, or the end of the FS.
private static final int FEAT_STATE = 2;
// Inside FS. We have seen a _content attribute, and expect text.
private static final int CONTENT_STATE = 3;
// Inside a feature element. We expect the feature value.
private static final int FEAT_CONTENT_STATE = 4;
// Inside an array element. Expect array element value.
private static final int ARRAY_ELE_CONTENT_STATE = 5;
// Inside an array FS. Expect an array element, or the end of the FS.
private static final int ARRAY_ELE_STATE = 6;
// End parser states.
// ///////////////////////////////////////////////////////////////////////
// Attribute names that start with this prefix are checked against the
// reserved XCAS attribute names before being treated as plain features.
private static final String reservedAttrPrefix = "_";
// For error message printing, if the Locator object can't provide source
// of XML input.
private static final String unknownXMLSource = "<unknown>";
// Content feature name that startElement installs before each top-level FS
// is read; a _content attribute on the FS replaces it.
private static final String DEFAULT_CONTENT_FEATURE = "value";
// private long time;
// SAX locator. Used for error message generation.
private Locator locator;
// The CasData we're filling.
private CasData cas;
// What we expect next (one of the *_STATE constants above).
private int state;
// StringBuffer to accumulate text.
private StringBuffer buffer;
// Most recently created FS. Needed for array elements
// and embedded feature values.
private FeatureStructureImpl currentFS;
// The name of the content feature, if we've seen one.
private String currentContentFeat;
// The current position when parsing array elements.
private int arrayPos;
// The type of the array we're currently reading. Needed for proper
// treatment of array element values.
private int arrayType;
/**
 * Creates a new XCasToCasDataSaxHandler with an empty text buffer.
 *
 * @param aCasData
 *          the CasData to which FeatureStructures parsed from XCAS will be appended
 */
public XCasToCasDataSaxHandler(CasData aCasData) {
  this.buffer = new StringBuffer();
  this.cas = aCasData;
}
private final void resetBuffer() {
// this.buffer.delete(0, this.buffer.length());
this.buffer = new StringBuffer();
* (non-Javadoc)
* @see org.xml.sax.ContentHandler#startDocument()
public void startDocument() throws SAXException {
// Do setup work in the constructor.
this.state = DOC_STATE;
// System.out.println("Starting to read document.");
// time = System.currentTimeMillis();
/*
 * (non-Javadoc)
 *
 * @see org.xml.sax.ContentHandler#startElement(java.lang.String, java.lang.String,
 *      java.lang.String, org.xml.sax.Attributes)
 */
// Handles an opening tag by branching on the current parser state.
// NOTE(review): this method appears to have lost lines -- the closing braces, the
// break statements between cases, and the case label that should guard the
// readArrayElement call are not visible. Restore them from the upstream file
// before relying on this text.
public void startElement(String nameSpaceURI, String localName, String qualifiedName,
Attributes attrs) throws SAXException {
switch (state) {
case DOC_STATE: {
// The first element of the document must be the XCAS root tag.
if (!qualifiedName.equals(XCASSerializer.casTagName)) {
throw createException(XCASParsingException.WRONG_ROOT_TAG, qualifiedName);
// Root tag accepted: feature structures are expected next.
this.state = FS_STATE;
case FS_STATE: {
// Every FS starts out with the default content feature name; readFS replaces it
// when the element carries a _content attribute.
this.currentContentFeat = DEFAULT_CONTENT_FEATURE;
readFS(qualifiedName, attrs);
// NOTE(review): presumably this call belongs to an ARRAY_ELE_STATE case whose label
// is missing -- confirm.
readArrayElement(qualifiedName, attrs);
default: {
// If we're not in an element expecting state, raise an error.
throw createException(XCASParsingException.TEXT_EXPECTED, qualifiedName);
// Get ready to read an array element: the tag must be the array element tag and
// must not carry any attributes, otherwise a parsing exception is thrown.
// NOTE(review): the closing braces are absent and no state change is visible in
// this block -- lines may be missing; confirm against the upstream file.
private void readArrayElement(String eleName, Attributes attrs) throws SAXParseException {
if (!eleName.equals(XCASSerializer.ARRAY_ELEMENT_TAG)) {
throw createException(XCASParsingException.ARRAY_ELE_EXPECTED, eleName);
if (attrs.getLength() > 0) {
throw createException(XCASParsingException.ARRAY_ELE_ATTRS);
// Create a new FS for an element. Array type names are handed to readArray; any
// other name produces a fresh FeatureStructureImpl that is filled from the
// element's attributes.
// NOTE(review): the closing braces are absent, and getCasTypeName is never called
// in the visible code -- lines may be missing here; confirm against the upstream file.
private void readFS(String qualifiedName, Attributes attrs) throws SAXParseException {
if (isArrayType(qualifiedName)) {
readArray(qualifiedName, attrs);
} else {
this.currentFS = new FeatureStructureImpl();
readFS(this.currentFS, attrs);
* Gets the CAS type name corresponding to an XCAS tag name. The type name is usually equal to the
* tag name, but the characters : and - are translated into the sequences _colon_ and _dash_,
* respectively.
* @param aTagName
* XCAS tag name
* @return CAS type name corresponding to this tag
private String getCasTypeName(String aTagName) {
return StringUtils.replaceAll(StringUtils.replaceAll(aTagName, ":", "_colon_"), "-", "_dash_");
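/**
 * Reads the attributes of a feature structure element into the given FS.
 *
 * @param fsImpl
 *          the feature structure being populated
 * @param attrs
 *          the XML attributes of the FS element
 * @throws SAXParseException
 *           if processing an attribute fails
 */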
// NOTE(review): this method appears to have lost lines (closing braces and the
// bodies of some branches). The comments below describe only what is visible.
private void readFS(FeatureStructureImpl fsImpl, Attributes attrs) throws SAXParseException {
String attrName, attrValue;
for (int i = 0; i < attrs.getLength(); i++) {
attrName = attrs.getQName(i);
attrValue = attrs.getValue(i);
// Names with the reserved prefix are compared against the reserved XCAS attribute
// names; everything else goes to handleFeature.
if (attrName.startsWith(reservedAttrPrefix)) {
if (attrName.equals(XCASSerializer.ID_ATTR_NAME)) {
// NOTE(review): empty branch in the visible code -- the statement that records the
// id on fsImpl is presumably missing; confirm.
} else if (attrName.equals(XCASSerializer.CONTENT_ATTR_NAME)) {
// The attribute value names the feature that will receive the element's text.
this.currentContentFeat = attrValue;
} else if (attrName.equals(XCASSerializer.INDEXED_ATTR_NAME)) {
if (attrValue.equals(XCASSerializer.TRUE_VALUE)) {
fsImpl.setIndexed(new int[] { 1 }); // Backwards compatible CAS, has one default text
// Sofa
} else if (!attrValue.equals("false")) {
// NOTE(review): empty branch in the visible code; readArray handles the same case
// with parseIntArray(attrVal), so a similar statement is presumably missing -- confirm.
} else {
handleFeature(fsImpl, attrName, attrValue);
} else {
handleFeature(fsImpl, attrName, attrValue);
// Set the state; we're either expecting features, or _content.
// APL - 6/28/04 - even if _content attr is not specified, we can still have content, which
// would
// be assigned to the "value" feature, as per XCAS spec. FEAT_STATE did not really seem to be
// working, anyway.
this.state = CONTENT_STATE;
// if (this.state != CONTENT_STATE)
// {
// this.state = FEAT_STATE;
// }
* Parse a space-separated string into an integer array.
private int[] parseIntArray(String val) {
String[] strVals;
val = val.trim();
if ("".equals(val)) {
strVals = new String[0];
} else {
strVals = val.split("\\s+");
int[] intVals = new int[strVals.length];
for (int i = 0; i < strVals.length; i++) {
intVals[i] = Integer.parseInt(strVals[i]);
return intVals;
// Create a new array FS from an array element's attributes and switch the parser
// to expecting array elements.
// NOTE(review): closing braces are absent, and the values collected in "id" and
// "indexed" are never applied to the new FS in the visible code -- lines appear to
// be missing; confirm against the upstream file.
private void readArray(String type, Attributes attrs) throws SAXParseException {
String attrName, attrVal;
int[] indexed = new int[0];
int size = 0;
String id = null;
// Only the id, size and indexed attributes are accepted on an array element.
for (int i = 0; i < attrs.getLength(); i++) {
attrName = attrs.getQName(i);
attrVal = attrs.getValue(i);
if (attrName.equals(XCASSerializer.ID_ATTR_NAME)) {
id = attrVal;
} else if (attrName.equals(XCASSerializer.ARRAY_SIZE_ATTR)) {
try {
size = Integer.parseInt(attrVal);
if (size < 0) {
throw createException(XCASParsingException.ILLEGAL_ARRAY_SIZE, attrVal);
} catch (NumberFormatException e) {
throw createException(XCASParsingException.INTEGER_EXPECTED, attrVal);
} else if (attrName.equals(XCASSerializer.INDEXED_ATTR_NAME)) {
if (attrVal.equals(XCASSerializer.TRUE_VALUE)) {
indexed = new int[] { 1 }; // Backwards compatible CAS, has one default text Sofa
} else if (!attrVal.equals("false")) {
// Any value other than the true value or "false" is parsed as a list of integers.
indexed = parseIntArray(attrVal);
} else {
throw createException(XCASParsingException.ILLEGAL_ARRAY_ATTR, attrName);
// Hang on to those for setting array values.
this.arrayPos = 0;
// Pick the backing array implementation by type name; anything that is not an
// integer, float or string array is stored as an array of references.
if (CAS.TYPE_NAME_INTEGER_ARRAY.equals(type)) {
this.currentFS = new PrimitiveArrayFSImpl(new int[size]);
this.arrayType = INT_TYPE;
} else if (CAS.TYPE_NAME_FLOAT_ARRAY.equals(type)) {
this.currentFS = new PrimitiveArrayFSImpl(new float[size]);
this.arrayType = FLOAT_TYPE;
} else if (CAS.TYPE_NAME_STRING_ARRAY.equals(type)) {
this.currentFS = new PrimitiveArrayFSImpl(new String[size]);
this.arrayType = STRING_TYPE;
} else {
this.currentFS = new ReferenceArrayFSImpl(new String[size]);
this.arrayType = FS_TYPE;
// The declared size is also stored on the FS as a feature value.
this.currentFS.setFeatureValue(XCASSerializer.ARRAY_SIZE_ATTR, new PrimitiveValueImpl(size));
this.state = ARRAY_ELE_STATE;
// The definition of a null value. Any other value must be in the expected
// format.
private final boolean emptyVal(String val) {
return ((val == null) || (val.length() == 0));
// Create a feature value from a string representation.
private void handleFeature(FeatureStructureImpl fsImpl, String featName, String featVal)
throws SAXParseException {
if (featName.startsWith(XCASSerializer.REF_PREFIX)) {
String realFeatName = featName.substring(XCASSerializer.REF_PREFIX.length());
fsImpl.setFeatureValue(realFeatName, new ReferenceValueImpl(featVal));
} else {
fsImpl.setFeatureValue(featName, new PrimitiveValueImpl(featVal));
* (non-Javadoc)
* @see org.xml.sax.ContentHandler#characters(char[], int, int)
public void characters(char[] chars, int start, int length) throws SAXException {
if ((this.state == CONTENT_STATE)
|| (this.state == ARRAY_ELE_CONTENT_STATE) || (this.state == FEAT_CONTENT_STATE)) {
// When we're in a text expecting state, add the characters to the
// text buffer. Else, do nothing.
buffer.append(chars, start, length);
/*
 * (non-Javadoc)
 *
 * @see org.xml.sax.ContentHandler#endElement(java.lang.String, java.lang.String,
 *      java.lang.String)
 */
// Handles a closing tag by branching on the current parser state.
// NOTE(review): this method has clearly lost lines -- several case labels, break
// statements, closing braces and at least one call (nothing here invokes
// addArrayElement) are not visible. The comments below only describe what each
// surviving statement does; restore the method from the upstream file.
public void endElement(String nsURI, String localName, String qualifiedName) throws SAXException {
switch (this.state) {
case DOC_STATE: {
// Do nothing.
case FS_STATE: {
this.state = DOC_STATE;
case FEAT_STATE: {
this.state = FS_STATE;
// NOTE(review): presumably the start of a CONTENT_STATE case -- label missing; confirm.
if (!isAllWhitespace(buffer)) {
// Set the value of the content feature.
handleFeature(this.currentFS, currentContentFeat, buffer.toString());
this.state = FS_STATE;
// NOTE(review): presumably a FEAT_CONTENT_STATE case -- label missing; confirm.
// Create a feature value from an element.
handleFeature(this.currentFS, qualifiedName, buffer.toString());
this.state = FEAT_STATE;
// NOTE(review): presumably an ARRAY_ELE_CONTENT_STATE case -- label and the call
// that stores the element are missing; confirm.
// Create an array value.
this.state = ARRAY_ELE_STATE;
// NOTE(review): presumably an ARRAY_ELE_STATE case -- label missing; confirm.
this.state = FS_STATE;
boolean isAllWhitespace(StringBuffer b) {
final int len = b.length();
for (int i = 0; i < len; i++) {
if (!Character.isWhitespace(b.charAt(i))) {
return false;
return true;
// Stores one array element, given as text, at position arrayPos of the array FS
// currently being read, converting it according to arrayType.
// NOTE(review): this method has lost lines -- closing braces and break statements
// are absent, the float assignment is cut off after "Float", the label for the
// string case is not visible, and nothing visible advances arrayPos. Restore the
// method from the upstream file.
private void addArrayElement(String content) throws SAXParseException {
switch (arrayType) {
case INT_TYPE: {
// Empty content leaves the slot untouched.
if (!emptyVal(content)) {
try {
((PrimitiveArrayFS) this.currentFS).toIntArray()[arrayPos] = Integer.parseInt(content);
} catch (NumberFormatException e) {
throw createException(XCASParsingException.INTEGER_EXPECTED, content);
case FLOAT_TYPE: {
if (!emptyVal(content)) {
try {
((PrimitiveArrayFS) this.currentFS).toFloatArray()[arrayPos] = Float
} catch (NumberFormatException e) {
throw createException(XCASParsingException.FLOAT_EXPECTED, content);
// NOTE(review): presumably the STRING_TYPE case -- label missing; confirm.
((PrimitiveArrayFS) this.currentFS).toStringArray()[arrayPos] = content;
case FS_TYPE: {
if (!emptyVal(content)) {
((ReferenceArrayFS) this.currentFS).getIdRefArray()[arrayPos] = content;
* (non-Javadoc)
* @see org.xml.sax.ContentHandler#endDocument()
public void endDocument() throws SAXException {
// nothing to do
// Builds an XCASParsingException for the given error code and works out where in
// the input the parser currently is (source, line, column), falling back to
// "<unknown>" for anything the locator cannot supply.
// NOTE(review): closing braces are absent, and source/line/col are computed but
// never attached to the exception in the visible code -- lines appear to be
// missing; confirm against the upstream file.
private XCASParsingException createException(int code) {
XCASParsingException e = new XCASParsingException(code);
String source = unknownXMLSource;
String line = unknownXMLSource;
String col = unknownXMLSource;
if (locator != null) {
// Prefer the system id, then the public id, then the placeholder.
source = locator.getSystemId();
if (source == null) {
source = locator.getPublicId();
if (source == null) {
source = unknownXMLSource;
line = Integer.toString(locator.getLineNumber());
col = Integer.toString(locator.getColumnNumber());
return e;
// Variant of createException(int) that also accepts a message argument.
// NOTE(review): "arg" is not used in the visible code and the closing brace is
// absent -- a line attaching the argument to the exception is presumably missing; confirm.
private XCASParsingException createException(int code, String arg) {
XCASParsingException e = createException(code);
return e;
* (non-Javadoc)
* @see org.xml.sax.ErrorHandler#error(org.xml.sax.SAXParseException)
public void error(SAXParseException e) throws SAXException {
throw e;
* (non-Javadoc)
* @see org.xml.sax.ErrorHandler#fatalError(org.xml.sax.SAXParseException)
public void fatalError(SAXParseException e) throws SAXException {
throw e;
* (non-Javadoc)
* @see org.xml.sax.ContentHandler#ignorableWhitespace(char[], int, int)
public void ignorableWhitespace(char[] arg0, int arg1, int arg2) throws SAXException {
// Since we're not validating, we don't need to do anything; this won't
// be called.
* (non-Javadoc)
* @see org.xml.sax.ContentHandler#setDocumentLocator(org.xml.sax.Locator)
public void setDocumentLocator(Locator loc) {
// System.out.println("Setting document locator.");
this.locator = loc;
* (non-Javadoc)
* @see org.xml.sax.ErrorHandler#warning(org.xml.sax.SAXParseException)
public void warning(SAXParseException e) throws SAXException {
throw e;
// Kinds of array being read. readArray stores one of these in arrayType, and
// addArrayElement switches on it to decide how an element's text is converted.
// Array of int values.
private static final int INT_TYPE = 0;
// Array of float values.
private static final int FLOAT_TYPE = 1;
// Array of String values.
private static final int STRING_TYPE = 2;
// Array of references to other feature structures (used for every other array type).
private static final int FS_TYPE = 3;
private boolean isArrayType(String typeName) {
return CAS.TYPE_NAME_INTEGER_ARRAY.equals(typeName)
|| CAS.TYPE_NAME_FLOAT_ARRAY.equals(typeName)
|| CAS.TYPE_NAME_STRING_ARRAY.equals(typeName)
|| CAS.TYPE_NAME_FS_ARRAY.equals(typeName);