blob: cd85cf21adb9f034d14095eedddc0adc4cd9c837 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.uima.cas_data.impl;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import org.apache.uima.cas_data.CasData;
import org.apache.uima.cas_data.FeatureStructure;
import org.apache.uima.cas_data.FeatureValue;
import org.apache.uima.cas_data.PrimitiveArrayFS;
import org.apache.uima.cas_data.PrimitiveValue;
import org.apache.uima.cas_data.ReferenceArrayFS;
import org.apache.uima.cas_data.ReferenceValue;
import org.apache.uima.internal.util.StringUtils;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
/**
* Takes a CasData and generates XCAS SAX events.
*
*
*/
public class CasDataToXCas {
private String mDocumentTextTypeName = "uima.cpm.DocumentText";
private String mDocumentTextFeatureName = "value";
private String mXCasDocTextTag = "uima.cpm.DocumentText";
private boolean mIncludeAnnotationSpannedText = false;
private List mTypesToFilter = null;
/**
* Gets the name of the CASData FeatureStructure Type that stores the document text.
*
* @return the document text type name
*/
public String getDocumentTextTypeName() {
return mDocumentTextTypeName;
}
/**
* Sets the name of the CASData FeatureStructure Type that stores the document text.
*
* @parma aDocumentTextTypeName the document text type name
*/
public void setDocumentTextTypeName(String aDocumentTextTypeName) {
mDocumentTextTypeName = aDocumentTextTypeName;
}
/**
* Gets the name of the CASData Feature that stores the document text.
*
* @return the document text feature name
*/
public String getDocumentTextFeatureName() {
return mDocumentTextFeatureName;
}
/**
* Sets the name of the CASData Feature that stores the document text.
*
* @param aDocumentTextFeatureName
* the document text feature name
*/
public void setDocumentTextFeatureName(String aDocumentTextFeatureName) {
mDocumentTextFeatureName = aDocumentTextFeatureName;
}
/**
* Sets the name of the XCAS tag that will contain the document text.
*
* @param aXCasDocTextTag
* the document text tag
*/
public void setXCasDocumentTextTagName(String aXCasDocTextTag) {
mXCasDocTextTag = aXCasDocTextTag;
}
/**
* @param aIncludeAnnotationSpannedText
*/
public void setIncludeAnnotationSpannedText(boolean aIncludeAnnotationSpannedText) {
mIncludeAnnotationSpannedText = aIncludeAnnotationSpannedText;
}
/**
* Specifies names of types that will not be included in the XCAS
*
* @param aTypesToFilter
*/
public void setTypesToFilter(String[] aTypesToFilter) {
mTypesToFilter = Arrays.asList(aTypesToFilter);
}
/**
* Sets the ContentHandler to receive the SAX events.
*
* @param aHandler
*/
public void setContentHandler(ContentHandler aHandler) {
mHandler = aHandler;
}
/**
* Generates XCAS for a CasData. SAX events representing the XCAS will be sent to the
* ContentHandler registered via {@link #setContentHandler(ContentHandler)}.
*
* @param aCasData
* the CasData from which XCAS will be generated
*
* @throws SAXException
* if the ContentHandler throws a SAX Exception
*/
public void generateXCas(CasData aCasData) throws SAXException {
generateXCas(aCasData, null, true);
}
/**
* Special form of {@link #generateXCas(CasData)} that allows a UEID (Universal Entity ID) element
* to be added as the first element in the XCAS.
*
* @param aCasData
* the CasData from which XCAS will be generated
* @param aUEID
* the UEID to add to the XCAS
*
* @throws SAXException
* if the ContentHandler throws a SAX Exception
*/
public void generateXCas(CasData aCasData, String aUEID) throws SAXException {
generateXCas(aCasData, aUEID, true);
}
/**
* Special form of {@link #generateXCas(CasData)} that allows a UEID (Universal Entity ID) element
* to be added as the first element in the XCAS and also allows start/end document SAX calls to be
* supressed.
*
* @param aCasData
* the CasData from which XCAS will be generated
* @param aUEID
* the UEID to add to the XCAS
* @param aSendStartAndEndDocEvents
* true to send SAX events for start and end of document, false to supress them.
*
* @throws SAXException
* if the ContentHandler throws a SAX Exception
*/
public void generateXCas(CasData aCasData, String aUEID, boolean aSendStartAndEndDocEvents)
throws SAXException {
if (aSendStartAndEndDocEvents) {
mHandler.startDocument();
}
DocTextHolder docTextHolder = new DocTextHolder();
// start enclosing CAS tag
mHandler.startElement("", "CAS", "CAS", new AttributesImpl());
// add UEID if specified
if (aUEID != null) {
mHandler.startElement("", "UEID", "UEID", new AttributesImpl());
mHandler.characters(aUEID.toCharArray(), 0, aUEID.length());
mHandler.endElement("", "UEID", "UEID");
}
// iterate over FSs and generate XCAS
Iterator iter = aCasData.getFeatureStructures();
while (iter.hasNext()) {
FeatureStructure fs = (FeatureStructure) iter.next();
if (mTypesToFilter == null || !mTypesToFilter.contains(fs.getType())) {
_generate(fs, docTextHolder);
}
}
// end enclosing CAS tag
mHandler.endElement("", "CAS", "CAS");
if (aSendStartAndEndDocEvents) {
mHandler.endDocument();
}
}
private void _generate(FeatureStructure aFS, DocTextHolder aDocTextHolder) throws SAXException {
// document text is special case
if (aFS.getType().equals(this.getDocumentTextTypeName())) {
_generateDocFS(aFS, aDocTextHolder);
} else {
// generate attributes for features (except "value" feature, which is represented in element
// text)
AttributesImpl attrs = new AttributesImpl();
String contentValue = null;
if (aFS.getId() != null) {
attrs.addAttribute("", "_id", "_id", "CDATA", aFS.getId());
}
int[] indexed = aFS.getIndexed();
if (indexed.length > 0) {
StringBuffer indexedStr = new StringBuffer();
indexedStr.append(indexed[0]);
for (int i = 1; i < indexed.length; i++) {
indexedStr.append(' ').append(indexed[i]);
}
attrs.addAttribute("", "_indexed", "_indexed", "CDATA", indexedStr.toString());
}
String[] features = aFS.getFeatureNames();
for (int i = 0; i < features.length; i++) {
FeatureValue featVal = aFS.getFeatureValue(features[i]);
if (featVal instanceof PrimitiveValue) {
if (!"value".equals(features[i])) {
attrs.addAttribute("", features[i], features[i], "CDATA", featVal.toString());
} else {
contentValue = featVal.toString();
}
} else {
if (!"value".equals(features[i])) {
attrs.addAttribute("", "_ref_" + features[i], "_ref_" + features[i], "CDATA",
((ReferenceValue) featVal).getTargetId());
} else {
contentValue = ((ReferenceValue) featVal).getTargetId();
}
}
}
String xcasElementName = getXCasElementName(aFS);
mHandler.startElement("", xcasElementName, xcasElementName, attrs);
// encode array subelements
String[] arrayElems = null;
if (aFS instanceof PrimitiveArrayFS) {
arrayElems = ((PrimitiveArrayFS) aFS).toStringArray();
} else if (aFS instanceof ReferenceArrayFS) {
arrayElems = ((ReferenceArrayFS) aFS).getIdRefArray();
}
if (arrayElems != null) {
for (int j = 0; j < arrayElems.length; j++) {
mHandler.startElement("", "i", "i", new AttributesImpl());
if (arrayElems[j] != null) {
mHandler.characters(arrayElems[j].toCharArray(), 0, arrayElems[j].length());
}
mHandler.endElement("", "i", "i");
}
}
// encode "value" feature, if specified, as content
if (contentValue != null) {
mHandler.characters(contentValue.toCharArray(), 0, contentValue.length());
}
// encode annotation spanned text, if this FS has valid begin and end features
else if (mIncludeAnnotationSpannedText && aDocTextHolder.docText != null
&& aDocTextHolder.docText.length > 0) {
FeatureValue begin = aFS.getFeatureValue("begin");
FeatureValue end = aFS.getFeatureValue("end");
if (begin instanceof PrimitiveValue && end instanceof PrimitiveValue) {
int beginChar = ((PrimitiveValue) begin).toInt();
int endChar = ((PrimitiveValue) end).toInt();
if (beginChar >= 0 && endChar > beginChar && endChar <= aDocTextHolder.docText.length) {
// special case: do not include text of annotations spanning entire document
if (beginChar > 0 || endChar < aDocTextHolder.docText.length) {
mHandler.characters(aDocTextHolder.docText, beginChar, endChar - beginChar);
}
}
}
}
mHandler.endElement("", xcasElementName, xcasElementName);
}
}
/**
* Gets the XCAS element name for a FS. This is usually the same as the type name, but the
* sequences _colon_ and _dash_ are translated to the characters : and -, respectively.
*
* @param aFS
* feature structures
* @return XCAS element name for this feature structure
*/
private String getXCasElementName(FeatureStructure aFS) {
return StringUtils.replaceAll(StringUtils.replaceAll(aFS.getType(), "_colon_", ":"), "_dash_",
"-");
}
/**
* @param aFS
*/
private void _generateDocFS(FeatureStructure aFS, DocTextHolder aDocTextHolder)
throws SAXException {
AttributesImpl attrs = new AttributesImpl();
String textFeature = this.getDocumentTextFeatureName();
FeatureValue docTextValue = aFS.getFeatureValue(textFeature);
if (docTextValue != null) {
String text = docTextValue.toString();
aDocTextHolder.docText = text.toCharArray();
if (!textFeature.equals("value")) {
attrs.addAttribute("", "_content", "_content", "CDATA", textFeature);
}
mHandler.startElement("", mXCasDocTextTag, mXCasDocTextTag, attrs);
mHandler.characters(aDocTextHolder.docText, 0, aDocTextHolder.docText.length);
mHandler.endElement("", mXCasDocTextTag, mXCasDocTextTag);
} else {
mHandler.startElement("", mXCasDocTextTag, mXCasDocTextTag, attrs);
mHandler.endElement("", mXCasDocTextTag, mXCasDocTextTag);
}
}
private ContentHandler mHandler;
private static class DocTextHolder {
char[] docText;
}
}