| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| package org.apache.uima.util; |
| |
| import java.io.ByteArrayOutputStream; |
| import java.util.ArrayList; |
| import java.util.Iterator; |
| import java.util.List; |
| |
| import org.apache.uima.UIMARuntimeException; |
| import org.apache.uima.cas.CAS; |
| import org.apache.uima.cas.CASException; |
| import org.apache.uima.cas.FSIterator; |
| import org.apache.uima.cas.FSMatchConstraint; |
| import org.apache.uima.cas.Feature; |
| import org.apache.uima.cas.FeatureStructure; |
| import org.apache.uima.cas.FloatArrayFS; |
| import org.apache.uima.cas.IntArrayFS; |
| import org.apache.uima.cas.StringArrayFS; |
| import org.apache.uima.cas.Type; |
| import org.apache.uima.cas.text.AnnotationFS; |
| import org.xml.sax.Attributes; |
| import org.xml.sax.ContentHandler; |
| import org.xml.sax.SAXException; |
| import org.xml.sax.helpers.AttributesImpl; |
| |
| /** |
| * Generates an inline XML representation of a CAS. Annotation types are represented as XML tags, |
| * features are represented as attributes. Note that features whose values are FeatureStructures are |
| * not represented. |
| * |
| * @deprecated As of v2.0, use {@link org.apache.uima.util.CasToInlineXml} instead. |
| */ |
| public class TCasToInlineXml implements TCasFormatter { |
| /** |
| * This destroy method does nothing. |
| * |
| * @see org.apache.uima.resource.Resource#destroy() |
| */ |
| public void destroy() { |
| } |
| |
| /** |
| * @see org.apache.uima.util.TCasFormatter#format(CAS) |
| */ |
| public String format(CAS aCAS) throws CASException { |
| return generateXML(aCAS, null); |
| } |
| |
| /** |
| * @see org.apache.uima.util.TCasFormatter#format(CAS, FSMatchConstraint) |
| */ |
| public String format(CAS aCAS, FSMatchConstraint aFilter) throws CASException { |
| return generateXML(aCAS, aFilter); |
| } |
| |
| /** |
| * Generates inline XML from a CAS. |
| * |
| * @param aCAS |
| * CAS to generate from |
| */ |
| public String generateXML(CAS aCAS) throws CASException { |
| return generateXML(aCAS, null); |
| } |
| |
| /** |
| * Generates inline XML from a CAS. |
| * |
| * @param aCAS |
| * CAS to generate from |
| * @param aFilter |
| * constraint that determines which annotations are included in the output. If null (or |
| * ommitted), all annotations are included. |
| */ |
| public String generateXML(CAS aCAS, FSMatchConstraint aFilter) throws CASException { |
| ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream(); |
| XMLSerializer sax2xml = new XMLSerializer(byteArrayOutputStream); |
| |
| // get document text |
| String docText = aCAS.getDocumentText(); |
| char[] docCharArray = docText.toCharArray(); |
| replaceInvalidXmlChars(docCharArray); |
| |
| // get iterator over annotations sorted by increasing start position and |
| // decreasing end position |
| FSIterator iterator = aCAS.getAnnotationIndex().iterator(); |
| |
| // filter the iterator if desired |
| if (aFilter != null) { |
| iterator = aCAS.createFilteredIterator(iterator, aFilter); |
| } |
| |
| // This is basically a recursive algorithm that has had the recursion |
| // removed through the use of an explicit Stack. We iterate over the |
| // annotations, and if an annotation contains other annotations, we |
| // push the parent annotation on the stack, process the children, and |
| // then come back to the parent later. |
| ArrayList stack = new ArrayList(); |
| int pos = 0; |
| |
| try { |
| ContentHandler handler = sax2xml.getContentHandler(); |
| handler.startDocument(); |
| // write an artificial start tag |
| handler.startElement("", "Document", "Document", new AttributesImpl()); |
| // now use null is a placeholder for this artificial Document annotation |
| AnnotationFS curAnnot = null; |
| |
| while (iterator.isValid()) { |
| // debug |
| // FeatureStructure fs = iterator.get(); |
| // System.out.println("Type: " + fs.getType().getName() + ", Class:" + |
| // fs.getClass().getName()); |
| // AnnotationFS nextAnnot = (AnnotationFS)fs; |
| AnnotationFS nextAnnot = (AnnotationFS) iterator.get(); |
| |
| if (curAnnot == null || nextAnnot.getBegin() < curAnnot.getEnd()) { |
| // nextAnnot's start point is within the span of curAnnot |
| if (curAnnot == null || nextAnnot.getEnd() <= curAnnot.getEnd()) // crossover span check |
| { |
| // nextAnnot is contained within curAnnot |
| |
| // write text between current pos and beginning of nextAnnot |
| try { |
| handler.characters(docCharArray, pos, nextAnnot.getBegin() - pos); |
| pos = nextAnnot.getBegin(); |
| handler.startElement("", nextAnnot.getType().getName(), |
| nextAnnot.getType().getName(), getFeatureAttributes(nextAnnot, aCAS)); |
| |
| // push parent annotation on stack |
| stack.add(curAnnot); |
| // move on to next annotation |
| curAnnot = nextAnnot; |
| } catch (StringIndexOutOfBoundsException e) { |
| System.err.println("Invalid annotation range: " + nextAnnot.getBegin() + "," |
| + nextAnnot.getEnd() + " in document of length " + docText.length()); |
| } |
| } |
| iterator.moveToNext(); |
| } else { |
| // nextAnnot begins after curAnnot ends |
| // write text between current pos and end of curAnnot |
| try { |
| handler.characters(docCharArray, pos, curAnnot.getEnd() - pos); |
| pos = curAnnot.getEnd(); |
| } catch (StringIndexOutOfBoundsException e) { |
| System.err.println("Invalid annotation range: " + curAnnot.getBegin() + "," |
| + curAnnot.getEnd() + " in document of length " + docText.length()); |
| } |
| handler.endElement("", curAnnot.getType().getName(), curAnnot.getType().getName()); |
| |
| // pop next containing annotation off stack |
| curAnnot = (AnnotationFS) stack.remove(stack.size() - 1); |
| } |
| } |
| |
| // finished writing all start tags, now finish up |
| if (curAnnot != null) { |
| try { |
| handler.characters(docCharArray, pos, curAnnot.getEnd() - pos); |
| pos = curAnnot.getEnd(); |
| } catch (StringIndexOutOfBoundsException e) { |
| System.err.println("Invalid annotation range: " + curAnnot.getBegin() + "," |
| + curAnnot.getEnd() + "in document of length " + docText.length()); |
| } |
| handler.endElement("", curAnnot.getType().getName(), curAnnot.getType().getName()); |
| |
| while (!stack.isEmpty()) { |
| curAnnot = (AnnotationFS) stack.remove(stack.size() - 1); // pop |
| if (curAnnot == null) { |
| break; |
| } |
| try { |
| handler.characters(docCharArray, pos, curAnnot.getEnd() - pos); |
| pos = curAnnot.getEnd(); |
| } catch (StringIndexOutOfBoundsException e) { |
| System.err.println("Invalid annotation range: " + curAnnot.getBegin() + "," |
| + curAnnot.getEnd() + "in document of length " + docText.length()); |
| } |
| handler.endElement("", curAnnot.getType().getName(), curAnnot.getType().getName()); |
| } |
| } |
| |
| if (pos < docCharArray.length) { |
| handler.characters(docCharArray, pos, docCharArray.length - pos); |
| } |
| handler.endElement("", "Document", "Document"); |
| handler.endDocument(); |
| |
| // return XML string |
| return new String(byteArrayOutputStream.toByteArray()); |
| } catch (SAXException e) { |
| throw new UIMARuntimeException(e); |
| } |
| } |
| |
| private final Attributes getFeatureAttributes(FeatureStructure aFS, CAS aCAS) { |
| AttributesImpl attrs = new AttributesImpl(); |
| |
| Type stringType = aCAS.getTypeSystem().getType(CAS.TYPE_NAME_STRING); |
| |
| List aFeatures = aFS.getType().getFeatures(); |
| Iterator iter = aFeatures.iterator(); |
| while (iter.hasNext()) { |
| Feature feat = (Feature) iter.next(); |
| String featName = feat.getShortName(); |
| // how we get feature value depends on feature's range type) |
| String rangeTypeName = feat.getRange().getName(); |
| if (aCAS.getTypeSystem().subsumes(stringType, feat.getRange())) // must check for subtypes |
| // of string |
| { |
| String str = aFS.getStringValue(feat); |
| if (str == null) { |
| attrs.addAttribute("", featName, featName, "CDATA", "null"); |
| } else { |
| if (str.length() > 64) { |
| str = str.substring(0, 64) + "..."; |
| } |
| attrs.addAttribute("", featName, featName, "CDATA", str); |
| } |
| } else if (CAS.TYPE_NAME_INTEGER.equals(rangeTypeName)) { |
| attrs |
| .addAttribute("", featName, featName, "CDATA", Integer.toString(aFS |
| .getIntValue(feat))); |
| } else if (CAS.TYPE_NAME_FLOAT.equals(rangeTypeName)) { |
| attrs |
| .addAttribute("", featName, featName, "CDATA", Float.toString(aFS |
| .getFloatValue(feat))); |
| } else if (CAS.TYPE_NAME_STRING_ARRAY.equals(rangeTypeName)) { |
| StringArrayFS arrayFS = (StringArrayFS) aFS.getFeatureValue(feat); |
| if (arrayFS == null) { |
| attrs.addAttribute("", featName, featName, "CDATA", "null"); |
| } else { |
| StringBuffer buf = new StringBuffer(); |
| String[] vals = arrayFS.toArray(); |
| buf.append('['); |
| for (int i = 0; i < vals.length - 1; i++) { |
| buf.append(vals[i]); |
| buf.append(','); |
| } |
| if (vals.length > 0) { |
| buf.append(vals[vals.length - 1]); |
| } |
| buf.append(']'); |
| attrs.addAttribute("", featName, featName, "CDATA", buf.toString()); |
| } |
| } else if (CAS.TYPE_NAME_INTEGER_ARRAY.equals(rangeTypeName)) { |
| IntArrayFS arrayFS = (IntArrayFS) aFS.getFeatureValue(feat); |
| if (arrayFS == null) { |
| attrs.addAttribute("", featName, featName, "CDATA", "null"); |
| } else { |
| StringBuffer buf = new StringBuffer(); |
| int[] vals = arrayFS.toArray(); |
| buf.append('['); |
| for (int i = 0; i < vals.length - 1; i++) { |
| buf.append(vals[i]); |
| buf.append(','); |
| } |
| if (vals.length > 0) { |
| buf.append(vals[vals.length - 1]); |
| } |
| buf.append(']'); |
| attrs.addAttribute("", featName, featName, "CDATA", buf.toString()); |
| } |
| } else if (CAS.TYPE_NAME_FLOAT_ARRAY.equals(rangeTypeName)) { |
| FloatArrayFS arrayFS = (FloatArrayFS) aFS.getFeatureValue(feat); |
| if (arrayFS == null) { |
| attrs.addAttribute("", featName, featName, "CDATA", "null"); |
| } else { |
| StringBuffer buf = new StringBuffer(); |
| float[] vals = arrayFS.toArray(); |
| buf.append('['); |
| for (int i = 0; i < vals.length - 1; i++) { |
| buf.append(vals[i]); |
| buf.append(','); |
| } |
| if (vals.length > 0) { |
| buf.append(vals[vals.length - 1]); |
| } |
| buf.append(']'); |
| attrs.addAttribute("", featName, featName, "CDATA", buf.toString()); |
| } |
| } else { |
| // get value as FeatureStructure |
| FeatureStructure fsVal = aFS.getFeatureValue(feat); |
| if (fsVal == null) { |
| attrs.addAttribute("", featName, featName, "CDATA", "null"); |
| } else { |
| // record type name as value, and covered text if it's an annotation |
| StringBuffer buf = new StringBuffer(); |
| buf.append(fsVal.getType().getShortName()); |
| |
| if (fsVal instanceof AnnotationFS) { |
| buf.append(" ["); |
| String str = ((AnnotationFS) fsVal).getCoveredText(); |
| if (str.length() > 64) { |
| str = str.substring(0, 64) + "..."; |
| } |
| buf.append(str); |
| buf.append(']'); |
| } |
| attrs.addAttribute("", featName, featName, "CDATA", buf.toString()); |
| } |
| } |
| } |
| return attrs; |
| } |
| |
| private void replaceInvalidXmlChars(char[] aChars) { |
| for (int i = 0; i < aChars.length; i++) { |
| if ((aChars[i] < 0x20 && aChars[i] != 0x09 && aChars[i] != 0x0A && aChars[i] != 0x0D) |
| || (aChars[i] > 0xD7FF && aChars[i] < 0xE000) || aChars[i] == 0xFFFE |
| || aChars[i] == 0xFFFF) { |
| // System.out.println("Found invalid XML character: " + (int)aChars[i] + " at position " + |
| // i); //temp |
| aChars[i] = ' '; |
| } |
| } |
| } |
| } |