| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| package org.apache.uima.cas.impl; |
| |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.util.ArrayList; |
| import java.util.Iterator; |
| import java.util.List; |
| import java.util.Map; |
| |
| import org.apache.uima.UimaContext; |
| import org.apache.uima.cas.CAS; |
| import org.apache.uima.cas.SofaFS; |
| import org.apache.uima.cas.Type; |
| import org.apache.uima.cas.TypeSystem; |
| import org.apache.uima.internal.util.IntVector; |
| import org.apache.uima.internal.util.StringUtils; |
| import org.apache.uima.internal.util.rb_trees.RedBlackTree; |
| import org.xml.sax.Attributes; |
| import org.xml.sax.ContentHandler; |
| import org.xml.sax.InputSource; |
| import org.xml.sax.Locator; |
| import org.xml.sax.SAXException; |
| import org.xml.sax.SAXParseException; |
| import org.xml.sax.XMLReader; |
| import org.xml.sax.helpers.DefaultHandler; |
| import org.xml.sax.helpers.XMLReaderFactory; |
| |
| /** |
| * XCAS Deserializer. Takes an XCAS and reads it into a CAS. |
| * |
| * |
| */ |
| public class XCASDeserializer { |
| |
| private static class FSInfo { |
| |
|   // Address of the feature structure in the CAS. |
|   private int addr; |
| |
|   // Numbers of the index repositories the FS is to be added to; empty means not indexed. |
|   private IntVector indexRep; |
| |
|   // Simple value holder: pairs an FS address with its requested index repositories. |
|   private FSInfo(int fsAddr, IntVector fsIndexRep) { |
|     addr = fsAddr; |
|     indexRep = fsIndexRep; |
|   } |
| |
| } |
| |
| private class XCASDeserializerHandler extends DefaultHandler { |
| |
| // /////////////////////////////////////////////////////////////////////// |
| // Internal states of the parser's state machine (the values taken by the 'state' field). |
| |
| // Initial state: expect the root element of the XML document. |
| private static final int DOC_STATE = 0; |
| |
| // At the top level. Expect a FS, or the document text element, or the end of the |
| // XML input. |
| private static final int FS_STATE = 1; |
| |
| // Inside a FS. Expect features, or the end of the FS. |
| private static final int FEAT_STATE = 2; |
| |
| // Inside a FS. Expect optional text for the content feature (named by _content, else the default). |
| private static final int CONTENT_STATE = 3; |
| |
| // Inside a feature element. We expect the feature value. |
| private static final int FEAT_CONTENT_STATE = 4; |
| |
| // Inside an array element. Expect array element value. |
| private static final int ARRAY_ELE_CONTENT_STATE = 5; |
| |
| // Inside an array FS. Expect an array element, or the end of the FS. |
| private static final int ARRAY_ELE_STATE = 6; |
| |
| // Inside the document text element. Expect the doc text. |
| private static final int DOC_TEXT_STATE = 7; |
| |
| // Inside an Out-Of-Typesystem FS. Expect features, or the end of the FS. |
| private static final int OOTS_FEAT_STATE = 8; |
| |
| // Inside an Out-Of-Typesystem FS. Expect optional text for the content feature, which is |
| // stored in the out-of-typesystem FS rather than in the CAS. |
| private static final int OOTS_CONTENT_STATE = 9; |
| |
| // Default feature name for contents of an FS element, if not specified by _content attribute. |
| private static final String DEFAULT_CONTENT_FEATURE = "value"; |
| |
| // End parser states. |
| // /////////////////////////////////////////////////////////////////////// |
| |
| private static final String reservedAttrPrefix = "_"; |
| |
| // Placeholder used in error messages when no Locator is available or it cannot name the |
| // source of the XML input; also the default for the line and column arguments. |
| private static final String unknownXMLSource = "<unknown>"; |
| |
| // private long time; |
| |
| // SAX locator. Used for error message generation; may be null if the parser never set one. |
| private Locator locator; |
| |
| // The CAS we're filling (the base CAS of the one passed to the constructor). |
| private CASImpl cas; |
| |
| // Store FSs with ID in a search tree (for later reference resolution). |
| private RedBlackTree fsTree; |
| |
| // FSs without an ID: they cannot be referenced, but are still finalized at end of document. |
| private ArrayList idLess; |
| |
| // What we expect next. |
| private int state; |
| |
| // Accumulates the character data of the current element; replaced at every startElement. |
| private StringBuffer buffer; |
| |
| // The address of the most recently created FS. Needed for array elements |
| // and embedded feature values. |
| private int currentAddr; |
| |
| // The name of the content feature, if we've seen one. |
| private String currentContentFeat = DEFAULT_CONTENT_FEATURE; |
| |
| // The current position when parsing array elements. |
| private int arrayPos; |
| |
| // Receives out-of-typesystem FSs and features; when null, unknown types are reported as errors. |
| private OutOfTypeSystemData outOfTypeSystemData; |
| |
| // Current out of type system FS |
| private FSData currentOotsFs; |
| |
| // Type code of the SofaFS type, used to recognize Sofas while reading. |
| private int sofaTypeCode; |
| |
| // AnnotationBase type |
| private Type annotBaseType; |
| |
| // Index repositories by position: 0 = base CAS, 1 = initial view, then one per further Sofa. |
| private ArrayList indexRepositories; |
| |
| // Views created for the Sofas read so far; their document annotations are updated at the end. |
| private ArrayList views; |
| |
| // for processing v1.x format XCAS |
| // map from sofa int values to id references |
| private IntVector sofaRefMap; |
| |
| // maps incoming _indexed values to positions in indexRepositories |
| private IntVector indexMap; |
| |
| // next position to hand out in indexMap for a Sofa that is not the initial view |
| private int nextIndex; |
| |
| private XCASDeserializerHandler(CASImpl aCAS, OutOfTypeSystemData ootsData) { |
|   // Always work on the base CAS, and start from a clean slate. |
|   cas = aCAS.getBaseCAS(); |
|   cas.resetNoQuestions(); |
| |
|   // Bookkeeping for reference resolution and text accumulation. |
|   fsTree = new RedBlackTree(); |
|   idLess = new ArrayList(); |
|   buffer = new StringBuffer(); |
|   outOfTypeSystemData = ootsData; |
|   indexRepositories = new ArrayList(); |
|   views = new ArrayList(); |
| |
|   // Position 0: the base CAS repository, which is where Sofas get indexed. |
|   indexRepositories.add(cas.getBaseIndexRepository()); |
|   // Position 1: the initial view, which is always expected to exist. |
|   indexRepositories.add(cas.getView(CAS.NAME_DEFAULT_SOFA).getIndexRepository()); |
| |
|   sofaTypeCode = cas.ll_getTypeSystem().ll_getCodeForType( |
|       cas.getTypeSystem().getType(CAS.TYPE_NAME_SOFA)); |
|   annotBaseType = cas.getAnnotationType(); |
| |
|   // Entry 0 of both maps stands for the base CAS: annotations without a proper Sofa |
|   // reference are pointed at the first Sofa, and _indexed=0 stays 0. |
|   sofaRefMap = new IntVector(); |
|   sofaRefMap.add(1); |
|   indexMap = new IntVector(); |
|   indexMap.add(0); |
| } |
| |
| private final void resetBuffer() { |
|   // Drop whatever text was collected so far by starting over with an empty buffer. |
|   buffer = new StringBuffer(); |
| } |
| |
| /** |
|  * SAX callback for the start of the document: puts the state machine into its initial state. |
|  * All other setup work is done in the constructor. |
|  * |
|  * @see org.xml.sax.ContentHandler#startDocument() |
|  */ |
| public void startDocument() throws SAXException { |
|   state = DOC_STATE; |
| } |
| |
| /** |
|  * SAX callback for an opening tag. Clears the text buffer and dispatches on the current parser |
|  * state: the root tag is checked, top-level elements start a document text element or an FS, |
|  * and elements inside an array FS start an array element. An element in any other state is |
|  * an error. |
|  * |
|  * @see org.xml.sax.ContentHandler#startElement(java.lang.String, java.lang.String, |
|  *      java.lang.String, org.xml.sax.Attributes) |
|  */ |
| public void startElement(String nameSpaceURI, String localName, String qualifiedName, |
|     Attributes attrs) throws SAXException { |
|   resetBuffer(); |
|   final int current = this.state; |
|   if (current == DOC_STATE) { |
|     // The very first element must be the XCAS root tag. |
|     if (!qualifiedName.equals(XCASSerializer.casTagName)) { |
|       throw createException(XCASParsingException.WRONG_ROOT_TAG, qualifiedName); |
|     } |
|     this.state = FS_STATE; |
|   } else if (current == FS_STATE) { |
|     // Every top-level element starts out with the default content feature. |
|     this.currentContentFeat = DEFAULT_CONTENT_FEATURE; |
|     if (qualifiedName.equals(getDocumentTypeName())) { |
|       readDocument(attrs); |
|     } else { |
|       readFS(qualifiedName, attrs); |
|     } |
|   } else if (current == ARRAY_ELE_STATE) { |
|     readArrayElement(qualifiedName, attrs); |
|   } else { |
|     // Not in a state that accepts child elements. |
|     throw createException(XCASParsingException.TEXT_EXPECTED, qualifiedName); |
|   } |
| } |
| |
| // Document text element seen: switch to the state that collects its character data. |
| private void readDocument(Attributes attrs) { |
|   state = DOC_TEXT_STATE; |
| } |
| |
| // Get ready to read array element. |
| private void readArrayElement(String eleName, Attributes attrs) throws SAXParseException { |
| if (!eleName.equals(XCASSerializer.ARRAY_ELEMENT_TAG)) { |
| throw createException(XCASParsingException.ARRAY_ELE_EXPECTED, eleName); |
| } |
| if (attrs.getLength() > 0) { |
| throw createException(XCASParsingException.ARRAY_ELE_ATTRS); |
| } |
| this.state = ARRAY_ELE_CONTENT_STATE; |
| // resetBuffer(); |
| } |
| |
| // Starts a new FS for the given element: resolves the element name to a type, then either |
| // creates an array, creates a regular FS, or hands the element to the out-of-typesystem data. |
| private void readFS(String qualifiedName, Attributes attrs) throws SAXParseException { |
|   final String typeName = getCasTypeName(qualifiedName); |
|   TypeImpl type = (TypeImpl) ts.getType(typeName); |
|   if (type == null && typeName.equals("uima.cas.SofA")) { |
|     // temporary fix for XCAS written with pre-public version of Sofas |
|     type = (TypeImpl) ts.getType("uima.cas.Sofa"); |
|   } |
| |
|   if (type == null) { |
|     // Unknown type: an error unless out-of-typesystem data is being collected. |
|     if (this.outOfTypeSystemData == null) { |
|       throw createException(XCASParsingException.UNKNOWN_TYPE, typeName); |
|     } |
|     // This also sets the parser state appropriately (APL). |
|     addToOutOfTypeSystemData(typeName, attrs); |
|     return; |
|   } |
| |
|   final int typeCode = type.getCode(); |
|   if (cas.isArrayType(typeCode)) { |
|     readArray(type, attrs); |
|     return; |
|   } |
|   final int newFsAddr = cas.ll_createFS(typeCode); |
|   readFS(newFsAddr, attrs, true); |
| } |
| |
| /** |
| * |
| * @param addr |
| * @param attrs |
| * @param toIndex |
| * Special hack to accomodate document annotation, which is already in the index. |
| * @throws SAXParseException |
| */ |
| private void readFS(final int addr, Attributes attrs, boolean toIndex) throws SAXParseException { |
| // Hang on address for setting content feature |
| this.currentAddr = addr; |
| int id = -1; |
| IntVector indexRep = new IntVector(); // empty means not indexed |
| String attrName, attrValue; |
| final int heapValue = cas.getHeapValue(addr); |
| final Type type = cas.ll_getTypeSystem().ll_getTypeForCode(cas.ll_getFSRefType(addr)); |
| |
| // Special handling for Sofas |
| if (sofaTypeCode == heapValue) { |
| // create some maps to handle v1 format XCAS ... |
| // ... where the sofa feature of annotations was an int not a ref |
| |
| // determine if this is the one and only initial view Sofa |
| boolean isInitialView = false; |
| String sofaID = attrs.getValue(CAS.FEATURE_BASE_NAME_SOFAID); |
| if (sofaID.equals("_DefaultTextSofaName")) { |
| sofaID = CAS.NAME_DEFAULT_SOFA; |
| } |
| if (uimaContext != null) { |
| // Map incoming SofaIDs |
| sofaID = uimaContext.mapToSofaID(sofaID).getSofaID(); |
| } |
| if (sofaID.equals(CAS.NAME_DEFAULT_SOFA)) { |
| isInitialView = true; |
| } |
| // get the sofaNum |
| String sofaNum = attrs.getValue(CAS.FEATURE_BASE_NAME_SOFANUM); |
| int thisSofaNum = Integer.parseInt(sofaNum); |
| |
| // get the sofa's FeatureStructure id |
| int sofaFsId = Integer.parseInt(attrs.getValue(XCASSerializer.ID_ATTR_NAME)); |
| |
| // for v1 and v2 formats, create the index map |
| // ***we assume Sofas are always received in Sofanum order*** |
| // Two scenarios ... the initial view is the first sofa, or not. |
| // If not, the _indexed values need to be remapped to leave room for the initial view, |
| // which may or may not be in the received CAS. |
| if (this.indexMap.size() == 1) { |
| if (isInitialView) { |
| // the first Sofa an initial view |
| if (thisSofaNum == 2) { |
| // this sofa was mapped to the initial view |
| this.indexMap.add(-1); // for this CAS, there should not be a sofanum = 1 |
| this.indexMap.add(1); // map 2 to 1 |
| this.nextIndex = 2; |
| } else { |
| this.indexMap.add(1); |
| this.nextIndex = 2; |
| } |
| } else { |
| if (thisSofaNum > 1) { |
| // the first Sofa not initial, but sofaNum > 1 |
| // must be a v2 format, and sofaNum better be 2 |
| this.indexMap.add(1); |
| assert (thisSofaNum == 2); |
| this.indexMap.add(2); |
| this.nextIndex = 3; |
| } else { |
| // must be v1 format |
| this.indexMap.add(2); |
| this.nextIndex = 3; |
| } |
| } |
| } else { |
| // if the new Sofa is the initial view, always map to 1 |
| if (isInitialView) { |
| // the initial view is not the first |
| // if v2 format, space already reserved in mapping |
| if (this.indexMap.size() == thisSofaNum) { |
| // v1 format, add mapping for initial view |
| this.indexMap.add(1); |
| } |
| } else { |
| this.indexMap.add(this.nextIndex); |
| this.nextIndex++; |
| } |
| } |
| |
| // Now update the mapping from annotation int to ref values |
| if (this.sofaRefMap.size() == thisSofaNum) { |
| // Sofa received in sofaNum order, add new one |
| this.sofaRefMap.add(sofaFsId); |
| } else if (this.sofaRefMap.size() > thisSofaNum) { |
| // new Sofa has lower sofaNum than last one |
| this.sofaRefMap.set(thisSofaNum, sofaFsId); |
| } else { |
| // new Sofa has skipped ahead more than 1 |
| this.sofaRefMap.setSize(thisSofaNum + 1); |
| this.sofaRefMap.set(thisSofaNum, sofaFsId); |
| } |
| |
| } |
| |
| for (int i = 0; i < attrs.getLength(); i++) { |
| attrName = attrs.getQName(i); |
| attrValue = attrs.getValue(i); |
| if (attrName.startsWith(reservedAttrPrefix)) { |
| if (attrName.equals(XCASSerializer.ID_ATTR_NAME)) { |
| try { |
| id = Integer.parseInt(attrValue); |
| } catch (NumberFormatException e) { |
| throw createException(XCASParsingException.ILLEGAL_ID, attrValue); |
| } |
| } else if (attrName.equals(XCASSerializer.CONTENT_ATTR_NAME)) { |
| this.currentContentFeat = attrValue; |
| // this.state = CONTENT_STATE; APL-6/28/04 - removed, see below |
| } else if (attrName.equals(XCASSerializer.INDEXED_ATTR_NAME)) { |
| // if (attrValue.equals(XCASSerializer.TRUE_VALUE) && toIndex) |
| String[] arrayvals = parseArray(attrValue); |
| for (int s = 0; s < arrayvals.length; s++) { |
| indexRep.add(Integer.parseInt(arrayvals[s])); |
| } |
| } else { |
| handleFeature(type, addr, attrName, attrValue, false); |
| } |
| } else { |
| if (sofaTypeCode == heapValue) { |
| if (attrName.equals(CAS.FEATURE_BASE_NAME_SOFAID)) { |
| if (attrValue.equals("_DefaultTextSofaName")) { |
| // First change old default Sofa name into the new one |
| attrValue = CAS.NAME_DEFAULT_SOFA; |
| } |
| if (uimaContext != null) { |
| // Map incoming SofaIDs |
| attrValue = uimaContext.mapToSofaID(attrValue).getSofaID(); |
| } |
| } |
| } |
| handleFeature(type, addr, attrName, attrValue, false); |
| } |
| } |
| |
| if (sofaTypeCode == heapValue) { |
| // If a Sofa, create CAS view to get new indexRepository |
| SofaFS sofa = (SofaFS) cas.createFS(addr); |
| // also add to indexes so we can retrieve the Sofa later |
| cas.getBaseIndexRepository().addFS(sofa); |
| CAS view = cas.getView(sofa); |
| if (sofa.getSofaRef() == 1) { |
| cas.registerInitialSofa(); |
| } else { |
| // add indexRepo for views other than the initial view |
| indexRepositories.add(cas.getSofaIndexRepository(sofa)); |
| } |
| ((CASImpl) view).registerView(sofa); |
| views.add(view); |
| } |
| FSInfo fsInfo = new FSInfo(addr, indexRep); |
| if (id < 0) { |
| idLess.add(fsInfo); |
| } else { |
| fsTree.put(id, fsInfo); |
| } |
| // Set the state; we're either expecting features, or _content. |
| // APL - 6/28/04 - even if _content attr is not specified, we can still have content, which |
| // would |
| // be assigned to the "value" feature, as per XCAS spec. FEAT_STATE did not really seem to be |
| // working, anyway. |
| this.state = CONTENT_STATE; |
| // if (this.state != CONTENT_STATE) |
| // { |
| // this.state = FEAT_STATE; |
| // } |
| } |
| |
| /** |
|  * Creates a new array FS of the given type from the attributes of its XML element, records |
|  * it for finalization at the end of the document, and prepares the parser for reading the |
|  * array elements (ARRAY_ELE_STATE, position 0). |
|  * <p> |
|  * Only the id, size and _indexed attributes are accepted; a missing size means 0. |
|  * |
|  * @param type |
|  *          the array type to instantiate |
|  * @param attrs |
|  *          attributes of the XML element describing the array |
|  * @throws SAXParseException |
|  *           if the id, the size or an _indexed entry is not an integer, if the size is |
|  *           negative, or if any other attribute is present |
|  */ |
| private void readArray(TypeImpl type, Attributes attrs) throws SAXParseException { |
|   // No entries in indexRep means not indexed |
|   final IntVector indexRep = new IntVector(); |
|   int size = 0; |
|   int id = -1; |
|   for (int i = 0; i < attrs.getLength(); i++) { |
|     final String attrName = attrs.getQName(i); |
|     final String attrVal = attrs.getValue(i); |
|     if (attrName.equals(XCASSerializer.ID_ATTR_NAME)) { |
|       try { |
|         id = Integer.parseInt(attrVal); |
|       } catch (NumberFormatException e) { |
|         throw createException(XCASParsingException.ILLEGAL_ID, attrVal); |
|       } |
|     } else if (attrName.equals(XCASSerializer.ARRAY_SIZE_ATTR)) { |
|       try { |
|         size = Integer.parseInt(attrVal); |
|         if (size < 0) { |
|           throw createException(XCASParsingException.ILLEGAL_ARRAY_SIZE, attrVal); |
|         } |
|       } catch (NumberFormatException e) { |
|         throw createException(XCASParsingException.INTEGER_EXPECTED, attrVal); |
|       } |
|     } else if (attrName.equals(XCASSerializer.INDEXED_ATTR_NAME)) { |
|       final String[] arrayvals = parseArray(attrVal); |
|       for (int s = 0; s < arrayvals.length; s++) { |
|         // A malformed entry is a parse error like a malformed id or size, not a raw |
|         // NumberFormatException escaping from the SAX callback. |
|         try { |
|           indexRep.add(Integer.parseInt(arrayvals[s])); |
|         } catch (NumberFormatException e) { |
|           throw createException(XCASParsingException.INTEGER_EXPECTED, arrayvals[s]); |
|         } |
|       } |
|     } else { |
|       throw createException(XCASParsingException.ILLEGAL_ARRAY_ATTR, attrName); |
|     } |
|   } |
| |
|   // Pick the factory method matching the array type; anything else is an FS array. |
|   final FeatureStructureImpl fs; |
|   if (cas.isIntArrayType(type)) { |
|     fs = (FeatureStructureImpl) cas.createIntArrayFS(size); |
|   } else if (cas.isFloatArrayType(type)) { |
|     fs = (FeatureStructureImpl) cas.createFloatArrayFS(size); |
|   } else if (cas.isStringArrayType(type)) { |
|     fs = (FeatureStructureImpl) cas.createStringArrayFS(size); |
|   } else if (cas.isBooleanArrayType(type)) { |
|     fs = (FeatureStructureImpl) cas.createBooleanArrayFS(size); |
|   } else if (cas.isByteArrayType(type)) { |
|     fs = (FeatureStructureImpl) cas.createByteArrayFS(size); |
|   } else if (cas.isShortArrayType(type)) { |
|     fs = (FeatureStructureImpl) cas.createShortArrayFS(size); |
|   } else if (cas.isLongArrayType(type)) { |
|     fs = (FeatureStructureImpl) cas.createLongArrayFS(size); |
|   } else if (cas.isDoubleArrayType(type)) { |
|     fs = (FeatureStructureImpl) cas.createDoubleArrayFS(size); |
|   } else { |
|     fs = (FeatureStructureImpl) cas.createArrayFS(size); |
|   } |
| |
|   final int addr = fs.getAddress(); |
|   final FSInfo fsInfo = new FSInfo(addr, indexRep); |
|   if (id >= 0) { |
|     fsTree.put(id, fsInfo); |
|   } else { |
|     idLess.add(fsInfo); |
|   } |
|   // Hang on to those for setting array values. |
|   this.currentAddr = addr; |
|   this.arrayPos = 0; |
| |
|   this.state = ARRAY_ELE_STATE; |
| } |
| |
| // A value counts as "null" when it is absent or the empty string. Any other value must be |
| // in the expected format. |
| private final boolean emptyVal(String val) { |
|   if (val == null) { |
|     return true; |
|   } |
|   return val.length() == 0; |
| } |
| |
| // Create a feature value from a string representation. |
| private void handleFeature(int addr, String featName, String featVal, boolean lenient) |
| throws SAXParseException { |
| int typeCode = cas.ll_getFSRefType(addr); |
| Type type = cas.ll_getTypeSystem().ll_getTypeForCode(typeCode); |
| handleFeature(type, addr, featName, featVal, lenient); |
| } |
| |
| // Sets one feature of the FS at addr from its string representation. Features the type |
| // does not define go to the out-of-typesystem data if that is being collected; otherwise |
| // they are an error unless lenient is set. |
| private void handleFeature(final Type type, int addr, String featName, String featVal, |
|     boolean lenient) throws SAXParseException { |
|   // The FeatureMap approach is broken because it assumes feature short names |
|   // are unique, so the feature is looked up on the type instead. -APL |
| |
|   // handle v1.x format annotations, mapping int to ref values |
|   if (featName.equals("sofa") && ts.subsumes(this.annotBaseType, type)) { |
|     final int v1SofaNum = Integer.parseInt(featVal); |
|     featVal = Integer.toString(this.sofaRefMap.get(v1SofaNum)); |
|   } |
| |
|   // handle v1.x sofanum values, remapping so that _InitialView always == 1 |
|   if (featName.equals(CAS.FEATURE_BASE_NAME_SOFAID) |
|       && this.sofaTypeCode == cas.getHeapValue(addr)) { |
|     final Type sofaType = ts.ll_getTypeForCode(this.sofaTypeCode); |
|     final FeatureImpl sofaNumFeat = (FeatureImpl) sofaType |
|         .getFeatureByBaseName(CAS.FEATURE_BASE_NAME_SOFANUM); |
|     final int receivedSofaNum = cas.getFeatureValue(addr, sofaNumFeat.getCode()); |
|     cas.setFeatureValue(addr, sofaNumFeat.getCode(), this.indexMap.get(receivedSofaNum)); |
|   } |
| |
|   // Reference features carry a prefix in the XML; strip it to get the real feature name. |
|   final String realFeatName = featName.startsWith(XCASSerializer.REF_PREFIX) |
|       ? featName.substring(XCASSerializer.REF_PREFIX.length()) |
|       : featName; |
|   final FeatureImpl feat = (FeatureImpl) type.getFeatureByBaseName(realFeatName); |
| |
|   if (feat != null) { |
|     if (cas.ll_isRefType(ts.range(feat.getCode()))) { |
|       // References are stored as the raw id for now. |
|       cas.setFeatureValue(addr, feat.getCode(), Integer.parseInt(featVal)); |
|     } else { |
|       cas.setFeatureValueFromString(addr, feat.getCode(), featVal); |
|     } |
|     return; |
|   } |
| |
|   // feature does not exist in typesystem |
|   if (outOfTypeSystemData == null) { |
|     if (!lenient) { |
|       throw createException(XCASParsingException.UNKNOWN_FEATURE, featName); |
|     } |
|     return; |
|   } |
|   // Add to Out-Of-Typesystem data (APL) |
|   final Integer addrKey = new Integer(addr); |
|   List extraVals = (List) outOfTypeSystemData.extraFeatureValues.get(addrKey); |
|   if (extraVals == null) { |
|     extraVals = new ArrayList(); |
|     outOfTypeSystemData.extraFeatureValues.put(addrKey, extraVals); |
|   } |
|   extraVals.add(new String[] { featName, featVal }); |
| } |
| |
| /** |
|  * SAX callback for character data. The text is appended to the buffer when the parser is in |
|  * one of the text-collecting states, and ignored otherwise. |
|  * |
|  * @see org.xml.sax.ContentHandler#characters(char[], int, int) |
|  */ |
| public void characters(char[] chars, int start, int length) throws SAXException { |
|   final int current = this.state; |
|   final boolean collectingText = current == DOC_TEXT_STATE || current == CONTENT_STATE |
|       || current == OOTS_CONTENT_STATE || current == ARRAY_ELE_CONTENT_STATE |
|       || current == FEAT_CONTENT_STATE; |
|   if (collectingText) { |
|     buffer.append(chars, start, length); |
|   } |
| } |
| |
| // True if the buffer is empty or holds nothing but whitespace characters. |
| boolean isAllWhitespace(StringBuffer b) { |
|   final int end = b.length(); |
|   int pos = 0; |
|   // Advance over leading whitespace; reaching the end means there was nothing else. |
|   while (pos < end && Character.isWhitespace(b.charAt(pos))) { |
|     pos++; |
|   } |
|   return pos == end; |
| } |
| |
| /* |
| * (non-Javadoc) |
| * |
| * @see org.xml.sax.ContentHandler#endElement(java.lang.String, java.lang.String, |
| * java.lang.String) |
| */ |
| public void endElement(String nsURI, String localName, String qualifiedName) |
| throws SAXException { |
| switch (this.state) { |
| case DOC_STATE: { |
| // Do nothing. |
| break; |
| } |
| case FS_STATE: { |
| this.state = DOC_STATE; |
| break; |
| } |
| case FEAT_STATE: { |
| this.state = FS_STATE; |
| break; |
| } |
| case CONTENT_STATE: { |
| // Set the value of the content feature. |
| if (!isAllWhitespace(buffer)) { |
| try { |
| handleFeature(currentAddr, currentContentFeat, buffer.toString(), true); |
| } catch (XCASParsingException x) { |
| // Not sure why we are calling handleFeature for WF content |
| } |
| } |
| this.state = FS_STATE; |
| break; |
| } |
| case FEAT_CONTENT_STATE: { |
| // Create a feature value from an element. |
| handleFeature(currentAddr, qualifiedName, buffer.toString(), false); |
| this.state = FEAT_STATE; |
| break; |
| } |
| case ARRAY_ELE_CONTENT_STATE: { |
| // Create an array value. |
| addArrayElement(buffer.toString()); |
| this.state = ARRAY_ELE_STATE; |
| break; |
| } |
| case ARRAY_ELE_STATE: { |
| this.state = FS_STATE; |
| break; |
| } |
| case DOC_TEXT_STATE: { |
| // Assume old style CAS with one text Sofa |
| SofaFS newSofa = cas.createInitialSofa("text"); |
| CASImpl tcas = (CASImpl) cas.getInitialView(); |
| tcas.registerView(newSofa); |
| // Set the document text without creating a documentAnnotation |
| tcas.setDocTextFromDeserializtion(buffer.toString()); |
| |
| // and assume the new Sofa is at location 1! |
| int addr = 1; |
| int id = 1; |
| this.sofaRefMap.add(id); |
| |
| // and register the id for this Sofa |
| FSInfo fsInfo = new FSInfo(addr, new IntVector()); |
| fsTree.put(id, fsInfo); |
| |
| this.state = FS_STATE; |
| break; |
| } |
| case OOTS_CONTENT_STATE: { |
| // Set the value of the content feature. |
| if (!isAllWhitespace(buffer)) { |
| // Set the value of the content feature. |
| currentOotsFs.featVals.put(currentContentFeat, buffer.toString()); |
| } |
| this.state = FS_STATE; |
| break; |
| } |
| case OOTS_FEAT_STATE: { |
| this.state = FS_STATE; |
| break; |
| } |
| } |
| } |
| |
| private void addArrayElement(String content) throws SAXParseException { |
| if (arrayPos >= cas.ll_getArraySize(currentAddr)) { |
| throw createException(XCASParsingException.EXCESS_ARRAY_ELE); |
| } |
| try { |
| if (!emptyVal(content)) { |
| if (cas.isArrayType(cas.getHeap().heap[currentAddr])) { |
| cas.setArrayValueFromString(currentAddr, arrayPos, content); |
| } else { |
| System.out.println(" not a known array type "); |
| } |
| } |
| } catch (NumberFormatException e) { |
| throw createException(XCASParsingException.INTEGER_EXPECTED, content); |
| } |
| |
| ++arrayPos; |
| } |
| |
| /** |
|  * SAX callback for the end of the document. Resolves references and indexes all FSs that |
|  * were read (first those with an id, then those without), finalizes the out-of-typesystem |
|  * data if it is being collected, and finally updates the document annotation of every view. |
|  * |
|  * @see org.xml.sax.ContentHandler#endDocument() |
|  */ |
| public void endDocument() throws SAXException { |
|   // Resolve references, index: FSs with an id. |
|   for (Iterator fsIt = fsTree.iterator(); fsIt.hasNext();) { |
|     finalizeFS((FSInfo) fsIt.next()); |
|   } |
|   // Same for FSs without an id. |
|   for (int i = 0; i < idLess.size(); i++) { |
|     finalizeFS((FSInfo) idLess.get(i)); |
|   } |
| |
|   // also finalize Out-Of-TypeSystem FSs and features (APL) |
|   if (outOfTypeSystemData != null) { |
|     for (Iterator ootsIt = outOfTypeSystemData.fsList.iterator(); ootsIt.hasNext();) { |
|       finalizeOutOfTypeSystemFS((FSData) ootsIt.next()); |
|     } |
|     finalizeOutOfTypeSystemFeatures(); |
|   } |
| |
|   for (int i = 0; i < views.size(); i++) { |
|     final CASImpl view = (CASImpl) views.get(i); |
|     view.updateDocumentAnnotation(); |
|   } |
| } |
| |
| /** |
|  * Finishes one FS after the whole document has been read: adds it to the index repositories |
|  * it was marked for, and replaces the ids stored in its reference features by the addresses |
|  * of the FSs they refer to. Arrays are handed to finalizeArray. |
|  * <p> |
|  * A reference whose id is unknown is set to null; if out-of-typesystem data is being |
|  * collected and the id is not 0, the reference is recorded there as a "_ref_" feature. |
|  * |
|  * @param fsInfo |
|  *          address and index information of the FS to finalize |
|  */ |
| private void finalizeFS(FSInfo fsInfo) { |
|   final int addr = fsInfo.addr; |
| |
|   // Add the FS to all specified index repositories. An empty vector means "not indexed", so |
|   // the loop needs no extra guard. The received numbers are remapped through indexMap unless |
|   // it still holds only its initial entry. |
|   final IntVector requested = fsInfo.indexRep; |
|   final boolean useReceivedNumbers = (indexMap.size() == 1); |
|   for (int i = 0; i < requested.size(); i++) { |
|     final int received = requested.get(i); |
|     final int repoPos = useReceivedNumbers ? received : indexMap.get(received); |
|     ((FSIndexRepositoryImpl) indexRepositories.get(repoPos)).addFS(addr); |
|   } |
| |
|   final int type = cas.getHeapValue(addr); |
|   if (cas.isArrayType(type)) { |
|     finalizeArray(type, addr, fsInfo); |
|     return; |
|   } |
| |
|   final int[] feats = cas.getTypeSystemImpl().ll_getAppropriateFeatures(type); |
|   for (int i = 0; i < feats.length; i++) { |
|     final int feat = feats[i]; |
|     if (!cas.ll_isRefType(ts.range(feat))) { |
|       continue; // only reference features need resolving |
|     } |
|     final int featVal = cas.getFeatureValue(addr, feat); |
|     final FSInfo fsValInfo = (FSInfo) fsTree.get(featVal); |
|     if (fsValInfo != null) { |
|       cas.setFeatureValue(addr, feat, fsValInfo.addr); |
|       continue; |
|     } |
|     cas.setFeatureValue(addr, feat, CASImpl.NULL); |
|     // this feature may be a ref to an out-of-typesystem FS. |
|     // add it to the Out-of-typesystem features list (APL) |
|     if (featVal != 0 && outOfTypeSystemData != null) { |
|       final Integer addrInteger = new Integer(addr); |
|       List ootsAttrs = (List) outOfTypeSystemData.extraFeatureValues.get(addrInteger); |
|       if (ootsAttrs == null) { |
|         ootsAttrs = new ArrayList(); |
|         outOfTypeSystemData.extraFeatureValues.put(addrInteger, ootsAttrs); |
|       } |
|       // Record under the feature's short name, marked as a reference. |
|       final String featFullName = ts.ll_getFeatureForCode(feat).getName(); |
|       final int separatorOffset = featFullName.indexOf(TypeSystem.FEATURE_SEPARATOR); |
|       final String featName = "_ref_" + featFullName.substring(separatorOffset + 1); |
|       ootsAttrs.add(new String[] { featName, Integer.toString(featVal) }); |
|     } |
|   } |
| } |
| |
| // Resolves the element references of an FS array: each stored id is replaced by the address |
| // of the FS it refers to. Unknown ids become null and, if out-of-typesystem data is being |
| // collected and the id is not 0, are recorded there. Other array types need no work. |
| private void finalizeArray(int type, int addr, FSInfo fsInfo) { |
|   if (!cas.isFSArrayType(type)) { |
|     // Nothing to do. |
|     return; |
|   } |
|   final int length = cas.ll_getArraySize(addr); |
|   for (int pos = 0; pos < length; pos++) { |
|     final int refId = cas.getArrayValue(addr, pos); |
|     final FSInfo target = (FSInfo) fsTree.get(refId); |
|     if (target != null) { |
|       cas.setArrayValue(addr, pos, target.addr); |
|       continue; |
|     } |
|     cas.setArrayValue(addr, pos, CASImpl.NULL); |
|     // this element may be a ref to an out-of-typesystem FS. |
|     // add it to the Out-of-typesystem array elements list (APL) |
|     if (refId != 0 && outOfTypeSystemData != null) { |
|       final Integer arrayKey = new Integer(addr); |
|       List ootsElements = (List) outOfTypeSystemData.arrayElements.get(arrayKey); |
|       if (ootsElements == null) { |
|         ootsElements = new ArrayList(); |
|         outOfTypeSystemData.arrayElements.put(arrayKey, ootsElements); |
|       } |
|       // the "value" of the reference is the ID, but we prefix with a letter to indicate |
|       // that this ID refers to an OOTS FS |
|       ootsElements.add(new ArrayElement(pos, "a" + Integer.toString(refId))); |
|     } |
|   } |
| } |
| |
| /** |
| * Finalizes an Out Of Type System FS by assigning a unique ID (prepending a letter) and |
| * remapping ID references appropriately (both In-Type-System and Out-Of-TypeSystem refs). |
| */ |
| private void finalizeOutOfTypeSystemFS(FSData aFS) { |
| // make ID unique by prefixing a letter |
| aFS.id = 'a' + aFS.id; |
| // remap ref features |
| Iterator it = aFS.featVals.entrySet().iterator(); |
| while (it.hasNext()) { |
| Map.Entry entry = (Map.Entry) it.next(); |
| String attrName = (String) entry.getKey(); |
| if (attrName.startsWith("_ref_")) { |
| int val = Integer.parseInt((String) entry.getValue()); |
| if (val >= 0) // negative numbers represent null and are left unchanged |
| { |
| // attempt to locate target in type system |
| FSInfo fsValInfo = (FSInfo) fsTree.get(val); |
| if (fsValInfo != null) { |
| entry.setValue(Integer.toString(fsValInfo.addr)); |
| } else |
| // out of type system - remap by prepending letter |
| { |
| entry.setValue("a" + val); |
| } |
| } |
| } |
| |
| } |
| } |
| |
| /** |
| * Finalizes the Out Of Type System features (extra features on in-typesystem types). |
| */ |
| private void finalizeOutOfTypeSystemFeatures() { |
| // remap ref features |
| Iterator it = outOfTypeSystemData.extraFeatureValues.values().iterator(); |
| while (it.hasNext()) { |
| List attrs = (List) it.next(); |
| Iterator attrIt = attrs.iterator(); |
| while (attrIt.hasNext()) { |
| String[] attr = (String[]) attrIt.next(); |
| if (attr[0].startsWith("_ref_")) { |
| int val = Integer.parseInt(attr[1]); |
| if (val >= 0) // negative numbers represent null and are left unchanged |
| { |
| // attempt to locate target in type system |
| FSInfo fsValInfo = (FSInfo) fsTree.get(val); |
| if (fsValInfo != null) { |
| attr[1] = Integer.toString(fsValInfo.addr); |
| } else |
| // out of type system - remap by prepending letter |
| { |
| attr[1] = "a" + val; |
| } |
| } |
| } |
| } |
| } |
| } |
| |
| private XCASParsingException createException(int code) { |
| XCASParsingException e = new XCASParsingException(code); |
| String source = unknownXMLSource; |
| String line = unknownXMLSource; |
| String col = unknownXMLSource; |
| if (locator != null) { |
| source = locator.getSystemId(); |
| if (source == null) { |
| source = locator.getPublicId(); |
| } |
| if (source == null) { |
| source = unknownXMLSource; |
| } |
| line = Integer.toString(locator.getLineNumber()); |
| col = Integer.toString(locator.getColumnNumber()); |
| } |
| e.addArgument(source); |
| e.addArgument(line); |
| e.addArgument(col); |
| return e; |
| } |
| |
| private XCASParsingException createException(int code, String arg) { |
| XCASParsingException e = createException(code); |
| e.addArgument(arg); |
| return e; |
| } |
| |
| /* |
| * (non-Javadoc) |
| * |
| * @see org.xml.sax.ErrorHandler#error(org.xml.sax.SAXParseException) |
| */ |
| public void error(SAXParseException e) throws SAXException { |
| throw e; |
| } |
| |
| /* |
| * (non-Javadoc) |
| * |
| * @see org.xml.sax.ErrorHandler#fatalError(org.xml.sax.SAXParseException) |
| */ |
| public void fatalError(SAXParseException e) throws SAXException { |
| throw e; |
| } |
| |
| /* |
| * (non-Javadoc) |
| * |
| * @see org.xml.sax.ContentHandler#ignorableWhitespace(char[], int, int) |
| */ |
| public void ignorableWhitespace(char[] arg0, int arg1, int arg2) throws SAXException { |
| // Since we're not validating, we don't need to do anything; this won't |
| // be called. |
| } |
| |
| /* |
| * (non-Javadoc) |
| * |
| * @see org.xml.sax.ContentHandler#setDocumentLocator(org.xml.sax.Locator) |
| */ |
| public void setDocumentLocator(Locator loc) { |
| // System.out.println("Setting document locator."); |
| this.locator = loc; |
| } |
| |
| /* |
| * (non-Javadoc) |
| * |
| * @see org.xml.sax.ErrorHandler#warning(org.xml.sax.SAXParseException) |
| */ |
| public void warning(SAXParseException e) throws SAXException { |
| throw e; |
| } |
| |
| /** |
| * Adds a feature sturcture to the out-of-typesystem data, and sets the parser's state |
| * appropriately. (APL) |
| */ |
| private void addToOutOfTypeSystemData(String typeName, Attributes attrs) |
| throws XCASParsingException { |
| if (this.outOfTypeSystemData != null) { |
| FSData fs = new FSData(); |
| fs.type = typeName; |
| fs.indexRep = null; // not indexed |
| String attrName, attrValue; |
| for (int i = 0; i < attrs.getLength(); i++) { |
| attrName = attrs.getQName(i); |
| attrValue = attrs.getValue(i); |
| if (attrName.startsWith(reservedAttrPrefix)) { |
| if (attrName.equals(XCASSerializer.ID_ATTR_NAME)) { |
| fs.id = attrValue; |
| } else if (attrName.equals(XCASSerializer.CONTENT_ATTR_NAME)) { |
| this.currentContentFeat = attrValue; |
| } else if (attrName.equals(XCASSerializer.INDEXED_ATTR_NAME)) { |
| fs.indexRep = attrValue; |
| } else { |
| fs.featVals.put(attrName, attrValue); |
| } |
| } else { |
| fs.featVals.put(attrName, attrValue); |
| } |
| } |
| this.outOfTypeSystemData.fsList.add(fs); |
| this.currentOotsFs = fs; |
| // Set the state; we're ready to accept the "content" feature, |
| // if one is specified |
| this.state = OOTS_CONTENT_STATE; |
| } |
| } |
| |
| /** |
| * Parse a multi-valued attribute into a String array, by splitting on whitespace. |
| * |
| * @param val |
| * attribute value |
| * @return an array with each array value as an element |
| */ |
| private String[] parseArray(String val) { |
| String[] arrayVals; |
| val = val.trim(); |
| if (emptyVal(val)) { |
| arrayVals = new String[0]; |
| } else { |
| arrayVals = val.split("\\s+"); |
| } |
| return arrayVals; |
| } |
| |
| /** |
| * Gets the CAS type name corresponding to an XCAS tag name. The type name is usually equal to |
| * the tag name, but the characters : and - are translated into the sequences _colon_ and |
| * _dash_, respectively. |
| * |
| * @param aTagName |
| * XCAS tag name |
| * @return CAS type name corresponding to this tag |
| */ |
| private String getCasTypeName(String aTagName) { |
| if (aTagName.indexOf(':') == -1 && aTagName.indexOf('-') == -1) { |
| return aTagName; |
| } else { |
| // Note: This is really slow so we avoid if possible. -- RJB |
| return StringUtils.replaceAll(StringUtils.replaceAll(aTagName, ":", "_colon_"), "-", |
| "_dash_"); |
| } |
| } |
| } |
| |
| private TypeSystemImpl ts; |
| |
| private UimaContext uimaContext; |
| |
| // private HashMap featureMap; -APL |
| // ///private int[] featureType; |
| |
| // name of tag to contain document text |
| private String docTypeName = XCASSerializer.DEFAULT_DOC_TYPE_NAME; |
| |
/**
 * Create a new deserializer from a type system. Note: all CAS arguments later supplied to
 * <code>getXCASHandler()</code> must have this type system as their type system.
 *
 * @param ts
 *          The type system of the CASes to be deserialized. It is cast to
 *          <code>TypeSystemImpl</code>.
 * @param uimaContext
 *          the context to store with this deserializer; may be null
 */
public XCASDeserializer(TypeSystem ts, UimaContext uimaContext) {
  this.uimaContext = uimaContext;
  this.ts = (TypeSystemImpl) ts;
}
| |
/**
 * Create a new deserializer from a type system, without a UIMA context.
 *
 * @param ts
 *          The type system of the CASes to be deserialized.
 */
public XCASDeserializer(TypeSystem ts) {
  this(ts, (UimaContext) null);
}
| |
| /** |
| * Create a default handler for deserializing an XCAS into the <code>cas</code> parameter. |
| * <p> |
| * Warning: for efficiency reasons, the deserializer does not do much type checking for features |
| * and their values. It is expected that the incoming XCAS conforms to the type system provided. |
| * If it doesn't, the results are undefined. |
| * |
| * @param cas |
| * This CAS will be used to hold the data of the serialized XCAS. |
| * @return The <code>DefaultHandler</code> to pass to the SAX parser. |
| */ |
| public DefaultHandler getXCASHandler(CAS cas) { |
| return getXCASHandler(cas, null); |
| } |
| |
| /** |
| * Create a default handler for deserializing an XCAS into the <code>cas</code> parameter. This |
| * version causes the deserializer to store out-of-typesystem data for later use. (APL) |
| * <p> |
| * Warning: for efficiency reasons, the deserializer does not do much type checking for features |
| * and their values. It is expected that the incoming XCAS conforms to the type system provided. |
| * If it doesn't, the results are undefined. |
| * |
| * @param cas |
| * This CAS will be used to hold the data of the serialized XCAS. |
| * @param outOfTypeSystemData |
| * An object that stores FSs that do not conform to the CAS's type system |
| * @return The <code>DefaultHandler</code> to pass to the SAX parser. |
| */ |
| public DefaultHandler getXCASHandler(CAS cas, OutOfTypeSystemData outOfTypeSystemData) { |
| return new XCASDeserializerHandler((CASImpl) cas, outOfTypeSystemData); |
| } |
| |
| /** |
| * Gets the name of the type representing the document. This will become the name of the XML |
| * element that will hold the document text. |
| * |
| * @return the document type name |
| */ |
| public String getDocumentTypeName() { |
| return docTypeName; |
| } |
| |
| /** |
| * Gets the name of the type representing the document. This will become the name of the XML |
| * element that will hold the document text. If not set, defaults to |
| * {@link XCASSerializer#DEFAULT_DOC_TYPE_NAME XCASSerializer.DEFAULT_DOC_TYPE_NAME}. |
| * |
| * @param aDocTypeName |
| * the document type name |
| */ |
| public void setDocumentTypeName(String aDocTypeName) { |
| docTypeName = aDocTypeName; |
| } |
| |
| /** |
| * Deserializes an XCAS from a stream. By default this is not lenient, meaning that if the XCAS |
| * references Types that are not in the Type System, an Exception will be thrown. Use |
| * {@link XCASDeserializer#deserialize(InputStream,CAS,boolean)} to turn on lenient mode and |
| * ignore any unknown types. |
| * |
| * @param aStream |
| * input stream from which to read the XCAS XML document |
| * @param aCAS |
| * CAS into which to deserialize. This CAS must be set up with a type system that is |
| * compatible with that in the XCAS |
| * |
| * @throws SAXException |
| * if an XML Parsing error occurs |
| * @throws IOException |
| * if an I/O failure occurs |
| */ |
| public static void deserialize(InputStream aStream, CAS aCAS) throws SAXException, IOException { |
| XCASDeserializer.deserialize(aStream, aCAS, false); |
| } |
| |
| /** |
| * Deserializes an XCAS from a stream. |
| * |
| * @param aStream |
| * input stream from which to read the XCAS XML document |
| * @param aCAS |
| * CAS into which to deserialize. This CAS must be set up with a type system that is |
| * compatible with that in the XCAS. |
| * @param aLenient |
| * if true, unknown Types will be ignored. If false, unknown Types will cause an |
| * exception. The default is false. |
| * |
| * @throws SAXException |
| * if an XML Parsing error occurs |
| * @throws IOException |
| * if an I/O failure occurs |
| */ |
| public static void deserialize(InputStream aStream, CAS aCAS, boolean aLenient) |
| throws SAXException, IOException { |
| XMLReader xmlReader = XMLReaderFactory.createXMLReader(); |
| XCASDeserializer deser = new XCASDeserializer(aCAS.getTypeSystem()); |
| ContentHandler handler; |
| if (aLenient) { |
| handler = deser.getXCASHandler(aCAS, new OutOfTypeSystemData()); |
| } else { |
| handler = deser.getXCASHandler(aCAS); |
| } |
| xmlReader.setContentHandler(handler); |
| xmlReader.parse(new InputSource(aStream)); |
| } |
| |
| } |