/* | |
* Licensed to the Apache Software Foundation (ASF) under one | |
* or more contributor license agreements. See the NOTICE file | |
* distributed with this work for additional information | |
* regarding copyright ownership. The ASF licenses this file | |
* to you under the Apache License, Version 2.0 (the | |
* "License"); you may not use this file except in compliance | |
* with the License. You may obtain a copy of the License at | |
* | |
* http://www.apache.org/licenses/LICENSE-2.0 | |
* | |
* Unless required by applicable law or agreed to in writing, | |
* software distributed under the License is distributed on an | |
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | |
* KIND, either express or implied. See the License for the | |
* specific language governing permissions and limitations | |
* under the License. | |
*/ | |
package org.apache.uima.cas.impl; | |
import java.util.Arrays; | |
import java.util.BitSet; | |
import java.util.Comparator; | |
import java.util.HashMap; | |
import java.util.HashSet; | |
import java.util.Iterator; | |
import java.util.Map; | |
import java.util.NoSuchElementException; | |
import java.util.Set; | |
import java.util.concurrent.atomic.AtomicInteger; | |
import org.apache.uima.cas.CAS; | |
import org.apache.uima.cas.CASRuntimeException; | |
import org.apache.uima.cas.FSIndex; | |
import org.apache.uima.cas.FeatureStructure; | |
import org.apache.uima.internal.util.IntVector; | |
import org.apache.uima.internal.util.PositiveIntSet; | |
import org.apache.uima.internal.util.PositiveIntSet_impl; | |
import org.apache.uima.internal.util.XmlElementName; | |
import org.apache.uima.util.Logger; | |
import org.apache.uima.util.MessageReport; | |
import org.xml.sax.ContentHandler; | |
import org.xml.sax.ErrorHandler; | |
import org.xml.sax.SAXException; | |
import org.xml.sax.SAXParseException; | |
/** | |
* CAS serializer support for XMI and JSON formats. | |
* | |
* There are multiple use cases. | |
* 1) normal - the consumer is independent of UIMA | |
* - (maybe) support for delta serialization | |
* 2) service calls: | |
* - support deserialization with out-of-type-system set-aside, and subsequent serialization with re-merging | |
* - guarantee of using same xmi:id's as were deserialized when serializing | |
* - support for delta serialization | |
* | |
* There is an outer class (one instance per "configuration" - reusable after configuration), and
* an inner class - one instance per serialize call.
* | |
* These classes are the common parts of serialization between XMI and JSON, mainly having to do with | |
* 1) enqueuing the FSs to be serialized
* 2) serializing according to their types and features | |
* | |
* | |
* Methods marked public are not for public use but are that way to permit | |
* other users of this class in other packages to "see" these methods. | |
* | |
* XmiCasSerializer JsonCasSerializer | |
* Instance Instance | |
* css ref -------> CasSerializerSupport <------ css ref | |
* | |
* | |
* XmiDocSerializer JsonDocSerializer | |
* Instance Instance | |
* (1 per serialize action) (1 per serialize action) | |
* cds ref -------> CasDocSerializer <------- cds ref | |
* csss points back | |
* | |
* | |
* Construction: | |
* new Xmi/JsonCasSerializer | |
* initializes css with new CasSerializerSupport | |
* | |
* serialize method creates a new Xmi/JsonDocSerializer inner class | |
* constructor creates a new CasDocSerializer, | |
* | |
* Use Cases and Algorithms | |
* Support set-aside for out-of-type-system FS on deserialization (record in shareData) | |
* implies can't determine sharing status of things ref'd by features; need to depend on | |
* multiple-refs-allowed flag. | |
* If multiple-refs found during serialization for feat marked non-shared, unshare these (make | |
* 2 serializations, one or more inplace, for example. | |
* Perhaps not considered an error. | |
* implies need (for non-delta case) to send all FSs that were deserialized - some may be ref'd by oots elements | |
* ** Could ** not do this if no oots elements, but could break some assumptions | |
* and this only would apply to non-delta - not worth doing | |
* | |
* | |
* | |
*/ | |
public class CasSerializerSupport { | |
// Special "type class" codes for list types. The LowLevelCAS.ll_getTypeClass() method | |
// returns type classes for primitives and arrays, but not lists (which are just ordinary FS types | |
// as far as the CAS is concerned). The serialization treats lists specially, however, and | |
// so needs its own type codes for these. | |
public static final int TYPE_CLASS_INTLIST = 101; | |
public static final int TYPE_CLASS_FLOATLIST = 102; | |
public static final int TYPE_CLASS_STRINGLIST = 103; | |
public static final int TYPE_CLASS_FSLIST = 104; | |
public static int PP_LINE_LENGTH = 120; | |
public static int PP_ELEMENTS = 30; // number of elements to do before nl | |
public static AtomicInteger errorCount = new AtomicInteger(0); | |
final static Comparator<TypeImpl> COMPARATOR_SHORT_TYPENAME = new Comparator<TypeImpl>() { | |
public int compare(TypeImpl object1, TypeImpl object2) { | |
return object1.getShortName().compareTo(object2.getShortName()); | |
} | |
}; | |
TypeSystemImpl filterTypeSystem; | |
ErrorHandler errorHandler = null; | |
// UIMA logger, to which we may write warnings | |
Logger logger; | |
public boolean isFormattedOutput; // true for pretty printing | |
/*********************************************** | |
* C O N S T R U C T O R S * | |
***********************************************/ | |
/**
 * Creates a serializer support instance with default configuration; use the
 * setters below to change it.
 */
public CasSerializerSupport() {
}
/******************************************************** | |
* Routines to set/reset configuration * | |
********************************************************/ | |
/** | |
* set or reset the pretty print flag (default is false) | |
* @param pp true to do pretty printing of output | |
* @return the original instance, possibly updated | |
*/ | |
public CasSerializerSupport setPrettyPrint(boolean pp) { | |
this.isFormattedOutput = pp; | |
return this; | |
} | |
/** | |
* pass in a type system to use for filtering what gets serialized; | |
* only those types and features which are defined this type system are included. | |
* @param ts the filter | |
* @return the original instance, possibly updated | |
*/ | |
public CasSerializerSupport setFilterTypes(TypeSystemImpl ts) { | |
this.filterTypeSystem = ts; | |
return this; | |
} | |
// for testing | |
public TypeSystemImpl getFilterTypes() { | |
return filterTypeSystem; | |
} | |
// not done here, done on serialize call, different (typically) for each call | |
// /** | |
// * set the Marker to specify delta cas serialization | |
// * @param m - the marker | |
// * @return the original instance, possibly updated | |
// */ | |
// public CasSerializerSupport setDeltaCas(Marker m, XmiSerializationSharedData sharedData) { | |
// this.marker = (MarkerImpl) m; | |
// this.sharedData = sharedData; | |
// return this; | |
// } | |
/** | |
* set an error handler to receive information about errors | |
* @param eh the error handler | |
* @return the original instance, possibly updated | |
*/ | |
public CasSerializerSupport setErrorHandler(ErrorHandler eh) { | |
this.errorHandler = eh; | |
return this; | |
} | |
/*********************************************** | |
* Methods used to serialize items | |
* Separate implementations for JSON and Xmi | |
* | |
***********************************************/ | |
public static abstract class CasSerializerSupportSerialize { | |
abstract protected void initializeNamespaces(); | |
abstract protected void checkForNameCollision(XmlElementName xmlElementName); | |
abstract protected void addNameSpace(XmlElementName xmlElementName); | |
abstract protected XmlElementName uimaTypeName2XmiElementName(String typeName); | |
abstract protected void writeFeatureStructures(int elementCount) throws Exception; | |
abstract protected void writeViews() throws Exception; | |
abstract protected void writeView(int sofaAddr, int[] members) throws Exception; | |
abstract protected void writeView(int sofaAddr, int[] added, int[] deleted, int[] reindexed) throws Exception; | |
/** | |
* | |
* @param addr - | |
* @param typeCode - | |
* @return true if writing out referenced items (JSON) | |
* @throws Exception - | |
*/ | |
abstract protected boolean writeFsStart(int addr, int typeCode) throws Exception; | |
abstract protected void writeFs(int addr, int typeCode) throws Exception; | |
abstract protected void writeListsAsIndividualFSs(int addr, int typeCode) throws Exception; | |
abstract protected void writeArrays(int addr, int typeCode, int typeClass) throws Exception; | |
abstract protected void writeEndOfIndividualFs() throws Exception; | |
abstract protected void writeEndOfSerialization() throws Exception; | |
abstract protected void writeFsRef(int addr) throws Exception; | |
} | |
/** | |
* Use an inner class to hold the data for serializing a CAS. Each call to serialize() creates its | |
* own instance. | |
* | |
* package private to allow a test case to access | |
* not static to share the logger and the initializing values (could be changed) | |
*/ | |
public class CasDocSerializer { | |
// The CAS we're serializing. | |
public final CASImpl cas; | |
public final TypeSystemImpl tsi; | |
/** | |
* set of FSs that have been enqueued to be serialized | |
* Computed during "enqueue" phase, prior to encoding | |
* Used to prevent duplicate enqueuing | |
*/ | |
public final PositiveIntSet_impl visited_not_yet_written; | |
/** | |
* set of FSs that have multiple references | |
* This is for JSON which is computing the multi-refs, not depending on the setting in a feature. | |
*/ | |
public final PositiveIntSet multiRefFSs; | |
/* ********************************************* | |
* FSs that need to be serialized because they're | |
* a) in an index | |
* b) in the set of previously serialized FS which have ids (that is, they weren't previously embedded) | |
* c) (delta only) have a feature which has an embedded value some part of which changed (no id) | |
* | |
* d) the set of FSs that are reachable via FSrefs from the above 3 sets | |
*/ | |
public IntVector previouslySerializedFSs = null; | |
public IntVector modifiedEmbeddedValueFSs = null; | |
// All FSs that are in an index somewhere. | |
public final IntVector[] indexedFSs; | |
// only referenced FSs. | |
private final IntVector queue; | |
// utilities for dealing with CAS list types | |
public final ListUtils listUtils; | |
public XmlElementName[] typeCode2namespaceNames; // array, indexed by type code, giving XMI names for each type | |
private final BitSet typeUsed; // identifies types being serialized, a subset of all possible types | |
public boolean needNameSpaces = true; // may be false; currently for JSON only | |
/** | |
* map from a namespace expanded form to the namespace prefix, to identify potential collisions when | |
* generating a namespace string | |
*/ | |
public final Map<String, String> nsUriToPrefixMap = new HashMap<String, String>(); | |
/** | |
* the set of all namespace prefixes used, to disallow some if they are | |
* in use already in set-aside data (xmi serialization) being merged back in | |
*/ | |
public final Set<String> nsPrefixesUsed = new HashSet<String>(); | |
/** | |
* Used to tell if a FS was created before or after mark. | |
*/ | |
public final MarkerImpl marker; | |
/** | |
* for Delta serialization, holds the info gathered from deserialization needed for delta serialization | |
* and for handling out-of-type-system data for both plain and delta serialization | |
*/ | |
public final XmiSerializationSharedData sharedData; | |
/** | |
* Whether the serializer needs to serialize only the deltas, that is, new FSs created after | |
* mark represented by Marker object and preexisting FSs and Views that have been | |
* modified. Set to true if Marker object is not null and CASImpl object of this serialize | |
* matches the CASImpl in Marker object. | |
*/ | |
public final boolean isDelta; | |
/** | |
* Whether the serializer needs to check for filtered-out types/features. Set to true if type | |
* system of CAS does not match type system that was passed to constructor of serializer. | |
*/ | |
public final boolean isFiltering; | |
private TypeImpl[] sortedUsedTypes; | |
private final ErrorHandler errorHandler; | |
public TypeSystemImpl filterTypeSystem; | |
// map to reduce string usage by reusing equal string representations; lives just for one serialize call | |
private final Map<String, String> uniqueStrings = new HashMap<String, String>(); | |
public final boolean isFormattedOutput; | |
private final CasSerializerSupportSerialize csss; | |
/*********************************************** | |
* C O N S T R U C T O R S * | |
***********************************************/ | |
/**
 * Creates a per-call serializer that does not track multiple references dynamically;
 * delegates to the six-argument constructor with {@code trackMultiRefs = false}.
 *
 * @param ch the content handler (passed through to the full constructor)
 * @param cas the CAS to serialize
 * @param sharedData data kept from a previous deserialization, may be null
 * @param marker the delta marker, or null for a non-delta serialization
 * @param csss the format-specific callbacks
 */
public CasDocSerializer(ContentHandler ch,
                        CASImpl cas,
                        XmiSerializationSharedData sharedData,
                        MarkerImpl marker,
                        CasSerializerSupportSerialize csss) {
  this(ch, cas, sharedData, marker, csss, false);
}
/**
 * Creates a per-call serializer.
 *
 * @param ch the content handler (not used by this constructor itself)
 * @param cas the CAS to serialize
 * @param sharedData data kept from a previous deserialization, may be null
 * @param marker the delta marker, or null for a non-delta serialization
 * @param csss the format-specific callbacks
 * @param trackMultiRefs true to record which FSs are referenced more than once
 * @throws CASRuntimeException if a marker is supplied but is no longer valid
 */
public CasDocSerializer(ContentHandler ch,
                        CASImpl cas,
                        XmiSerializationSharedData sharedData,
                        MarkerImpl marker,
                        CasSerializerSupportSerialize csss,
                        boolean trackMultiRefs) {
  this.cas = cas;
  this.csss = csss;
  this.sharedData = sharedData;
  this.marker = marker;

  // Snapshot the outer configuration so that the outer instance stays thread-safe.
  this.filterTypeSystem = CasSerializerSupport.this.filterTypeSystem;
  this.isFormattedOutput = CasSerializerSupport.this.isFormattedOutput;
  this.errorHandler = CasSerializerSupport.this.errorHandler;

  this.tsi = cas.getTypeSystemImpl();
  this.visited_not_yet_written = new PositiveIntSet_impl();
  this.queue = new IntVector();
  this.indexedFSs = new IntVector[cas.getBaseSofaCount()]; // one slot per view
  this.listUtils = new ListUtils(cas, logger, this.errorHandler);
  this.typeUsed = new BitSet();
  this.isFiltering = (this.filterTypeSystem != null) && (this.filterTypeSystem != this.tsi);

  if (marker != null && !marker.isValid()) {
    throw new CASRuntimeException(
        CASRuntimeException.INVALID_MARKER, new String[] { "Invalid Marker." });
  }

  this.isDelta = (marker != null);
  if (trackMultiRefs) {
    this.multiRefFSs = new PositiveIntSet_impl();
  } else {
    this.multiRefFSs = null;
  }
}
// TODO: internationalize | |
private void reportMultiRefWarning(int featCode) throws SAXException { | |
String message = String.format("Feature %s is marked multipleReferencesAllowed=false, but it has" | |
+ " multiple references. These will be serialized in duplicate.", | |
tsi.ll_getFeatureForCode(featCode).getName()); | |
MessageReport.decreasingWithTrace(errorCount, message, logger); | |
if (this.errorHandler != null) { | |
this.errorHandler.warning(new SAXParseException(message, null)); | |
} | |
} | |
/** | |
* Starts serialization | |
* @throws Exception - | |
*/ | |
public void serialize() throws Exception { | |
typeCode2namespaceNames = new XmlElementName[tsi.getLargestTypeCode() + 1]; | |
csss.initializeNamespaces(); | |
int iElementCount = 1; // start at 1 to account for special NULL object | |
enqueueIndexed(); // done first - to insure this has priority | |
enqueueIncoming(); //make sure we enqueue every FS that was deserialized into this CAS | |
// needed to support Out Of Typesystem data | |
enqueueNonsharedMultivaluedFS(); // needed for delta serialization of modified embedded lists/arrays | |
enqueueFeaturesOfIndexed(); // and incoming and modified embedded refs | |
iElementCount += (previouslySerializedFSs == null) ? 0 : previouslySerializedFSs.size(); | |
iElementCount += (modifiedEmbeddedValueFSs == null) ? 0 : modifiedEmbeddedValueFSs.size(); | |
for (IntVector fss : indexedFSs) { | |
iElementCount += (fss == null) ? 0 : fss.size(); | |
} | |
iElementCount += queue.size(); | |
FSIndex<FeatureStructure> sofaIndex = cas.getBaseCAS().indexRepository.getIndex(CAS.SOFA_INDEX_NAME); | |
if (!isDelta) { | |
iElementCount += (sofaIndex.size()); // one View element per sofa | |
iElementCount += getElementCountForSharedData(); | |
} else { | |
int numViews = cas.getBaseSofaCount(); | |
for (int sofaNum = 1; sofaNum <= numViews; sofaNum++) { | |
FSIndexRepositoryImpl loopIR = (FSIndexRepositoryImpl) cas.getBaseCAS().getSofaIndexRepository(sofaNum); | |
if (loopIR != null && loopIR.isModified()) { | |
iElementCount++; | |
} | |
} | |
} | |
csss.writeFeatureStructures(iElementCount); | |
csss.writeViews(); | |
csss.writeEndOfSerialization(); | |
} | |
/** | |
* | |
* @param sofaNum - starts at 1 | |
* @return the addr of the sofa FS, or 0 | |
*/ | |
public int getSofaAddr(int sofaNum) { | |
if (sofaNum != 1 || cas.isInitialSofaCreated()) { //skip if initial view && no Sofa yet | |
// all non-initial-views must have a sofa | |
return ((CASImpl)cas.getView(sofaNum)).getSofaRef(); | |
} | |
return 0; | |
} | |
public void writeViewsCommons() throws Exception { | |
// Get indexes for each SofaFS in the CAS | |
int numViews = cas.getBaseSofaCount(); | |
for (int sofaNum = 1; sofaNum <= numViews; sofaNum++) { | |
FSIndexRepositoryImpl loopIR = (FSIndexRepositoryImpl) cas.getBaseCAS().getSofaIndexRepository(sofaNum); | |
final int sofaAddr = getSofaAddr(sofaNum); | |
if (loopIR != null) { | |
if (!isDelta) { | |
int[] fsarray = loopIR.getIndexedFSs(); | |
csss.writeView(sofaAddr, fsarray); | |
} else { // is Delta Cas | |
if (sofaNum != 1 && this.marker.isNew(sofaAddr)) { | |
// for views created after mark (initial view never is - it is always created with the CAS) | |
// write out the view as new | |
int[] fsarray = loopIR.getIndexedFSs(); | |
csss.writeView(sofaAddr, fsarray); | |
} else if (loopIR.isModified()) { | |
csss.writeView(sofaAddr, loopIR.getAddedFSs(), loopIR.getDeletedFSs(), loopIR.getReindexedFSs()); | |
} | |
} | |
} | |
} | |
} | |
// sort is by shortname of type | |
public TypeImpl[] getSortedUsedTypes() { | |
if (null == sortedUsedTypes) { | |
sortedUsedTypes = new TypeImpl[typeUsed.cardinality()]; | |
int i = 0; | |
for (TypeImpl ti : getUsedTypesIterable()) { | |
sortedUsedTypes[i++] = ti; | |
} | |
Arrays.sort(sortedUsedTypes, COMPARATOR_SHORT_TYPENAME); | |
} | |
return sortedUsedTypes; | |
} | |
private Iterable<TypeImpl> getUsedTypesIterable() { | |
return new Iterable<TypeImpl>() { | |
public Iterator<TypeImpl> iterator() { | |
return new Iterator<TypeImpl>() { | |
private int i = 0; | |
public boolean hasNext() { | |
return typeUsed.nextSetBit(i) >= 0; | |
} | |
public TypeImpl next() { | |
final int next_i = typeUsed.nextSetBit(i); | |
if (next_i < 0) { | |
throw new NoSuchElementException(); | |
} | |
i = next_i + 1; | |
return (TypeImpl) tsi.ll_getTypeForCode(next_i); | |
} | |
public void remove() { | |
throw new UnsupportedOperationException(); | |
} | |
}; | |
} | |
}; | |
} | |
// private StringPair[] getSortedPrefixUri() { | |
// StringPair[] r = new StringPair[nsUriToPrefixMap.size()]; | |
// int i = 0; | |
// for (Map.Entry<String,String> e : nsUriToPrefixMap.entrySet()) { | |
// r[i++] = new StringPair(e.getValue(), e.getKey()); | |
// } | |
// Arrays.sort(r); | |
// return r; | |
// } | |
/** | |
* Enqueues all FS that are stored in the sharedData's id map. | |
* This map is populated during the previous deserialization. This method | |
* is used to make sure that all incoming FS are echoed in the next | |
* serialization. It is required if there are out-of-type FSs that | |
* are being merged back into the serialized form; those might | |
* reference some of these. | |
*/ | |
private void enqueueIncoming() { | |
if (sharedData == null) | |
return; | |
int[] fsAddrs = this.sharedData.getAllFsAddressesInIdMap(); | |
previouslySerializedFSs = new IntVector(); | |
for (int addr : fsAddrs) { | |
// don't enqueue id 0 - this is the "null" fs, which is automatically serialized by xmi | |
if (addr == 0 || | |
(isDelta && !marker.isModified(addr))) { | |
continue; | |
} | |
// is the first instance, but skip if delta and not modified or above the line or filtered | |
int typeCode = enqueueCommon(addr); | |
if (typeCode == -1) { | |
continue; | |
} | |
previouslySerializedFSs.add(addr); | |
} | |
} | |
/** | |
* add the indexed FSs onto the indexedFSs by view. | |
* add the SofaFSs onto the by-ref queue | |
*/ | |
private void enqueueIndexed() { | |
FSIndexRepositoryImpl ir = (FSIndexRepositoryImpl) cas.getBaseCAS().getBaseIndexRepository(); | |
int[] fsarray = ir.getIndexedFSs(); | |
try { | |
for (int fs : fsarray) { | |
enqueue(fs); // put on by-ref queue | |
} | |
} catch (SAXException e) { | |
throw new RuntimeException("Internal error - should never happen", e); | |
} | |
// FSIndex sofaIndex = cas.getBaseCAS().indexRepository.getIndex(CAS.SOFA_INDEX_NAME); | |
// FSIterator iterator = sofaIndex.iterator(); | |
// // Get indexes for each SofaFS in the CAS | |
// while (iterator.isValid()) | |
int numViews = cas.getBaseSofaCount(); | |
for (int sofaNum = 1; sofaNum <= numViews; sofaNum++) { | |
// SofaFS sofa = (SofaFS) iterator.get(); | |
// int sofaNum = sofa.getSofaRef(); | |
// iterator.moveToNext(); | |
FSIndexRepositoryImpl loopIR = (FSIndexRepositoryImpl) cas.getBaseCAS() | |
.getSofaIndexRepository(sofaNum); | |
if (loopIR != null) { | |
fsarray = loopIR.getIndexedFSs(); | |
for (int fs : fsarray) { | |
enqueueIndexedFs(sofaNum, fs); | |
} | |
} | |
} | |
} | |
/** | |
* When serializing Delta CAS, | |
* enqueue encompassing FS of nonshared multivalued FS that have been modified. | |
* The embedded nonshared-multivalued item could be a list or an array | |
*/ | |
private void enqueueNonsharedMultivaluedFS() { | |
if (sharedData == null || !isDelta) | |
return; | |
int[] fsAddrs = sharedData.getNonsharedMulitValuedFSs(); | |
modifiedEmbeddedValueFSs = new IntVector(); | |
for (int addr : fsAddrs) { | |
if (marker.isModified(addr)) { | |
int encompassingFs = sharedData.getEncompassingFS(addr); | |
if (-1 != enqueueCommonWithoutDeltaAndFilteringCheck(encompassingFs)) { // only to set type used info and check if already enqueued | |
modifiedEmbeddedValueFSs.add(encompassingFs); | |
} | |
} | |
} | |
} | |
/** | |
* Enqueue everything reachable from features of indexed FSs. | |
*/ | |
private void enqueueFeaturesOfIndexed() throws SAXException { | |
if (null != previouslySerializedFSs) { | |
enqueueFeaturesOfFSs(previouslySerializedFSs); | |
} | |
if (null != modifiedEmbeddedValueFSs) { | |
enqueueFeaturesOfFSs(modifiedEmbeddedValueFSs); | |
} | |
for (IntVector fss : indexedFSs) { | |
if (fss != null) { | |
enqueueFeaturesOfFSs(fss); | |
} | |
} | |
} | |
private void enqueueFeaturesOfFSs(final IntVector fss) throws SAXException { | |
final int max = fss.size(); | |
for (int i = 0; i < max; i++) { | |
int addr = fss.get(i); | |
int heapVal = cas.getHeapValue(addr); | |
enqueueFeatures(addr, heapVal); | |
} | |
} | |
int enqueueCommon(int addr) { | |
return enqueueCommon(addr, true); | |
} | |
int enqueueCommonWithoutDeltaAndFilteringCheck(int addr) { | |
return enqueueCommon(addr, false); | |
} | |
private int enqueueCommon(int addr, boolean doDeltaAndFilteringCheck) { | |
final int typeCode = cas.getHeapValue(addr); | |
assert(typeCode != 0); | |
if (doDeltaAndFilteringCheck) { | |
if (isDelta) { | |
if (!marker.isNew(addr) && !marker.isModified(addr)) { | |
return -1; | |
} | |
} | |
if (isFiltering) { | |
String typeName = tsi.ll_getTypeForCode(typeCode).getName(); | |
if (filterTypeSystem.getType(typeName) == null) { | |
return -1; // this type is not in the target type system | |
} | |
} | |
} | |
// We set visited only if we're going to enqueue this. | |
// (In other words, please don't move this up in this method) | |
// This handles the use case: | |
// delta cas; element is not modified, but at some later point, we determine | |
// an embedded feature value (array or list) is modified, which requires we serialize out this | |
// fs as if it was modified. | |
if (!visited_not_yet_written.add(addr)) { | |
// was already visited; means this FS has multiple references, either from FS feature(s) or indexes or both | |
if (null != multiRefFSs) { | |
boolean wasAdded = multiRefFSs.add(addr); | |
if (wasAdded) { | |
queue.add(addr); // if was in indexed set before, isn't in the queue set, but needs to be | |
} | |
} | |
return -1; | |
} | |
boolean alreadySet = typeUsed.get(typeCode); | |
if (!alreadySet) { | |
typeUsed.set(typeCode); | |
String typeName = tsi.ll_getTypeForCode(typeCode).getName(); | |
XmlElementName newXel = csss.uimaTypeName2XmiElementName(typeName); | |
if (!needNameSpaces) { // means if name spaces are not not always needed, then we have to check for collision | |
csss.checkForNameCollision(newXel); // executed for JSON code | |
} | |
typeCode2namespaceNames[typeCode] = newXel; | |
} | |
return typeCode; | |
} | |
/* | |
* Enqueues an indexed FS. Does NOT enqueue features at this point. | |
* Doesn't enqueue non-modified FS when delta | |
*/ | |
void enqueueIndexedFs(int viewNumber, int addr) { | |
if (enqueueCommon(addr) != -1) { | |
IntVector fss = indexedFSs[viewNumber - 1]; | |
if (null == fss) { | |
indexedFSs[viewNumber - 1] = fss = new IntVector(); | |
} | |
fss.add(addr); | |
} | |
} | |
/** | |
* Enqueue an FS, and everything reachable from it. | |
* | |
* This call is recursive with enqueueFeatures, \ | |
* and an arbitrary long chain can get stack overflow error. | |
* Probably should fix this someday. See https://issues.apache.org/jira/browse/UIMA-106 | |
* | |
* @param addr | |
* The FS address. | |
*/ | |
private void enqueue(int addr) throws SAXException { | |
int typeCode = enqueueCommon(addr); | |
if (typeCode == -1) { | |
return; | |
} | |
queue.add(addr); | |
enqueueFeatures(addr, typeCode); | |
// Also, for FSArrays enqueue the elements | |
if (cas.isFSArrayType(typeCode)) { //TODO: won't get parameterized arrays?? | |
enqueueFSArrayElements(addr); | |
} | |
} | |
boolean isArrayOrList(int typeCode) { | |
return | |
isArrayType(typeCode) || | |
isListType(typeCode); | |
} | |
private boolean isArrayType(int typeCode) { | |
return | |
(typeCode == TypeSystemImpl.intArrayTypeCode) || | |
(typeCode == TypeSystemImpl.floatArrayTypeCode) || | |
(typeCode == TypeSystemImpl.stringArrayTypeCode) || | |
(typeCode == TypeSystemImpl.fsArrayTypeCode) || | |
(typeCode == TypeSystemImpl.booleanArrayTypeCode) || | |
(typeCode == TypeSystemImpl.byteArrayTypeCode) || | |
(typeCode == TypeSystemImpl.shortArrayTypeCode) || | |
(typeCode == TypeSystemImpl.longArrayTypeCode) || | |
(typeCode == TypeSystemImpl.doubleArrayTypeCode); | |
} | |
private boolean isListType(int typeCode) { | |
return | |
listUtils.isIntListType(typeCode) || | |
listUtils.isFloatListType(typeCode) || | |
listUtils.isStringListType(typeCode) || | |
listUtils.isFsListType(typeCode); | |
} | |
/** | |
* | |
* @param curNode | |
* @param featCode | |
* @return true if OK, false if found cycle or multi-ref | |
* @throws SAXException | |
*/ | |
private boolean isListElementsMultiplyReferenced(int listNode, int featCode) throws SAXException { | |
int typeCode = cas.getHeapValue(listNode); // could be end | |
int neListType = listUtils.getNeListType(typeCode); | |
int tailFeat = listUtils.getTailFeatCode(typeCode); | |
boolean foundCycle = false; | |
int curNode = listNode; | |
// if (listNode == 14284) { // debug | |
// System.out.println(listNode); //debug | |
// } | |
while (typeCode == neListType) { // stop on end or 0 | |
if (!visited_not_yet_written.add(curNode)) { | |
foundCycle = true; | |
break; | |
} | |
curNode = cas.getHeapValue(curNode + cas.getFeatureOffset(tailFeat)); | |
typeCode = cas.getHeapValue(curNode); | |
} | |
return foundCycle; | |
} | |
private boolean isMultiRef_enqueue(int featCode, int featVal, boolean alreadyVisited, boolean isListNode, boolean isListFeat) throws SAXException { | |
if (multiRefFSs == null) { | |
// dynamic embedding is turned off - compute static embedding just for lists and arrays | |
boolean multiRefAllowed = isStaticMultiRef(featCode) || isListNode; | |
if (!multiRefAllowed) { | |
// two cases: a list or non-list | |
// if a list, check/mark all the nodes in the list | |
if ((isListFeat && isListElementsMultiplyReferenced(featVal, featCode)) || | |
(!isListFeat && alreadyVisited)) { | |
reportMultiRefWarning(featCode); | |
} else { | |
if (!isListFeat) { // already added visited for list nodes | |
visited_not_yet_written.add(featVal); | |
} | |
} | |
return false; // because static, multi-ref not allowed, no need to enqueue | |
} else { // is multiRefAllowed or in list node | |
return true; // static, multi-ref allowed or in list node, enqueue | |
} | |
} | |
// doing dynamic determination of multi-refs | |
if (alreadyVisited) { | |
return !multiRefFSs.contains(featVal); // enqueue in the "queue" section, first time this happens | |
} | |
return true; // enqueue this item. May or may not be eventually written embedded | |
// but we enqueue to track multi-use | |
} | |
/** | |
* Enqueue all FSs reachable from features of the given FS. | |
* | |
* @param addr | |
* address of an FS | |
* @param typeCode | |
* type of the FS | |
* @param insideListNode | |
* true iff the enclosing FS (addr) is a list type | |
*/ | |
private void enqueueFeatures(int addr, int typeCode) throws SAXException { | |
/** | |
* Handle FSArrays | |
*/ | |
if (typeCode == TypeSystemImpl.fsArrayTypeCode) { | |
final int array_size = cas.ll_getArraySize(addr); | |
int position = cas.getArrayStartAddress(addr); | |
for (int j = 0; j < array_size; j++) { | |
final int fsRef = cas.getHeapValue(position++); | |
if (isFiltering) { | |
String typeName = tsi.ll_getTypeForCode(cas.getHeapValue(fsRef)).getName(); | |
if (filterTypeSystem.getType(typeName) == null) { | |
continue; // don't enqueue this type because it's filtered out | |
} | |
} | |
if (fsRef != CASImpl.NULL) { // null feature values do not refer to any other FS | |
enqueue(fsRef); | |
} | |
} | |
return; | |
} | |
boolean insideListNode = listUtils.isListType(typeCode); | |
int[] feats = tsi.ll_getAppropriateFeatures(typeCode); | |
for (int feat : feats) { | |
if (isFiltering) { | |
// skip features that aren't in the target type system | |
String fullFeatName = tsi.ll_getFeatureForCode(feat).getName(); | |
if (filterTypeSystem.getFeatureByFullName(fullFeatName) == null) { | |
continue; | |
} | |
} | |
final int featAddr = addr + cas.getFeatureOffset(feat); | |
final int featVal = cas.getHeapValue(featAddr); | |
if (featVal == CASImpl.NULL) { // null feature values do not refer to any other FS | |
continue; | |
} | |
// enqueue behavior depends on range type of feature | |
final int fsClass = classifyType(tsi.range(feat)); | |
switch (fsClass) { | |
case LowLevelCAS.TYPE_CLASS_FS: { | |
enqueue(featVal); | |
break; | |
} | |
case LowLevelCAS.TYPE_CLASS_INTARRAY: | |
case LowLevelCAS.TYPE_CLASS_FLOATARRAY: | |
case LowLevelCAS.TYPE_CLASS_STRINGARRAY: | |
case LowLevelCAS.TYPE_CLASS_BOOLEANARRAY: | |
case LowLevelCAS.TYPE_CLASS_BYTEARRAY: | |
case LowLevelCAS.TYPE_CLASS_SHORTARRAY: | |
case LowLevelCAS.TYPE_CLASS_LONGARRAY: | |
case LowLevelCAS.TYPE_CLASS_DOUBLEARRAY: | |
case LowLevelCAS.TYPE_CLASS_FSARRAY: { | |
// we enqueue arrays if: | |
// when statically using multipleReferencesAllowed flag: | |
// when that says it's multiply referenced; | |
// otherwise, we skip enqueueing it because it will | |
// be picked up when serializing the feature | |
// when dynamically computing multiple-refs: we enqueue it | |
// unless already enqueued, in order to pick up any multiple refs | |
final boolean alreadyVisited = visited_not_yet_written.contains(featVal); | |
if (isMultiRef_enqueue(feat, featVal, alreadyVisited, false, false)) { | |
enqueue(featVal); // will add to queue list 1st time multi-ref detected | |
// otherwise, it is singly referenced (so far) and will be embedded | |
// (or has already been enqueued, in dynamic embedding mode), so don't enqueue | |
} else if (fsClass == LowLevelCAS.TYPE_CLASS_FSARRAY && !alreadyVisited) { | |
// enqueue any FSs reachable from an FSArray | |
enqueueFSArrayElements(featVal); | |
} | |
break; | |
} | |
case TYPE_CLASS_INTLIST: | |
case TYPE_CLASS_FLOATLIST: | |
case TYPE_CLASS_STRINGLIST: | |
case TYPE_CLASS_FSLIST: { | |
// we enqueue lists if: | |
// when statically using multipleReferencesAllowed flag: | |
// when that says it's multiply referenced or | |
// we're inside a list which was earlier multiply referenced | |
// otherwise, we skip enqueueing it because it will | |
// be picked up when serializing the feature | |
// when dynamically computing multiple-refs: we enqueue it | |
// unless already enqueued, in order to pick up any multiple refs | |
final boolean alreadyVisited = visited_not_yet_written.contains(featVal); | |
if (isMultiRef_enqueue(feat, featVal, alreadyVisited, insideListNode, true)) { | |
enqueue(featVal); | |
} else if (fsClass == TYPE_CLASS_FSLIST && !alreadyVisited) { | |
// also, we need to enqueue any FSs reachable from an FSList | |
enqueueFSListElements(featVal); | |
} | |
break; | |
} | |
} | |
} // end of loop over all features | |
} | |
/** | |
* Enqueues all FS reachable from an FSArray. | |
* | |
* @param addr | |
* Address of an FSArray | |
*/ | |
private void enqueueFSArrayElements(int addr) throws SAXException { | |
final int size = cas.ll_getArraySize(addr); | |
int pos = cas.getArrayStartAddress(addr); | |
int val; | |
for (int i = 0; i < size; i++) { | |
val = cas.getHeapValue(pos); | |
if (val != CASImpl.NULL) { | |
enqueue(val); | |
} | |
++pos; | |
} | |
} | |
/** | |
* Enqueues all FS reachable from an FSList. This does NOT include the list nodes themselves. | |
* | |
* @param addr | |
* Address of an FSList | |
*/ | |
private void enqueueFSListElements(int addr) throws SAXException { | |
int[] addrArray = listUtils.fsListToAddressArray(addr); | |
for (int j = 0; j < addrArray.length; j++) { | |
if (addrArray[j] != CASImpl.NULL) { | |
enqueue(addrArray[j]); | |
} | |
} | |
} | |
/* | |
* Encode the indexed FS in the queue. | |
*/ | |
public void encodeIndexed() throws Exception { | |
if (null != previouslySerializedFSs) { | |
encodeFSs(previouslySerializedFSs); | |
} | |
if (null != modifiedEmbeddedValueFSs) { | |
encodeFSs(modifiedEmbeddedValueFSs); | |
} | |
for (IntVector fss : indexedFSs) { | |
if (fss != null) { | |
encodeFSs(fss); | |
} | |
} | |
} | |
private void encodeFSs(final IntVector fss) throws Exception { | |
final int max = fss.size(); | |
for (int i = 0; i < max; i++) { | |
encodeFS(fss.get(i)); | |
} | |
} | |
/* | |
* Encode all other enqueued (non-indexed) FSs. | |
* The queue is read out in FiFo order. | |
* This insures that FsLists which are only | |
* referenced via a single FS ref, get | |
* encoded as [ x x x ] format rather than | |
* as individual FSs (because the individual | |
* items are also in the queue as items, but | |
* later). The isWritten test prevents dupl writes | |
*/ | |
public void encodeQueued() throws Exception { | |
int[] queueArray = queue.toArray(); | |
for (int addr : queueArray) { | |
// for some serializers, things could be enqueued multiple times in the ref queue | |
// so check if already written, and if so, skip | |
// Case where this happens: JSON serialization with dynamically determined single ref embedding | |
// - have to enqueue to check if multiple refs, even if embedding eventually | |
if (visited_not_yet_written.contains(addr)) { | |
if (null != multiRefFSs && !multiRefFSs.contains(addr)) { | |
continue; // skip writing embeddable item (for JSON dynamic embedding) from Q; will be written from reference | |
} | |
encodeFS(addr); | |
} | |
} | |
} | |
// public Integer[] collectAllFeatureStructures() { | |
// final int indexedSize = indexedFSs.size(); | |
// final int qSize = queue.size(); | |
// final int rLen = indexedSize + queue.size(); | |
// Integer[] r = new Integer[rLen]; | |
// int i = 0; | |
// for (; i < indexedSize; i++) { | |
// r[i] = indexedFSs.get(i); | |
// } | |
// for (int j = 0; j < qSize; j++) { | |
// r[i++] = queue.get(j); | |
// } | |
// return r; | |
// } | |
private int compareInts(int i1, int i2) { | |
return (i1 == i2) ? 0 : | |
(i1 > i2) ? 1 : -1; | |
} | |
private int compareFeat(int o1, int o2, int featCode) { | |
final int f1 = cas.ll_getIntValue(o1, featCode); | |
final int f2 = cas.ll_getIntValue(o2, featCode); | |
return compareInts(f1, f2); | |
} | |
/** | |
* sort a view, by type and then by begin/end asc/des for subtypes of Annotation, | |
* then by id | |
*/ | |
public final Comparator<Integer> sortFssByType = | |
new Comparator<Integer>() { | |
public int compare(Integer o1, Integer o2) { | |
final int typeCode1 = cas.getHeapValue(o1); | |
final int typeCode2 = cas.getHeapValue(o2); | |
int c = compareInts(typeCode1, typeCode2); | |
if (c != 0) { | |
return c; | |
} | |
// final boolean hasSofa = tsi.subsumes(tsi.annotBaseTypeCode, typeCode1); | |
// if (hasSofa) { | |
// c = compareFeat(o1, o2, tsi.annotSofaFeatCode); | |
// if (c != 0) { | |
// return c; | |
// } | |
final boolean isAnnot = tsi.subsumes(TypeSystemImpl.annotTypeCode, typeCode1); | |
if (isAnnot) { | |
c = compareFeat(o1, o2, TypeSystemImpl.startFeatCode); | |
return (c != 0) ? c : compareFeat(o2, o1, TypeSystemImpl.endFeatCode); // reverse order | |
} | |
// not sofa nor annotation | |
return compareInts(o1, o2); // return in @id order | |
} | |
}; | |
/** | |
* Encode an individual FS. | |
* | |
* Json has 2 encodings | |
* For type: | |
* "typeName" : [ { "@id" : 123, feat : value .... }, | |
* { "@id" : 456, feat : value .... }, | |
* ... | |
* ], | |
* ... | |
* | |
* For id: | |
* "nnnn" : {"@type" : typeName ; feat : value ...} | |
* | |
* For cases where the top level type is an array or list, there is | |
* a generated feature name, "@collection" whose value is | |
* the list or array of values associated with that type. | |
* | |
* @param addr | |
* The address to be encoded. | |
* @throws SAXException passthru | |
*/ | |
public void encodeFS(int addr) throws Exception { | |
final int typeCode = cas.getHeapValue(addr); | |
final int typeClass = classifyType(typeCode); | |
boolean isIndexId = csss.writeFsStart(addr, typeCode); | |
if (!isIndexId && multiRefFSs != null && multiRefFSs.contains(addr)) { | |
csss.writeFsRef(addr); | |
} else { | |
visited_not_yet_written.remove(addr); // mark as written | |
switch (typeClass) { | |
case LowLevelCAS.TYPE_CLASS_FS: | |
csss.writeFs(addr, typeCode); | |
break; | |
case TYPE_CLASS_INTLIST: | |
case TYPE_CLASS_FLOATLIST: | |
case TYPE_CLASS_STRINGLIST: | |
case TYPE_CLASS_FSLIST: | |
csss.writeListsAsIndividualFSs(addr, typeCode); | |
break; | |
case LowLevelCAS.TYPE_CLASS_FSARRAY: | |
case LowLevelCAS.TYPE_CLASS_INTARRAY: | |
case LowLevelCAS.TYPE_CLASS_FLOATARRAY: | |
case LowLevelCAS.TYPE_CLASS_BOOLEANARRAY: | |
case LowLevelCAS.TYPE_CLASS_BYTEARRAY: | |
case LowLevelCAS.TYPE_CLASS_SHORTARRAY: | |
case LowLevelCAS.TYPE_CLASS_LONGARRAY: | |
case LowLevelCAS.TYPE_CLASS_DOUBLEARRAY: | |
case LowLevelCAS.TYPE_CLASS_STRINGARRAY: | |
csss.writeArrays(addr, typeCode, typeClass); | |
break; | |
default: | |
throw new RuntimeException("Error classifying FS type."); | |
} | |
csss.writeEndOfIndividualFs(); | |
} | |
} | |
int filterType(int addr) { | |
if (isFiltering) { | |
String typeName = tsi.ll_getTypeForCode(cas.getHeapValue(addr)).getName(); | |
if (filterTypeSystem.getType(typeName) == null) { | |
return 0; | |
} | |
} | |
return addr; | |
} | |
/** | |
* Classifies a type. This returns an integer code identifying the type as one of the primitive | |
* types, one of the array types, one of the list types, or a generic FS type (anything else). | |
* <p> | |
* The {@link LowLevelCAS#ll_getTypeClass(int)} method classifies primitives and array types, | |
* but does not have a special classification for list types, which we need for XMI | |
* serialization. Therefore, in addition to the type codes defined on {@link LowLevelCAS}, this | |
* method can return one of the type codes TYPE_CLASS_INTLIST, TYPE_CLASS_FLOATLIST, | |
* TYPE_CLASS_STRINGLIST, or TYPE_CLASS_FSLIST. | |
* | |
* @param type | |
* the type to classify | |
* @return one of the TYPE_CLASS codes defined on {@link LowLevelCAS} or on this interface. | |
*/ | |
public final int classifyType(int type) { | |
// For most most types | |
if (listUtils.isIntListType(type)) { | |
return TYPE_CLASS_INTLIST; | |
} | |
if (listUtils.isFloatListType(type)) { | |
return TYPE_CLASS_FLOATLIST; | |
} | |
if (listUtils.isStringListType(type)) { | |
return TYPE_CLASS_STRINGLIST; | |
} | |
if (listUtils.isFsListType(type)) { | |
return TYPE_CLASS_FSLIST; | |
} | |
return cas.ll_getTypeClass(type); | |
} | |
int getElementCountForSharedData() { | |
return (sharedData == null) ? 0 : sharedData.getOutOfTypeSystemElements().size(); | |
} | |
/** | |
* Get the XMI ID to use for an FS. | |
* | |
* @param addr | |
* address of FS | |
* @return XMI ID. If addr == CASImpl.NULL, returns null | |
*/ | |
public String getXmiId(int addr) { | |
int v = getXmiIdAsInt(addr); | |
return (v == 0) ? null : Integer.toString(v); | |
} | |
public int getXmiIdAsInt(int addr) { | |
if (addr == CASImpl.NULL) { | |
return 0; | |
} | |
if (isFiltering) { // return as null any references to types not in target TS | |
String typeName = tsi.ll_getTypeForCode(cas.getHeapValue(addr)).getName(); | |
if (filterTypeSystem.getType(typeName) == null) { | |
return 0; | |
} | |
} | |
if (sharedData == null) { | |
// in the absence of outside information, just use the FS address | |
return addr; | |
} else { | |
return sharedData.getXmiIdAsInt(addr); | |
} | |
} | |
public String getNameSpacePrefix(String uimaTypeName, String nsUri, int lastDotIndex) { | |
// determine what namespace prefix to use | |
String prefix = nsUriToPrefixMap.get(nsUri); | |
if (prefix == null) { | |
if (lastDotIndex != -1) { // have namespace | |
int secondLastDotIndex = uimaTypeName.lastIndexOf('.', lastDotIndex-1); | |
prefix = uimaTypeName.substring(secondLastDotIndex + 1, lastDotIndex); | |
} else { | |
prefix = "noNamespace"; // is correct for older XMI standard too | |
} | |
// make sure this prefix hasn't already been used for some other namespace | |
// including out-of-type-system types (for XmiCasSerializer) | |
if (nsPrefixesUsed.contains(prefix)) { | |
String basePrefix = prefix; | |
int num = 2; | |
while (nsPrefixesUsed.contains(basePrefix + num)) { | |
num++; | |
} | |
prefix = basePrefix + num; | |
} | |
nsUriToPrefixMap.put(nsUri, prefix); | |
nsPrefixesUsed.add(prefix); | |
} | |
return prefix; | |
} | |
/* | |
* convert to shared string, without interning, reduce GCs | |
*/ | |
public String getUniqueString(String s) { | |
String u = uniqueStrings.get(s); | |
if (null == u) { | |
u = s; | |
uniqueStrings.put(s, s); | |
} | |
return u; | |
} | |
public String getTypeNameFromXmlElementName(XmlElementName xe) { | |
final String nsUri = xe.nsUri; | |
if (nsUri == null || nsUri.length() == 0) { | |
throw new UnsupportedOperationException(); | |
} | |
final int pfx = XmiCasSerializer.URIPFX.length; | |
final int sfx = XmiCasSerializer.URISFX.length; | |
String r = (nsUri.startsWith(XmiCasSerializer.DEFAULT_NAMESPACE_URI)) ? | |
"" : | |
nsUri.substring(pfx, nsUri.length() - sfx); | |
r = r.replace('/', '.'); | |
return r + xe.localName; | |
} | |
public boolean isStaticMultiRef(int featCode) { | |
return tsi.ll_getFeatureForCode(featCode).isMultipleReferencesAllowed(); | |
} | |
} | |
} |