blob: dc57a7985de2ccfd2862c665255199a4d20d4dc5 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.uima.util;
import static org.apache.uima.cas.impl.Serialization.serializeCAS;
import static org.apache.uima.cas.impl.Serialization.serializeCASMgr;
import static org.apache.uima.cas.impl.Serialization.serializeWithCompression;

import java.io.BufferedInputStream;
import java.io.Closeable;
import java.io.DataInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.OutputStream;
import java.net.URL;
import java.util.Arrays;
import java.util.Locale;

import org.apache.uima.UIMARuntimeException;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASRuntimeException;
import org.apache.uima.cas.SerialFormat;
import org.apache.uima.cas.admin.CASMgr;
import org.apache.uima.cas.impl.CASCompleteSerializer;
import org.apache.uima.cas.impl.CASImpl;
import org.apache.uima.cas.impl.CASMgrSerializer;
import org.apache.uima.cas.impl.CASSerializer;
import org.apache.uima.cas.impl.CommonSerDes;
import org.apache.uima.cas.impl.CommonSerDes.Header;
import org.apache.uima.cas.impl.Serialization;
import org.apache.uima.cas.impl.XCASDeserializer;
import org.apache.uima.cas.impl.XCASSerializer;
import org.apache.uima.cas.impl.XmiCasDeserializer;
import org.apache.uima.cas.impl.XmiCasSerializer;
import org.xml.sax.SAXException;
/**
* a collection of static methods aimed at making it easy to
* - save and load CASes, and to
* - optionally include their Type Systems and index definitions based on those type systems (abbreviated TSI).
*
* There are several serialization formats supported; these are listed in the Java enum SerialFormat,
* together with their preferred file extension name.
*
* The APIs for loading attempt to automatically use the appropriate deserializers, based on the input data format.
* To select the right deserializer, first, the file extension name (if available) is used:
* - xmi: XMI format
* - xcas: XCAS format
* - xml: XCAS format
*
* If none of these apply, then the first few bytes of the input are examined to determine the format.
*
* For loading, the inputs may be supplied as URLs or as InputStream.
* You can use Files or Paths by converting these to URLs:
* URL url = a_path.toUri().toURL();
* URL url = a_file.toURI().toURL();
*
* When loading, an optional lenient boolean flag may be specified.
* It is observed only for the XMI and XCAS formats.
* If true, then types and/or features being deserialized which don't exist in the receiving CAS are silently ignored.
*
* When TSI is saved, it is either saved in the same destination (e.g. file or stream), or in a separate one.
* - Two serialization formats, SERIALIZED_TSI and COMPRESSED_FILTERED_TSI, support saving the TSI in the same destination.
* Other formats require the TSI to be saved to a separate OutputStream.
*
* Summary of the APIs for saving:
* save(CAS, OutputStream, SerialFormat)
* save(CAS, OutputStream, OutputStream, SerialFormat) - extra outputStream for saving the TSI
*
* Note: there is no API for saving in COMPRESSED_FILTERED with a filtering type system; to do that, use the
* methods in Serialization.serializeWithCompression
*
* Summary of APIs for loading:
* load(URL , CAS)
* load(InputStream, CAS)
*
* load(URL , URL , CAS, lenient_flag) - the second URL is for loading a separately-stored TSI
* load(InputStream, InputStream, CAS, lenient_flag)
*
* You may specify the lenient_flag without the TSI input by setting the TSI input argument to null.
*
*/
public class CasIOUtils {

  /** Number of leading bytes inspected to recognize an XML stream (length of {@code "<?xml "}). */
  private static final int FORMAT_PROBE_LENGTH = 6;

  /**
   * Loads a CAS from a URL source. The format is determined from the file extension name and the
   * content. Loading is not lenient; to use lenient loading, use the 4 argument form.
   *
   * @param casUrl
   *          The url containing the CAS
   * @param aCAS
   *          The CAS that should be filled
   * @return the SerialFormat of the loaded CAS
   * @throws IOException
   *           - Problem loading from given URL
   */
  public static SerialFormat load(URL casUrl, CAS aCAS) throws IOException {
    return load(casUrl, null, aCAS, false);
  }

  /**
   * Loads a CAS from a URL source. The format is determined from the file extension name and the
   * content.
   *
   * If the path of casUrl ends with the XMI extension, the XCAS extension or ".xml", the data is
   * handed directly to the XMI or XCAS deserializer and tsiUrl is not opened at all. Otherwise
   * both sources are opened and the work is delegated to
   * {@link #load(InputStream, InputStream, CAS, boolean)}, which inspects the content.
   *
   * To specify lenient loading without specifying an additional type system and index definition
   * source, pass null for the tsiUrl.
   *
   * @param casUrl
   *          The url to deserialize the CAS from
   * @param tsiUrl
   *          The optional url to deserialize the type system and index definitions from; may be
   *          null
   * @param aCAS
   *          The CAS that should be filled
   * @param leniently
   *          for XCAS and XMI formats, ignore feature structures and features of non-existing
   *          types and/or features. ignored for other formats.
   * @return the SerialFormat of the loaded CAS
   * @throws IOException
   *           - Problem loading from given URL; a SAXException raised while parsing a source
   *           selected by its extension is wrapped in an IOException
   */
  public static SerialFormat load(URL casUrl, URL tsiUrl, CAS aCAS, boolean leniently)
          throws IOException {
    // Locale.ROOT keeps the extension match independent of the default locale
    // (e.g. "XMI" lower-cases to a dotless i under a Turkish locale).
    String path = casUrl.getPath().toLowerCase(Locale.ROOT);

    if (path.endsWith(SerialFormat.XMI.getDefaultFileExtension())) {
      InputStream xmiIn = new BufferedInputStream(casUrl.openStream());
      try {
        XmiCasDeserializer.deserialize(xmiIn, aCAS, leniently);
        return SerialFormat.XMI;
      } catch (SAXException e) {
        throw new IOException(e);
      } finally {
        closeQuietly(xmiIn);
      }
    }

    if (path.endsWith(SerialFormat.XCAS.getDefaultFileExtension()) || path.endsWith(".xml")) {
      InputStream xcasIn = new BufferedInputStream(casUrl.openStream());
      try {
        XCASDeserializer.deserialize(xcasIn, aCAS, leniently);
        return SerialFormat.XCAS;
      } catch (SAXException e) {
        throw new IOException(e);
      } finally {
        closeQuietly(xcasIn);
      }
    }

    // Unknown extension: let the stream based loader sniff the content.
    InputStream casIn = new BufferedInputStream(casUrl.openStream());
    InputStream tsiIn = null;
    try {
      // Opened inside the try block so that a failure here still closes casIn.
      if (tsiUrl != null) {
        tsiIn = new BufferedInputStream(tsiUrl.openStream());
      }
      return load(casIn, tsiIn, aCAS, leniently);
    } finally {
      closeQuietly(casIn);
      closeQuietly(tsiIn);
    }
  }

  /**
   * Loads a CAS from an input stream. The format is determined from the content.
   * Loading is not lenient; to use lenient loading, use the 4 argument form.
   *
   * @param casInputStream
   *          The input stream containing the CAS. Caller should buffer this appropriately.
   * @param aCAS
   *          The CAS that should be filled
   * @return the SerialFormat of the loaded CAS
   * @throws IOException
   *           - Problem loading from given InputStream
   */
  public static SerialFormat load(InputStream casInputStream, CAS aCAS) throws IOException {
    return load(casInputStream, (CASMgrSerializer) null, aCAS, false);
  }

  /**
   * Loads a CAS from an input stream. The format is determined from the content.
   *
   * If tsiInputStream is not null, a Java-serialized {@link CASMgrSerializer} is read from it
   * first; whether it is actually applied is decided by
   * {@link #load(InputStream, CASMgrSerializer, CAS, boolean)}. Neither stream is closed by this
   * method.
   *
   * To specify lenient loading without specifying an additional type system and index definition
   * source, pass null for the tsiInputStream.
   *
   * @param casInputStream
   *          The input stream containing the CAS.
   * @param tsiInputStream
   *          The optional input stream containing the type system and index definitions. This is
   *          only used if the casInputStream does not already come with an embedded CAS
   *          configuration.
   * @param aCAS
   *          The CAS that should be filled
   * @param leniently
   *          for XCAS and XMI formats, ignore feature structures and features of non-existing
   *          types and/or features. ignored for other formats.
   * @return the SerialFormat of the loaded CAS
   * @throws IOException
   *           - Problem loading from given InputStream; also wraps a ClassNotFoundException
   *           raised while reading the type system object
   */
  public static SerialFormat load(InputStream casInputStream, InputStream tsiInputStream, CAS aCAS,
          boolean leniently) throws IOException {
    CASMgrSerializer casMgrSerializer = null;

    // If there is a TSI specified, load it - we will see later if we actually use it.
    if (tsiInputStream != null) {
      // Presumably mark support is used as a hint that the stream is already buffered.
      if (!tsiInputStream.markSupported()) {
        tsiInputStream = new BufferedInputStream(tsiInputStream);
      }
      try {
        // NOTE(review): Java object deserialization - only use with trusted sources.
        ObjectInputStream tsiObjectIn = new ObjectInputStream(tsiInputStream);
        casMgrSerializer = (CASMgrSerializer) tsiObjectIn.readObject();
      } catch (ClassNotFoundException e) {
        throw new IOException(e);
      }
    }

    return load(casInputStream, casMgrSerializer, aCAS, leniently);
  }

  /**
   * Loads a CAS from an input stream. The format is determined from the content:
   * <ul>
   * <li>content starting with {@code "<?xml "} (case-insensitive) is passed to
   * {@code XmlCasDeserializer.deserializeR};</li>
   * <li>content for which {@code CommonSerDes.isBinaryHeader} reports a binary header is loaded
   * via {@code CASImpl.reinit(Header, InputStream)}; a type system embedded in the stream takes
   * precedence over casMgr;</li>
   * <li>anything else is read as a Java-serialized {@link CASSerializer} or
   * {@link CASCompleteSerializer}; for the latter, casMgr is ignored.</li>
   * </ul>
   *
   * This method avoids the repeated loading of the type system and index definitions from a
   * stream when loading many CASes in a row. The stream is not closed by this method.
   *
   * @param casInputStream
   *          The input stream containing the CAS. It is wrapped in a BufferedInputStream if it
   *          does not support mark/reset.
   * @param casMgr
   *          The optional CAS configuration including type system definition and index
   *          definition in form of a {@link CASMgrSerializer}. This is only used if the
   *          casInputStream does not already come with an embedded CAS configuration.
   * @param aCAS
   *          The CAS that should be filled; must be a {@link CASImpl}
   * @param leniently
   *          for XCAS and XMI formats, ignore feature structures and features of non-existing
   *          types and/or features. ignored for other formats.
   * @return the SerialFormat of the loaded CAS
   * @throws IOException
   *           - Problem loading from given InputStream, including an empty stream
   * @throws UIMARuntimeException
   *           - wraps a SAXException raised while parsing XML content
   * @throws CASRuntimeException
   *           - the content is not a recognized serialized CAS format
   */
  public static SerialFormat load(InputStream casInputStream, CASMgrSerializer casMgr, CAS aCAS,
          boolean leniently) throws IOException {
    if (!casInputStream.markSupported()) {
      casInputStream = new BufferedInputStream(casInputStream);
    }
    CASImpl casImpl = (CASImpl) aCAS;

    // Peek at the first bytes of the stream to look for known formats, then rewind.
    casInputStream.mark(FORMAT_PROBE_LENGTH);
    byte[] probe = new byte[FORMAT_PROBE_LENGTH];
    int probeLength = readUpTo(casInputStream, probe);
    casInputStream.reset();
    if (probeLength == 0) {
      throw new IOException("Cannot load CAS: the input stream is empty");
    }

    String start = new String(probe, 0, probeLength, "UTF-8").toLowerCase(Locale.ROOT);
    if (start.startsWith("<?xml ")) { // could be XCAS or XMI
      try {
        return XmlCasDeserializer.deserializeR(casInputStream, aCAS, leniently);
      } catch (SAXException e) {
        throw new UIMARuntimeException(e);
      }
    }

    DataInputStream deserIn = CommonSerDes.maybeWrapToDataInputStream(casInputStream);
    if (CommonSerDes.isBinaryHeader(deserIn)) {
      Header header = CommonSerDes.readHeader(deserIn);
      if (header.isTypeSystemIncluded()) { // Load TSI from CAS stream
        try {
          // NOTE(review): Java object deserialization - only use with trusted sources.
          ObjectInputStream embeddedTsiIn = new ObjectInputStream(deserIn);
          CASMgrSerializer embeddedCasMgr = (CASMgrSerializer) embeddedTsiIn.readObject();
          casImpl.setupCasFromCasMgrSerializer(casImpl, embeddedCasMgr);
        } catch (ClassNotFoundException e) {
          // Unrecognized serialized CAS format
          throw new CASRuntimeException(CASRuntimeException.UNRECOGNIZED_SERIALIZED_CAS_FORMAT);
        }
      } else if (casMgr != null) { // if TSI not in file, maybe set it from parameter
        casImpl.setupCasFromCasMgrSerializer(casImpl, casMgr);
      }
      return casImpl.reinit(header, casInputStream);
    }

    // Not XML and no binary header: a Java object serialization, with or without a type system.
    // NOTE(review): Java object deserialization - only use with trusted sources.
    ObjectInputStream objectIn = new ObjectInputStream(casInputStream);
    Object deserialized;
    try {
      deserialized = objectIn.readObject();
    } catch (ClassNotFoundException e) {
      // Unrecognized serialized CAS format
      throw new CASRuntimeException(CASRuntimeException.UNRECOGNIZED_SERIALIZED_CAS_FORMAT);
    }

    if (deserialized instanceof CASSerializer) {
      if (casMgr != null) { // maybe install type system and index def
        casImpl.setupCasFromCasMgrSerializer(casImpl, casMgr);
      }
      casImpl.reinit((CASSerializer) deserialized); // deserialize from object
      return SerialFormat.SERIALIZED;
    }
    if (deserialized instanceof CASCompleteSerializer) {
      // comes with its own type system: use that, ignore any supplied via casMgr
      casImpl.reinit((CASCompleteSerializer) deserialized);
      return SerialFormat.SERIALIZED_TSI;
    }
    // Unrecognized serialized CAS format
    throw new CASRuntimeException(CASRuntimeException.UNRECOGNIZED_SERIALIZED_CAS_FORMAT);
  }

  /**
   * Write the CAS in the specified format.
   *
   * @param aCas
   *          The CAS that should be serialized and stored
   * @param docOS
   *          The output stream for the CAS
   * @param format
   *          The SerialFormat in which the CAS should be stored.
   * @throws IOException
   *           - Problem saving to the given OutputStream
   */
  public static void save(CAS aCas, OutputStream docOS, SerialFormat format) throws IOException {
    save(aCas, docOS, null, format);
  }

  /**
   * Write the CAS in the specified format. If the format does not include type system
   * information and the optional output stream for the type system is specified, then the type
   * system information is written there. Neither stream is closed by this method.
   *
   * @param aCas
   *          The CAS that should be serialized and stored
   * @param docOS
   *          The output stream for the CAS, with appropriate buffering
   * @param tsiOS
   *          Optional output stream for type system information. Only used if the format does
   *          not store type system information directly in the main output stream (i.e. for all
   *          formats except SERIALIZED_TSI and COMPRESSED_FILTERED_TSI).
   * @param format
   *          The SerialFormat in which the CAS should be stored.
   * @throws IOException
   *           - Problem saving to the given OutputStream; any other exception raised while
   *           serializing the CAS (including the one for an unsupported format) is wrapped in
   *           an IOException
   */
  public static void save(CAS aCas, OutputStream docOS, OutputStream tsiOS, SerialFormat format)
          throws IOException {
    boolean typeSystemWritten = false;
    try {
      switch (format) {
        case XMI:
          XmiCasSerializer.serialize(aCas, docOS);
          break;
        case XCAS:
          XCASSerializer.serialize(aCas, docOS, true); // true = formatted output
          break;
        case SERIALIZED: // Java-serialized CAS without type system
          writeJavaObject(Serialization.serializeCAS(aCas), docOS);
          break;
        case SERIALIZED_TSI: // Java-serialized CAS with embedded type system
          writeJavaObject(Serialization.serializeCASComplete((CASMgr) aCas), docOS);
          typeSystemWritten = true;
          break;
        case BINARY: // Binary (uncompressed) CAS without type system
          serializeCAS(aCas, docOS);
          break;
        case COMPRESSED: // Binary compressed CAS without type system (form 4)
          serializeWithCompression(aCas, docOS);
          break;
        case COMPRESSED_FILTERED: // Binary compressed CAS (form 6)
          serializeWithCompression(aCas, docOS, false);
          break;
        case COMPRESSED_FILTERED_TSI:
          // Binary compressed CAS (form 6)
          // ... with embedded Java-serialized type system
          serializeWithCompression(aCas, docOS, true);
          typeSystemWritten = true;
          break;
        default:
          // Arrays.toString lists the constants; concatenating the array itself would only
          // print its identity hash.
          throw new IllegalArgumentException("Unknown format [" + format.name()
                  + "]. Must be one of: " + Arrays.toString(SerialFormat.values()));
      }
    } catch (IOException e) {
      throw e;
    } catch (Exception e) {
      throw new IOException(e);
    }

    // Write type system to the separate stream only if it has not already been embedded into
    // the main stream
    if (tsiOS != null && !typeSystemWritten) {
      writeTypeSystem(aCas, tsiOS);
    }
  }

  /**
   * Reads from the stream until the buffer is full or the end of the stream is reached.
   * A single {@code read} call may legitimately deliver fewer bytes than requested.
   *
   * @param in
   *          the stream to read from
   * @param buffer
   *          the buffer to fill from index 0
   * @return the number of bytes stored in the buffer; 0 if the stream was already at its end
   * @throws IOException
   *           if reading fails
   */
  private static int readUpTo(InputStream in, byte[] buffer) throws IOException {
    int total = 0;
    while (total < buffer.length) {
      int count = in.read(buffer, total, buffer.length - total);
      if (count < 0) {
        break; // end of stream
      }
      total += count;
    }
    return total;
  }

  /**
   * Java-serializes the object to the stream and flushes it. The stream is deliberately left
   * open; it belongs to the caller.
   */
  private static void writeJavaObject(Object o, OutputStream aOS) throws IOException {
    ObjectOutputStream objectOut = new ObjectOutputStream(aOS);
    objectOut.writeObject(o);
    objectOut.flush();
  }

  /** Java-serializes the type system and index definitions of the CAS to the stream. */
  private static void writeTypeSystem(CAS aCas, OutputStream aOS) throws IOException {
    writeJavaObject(serializeCASMgr((CASImpl) aCas), aOS);
  }

  /** Closes the resource if it is not null, ignoring any IOException raised by close(). */
  private static void closeQuietly(Closeable closeable) {
    if (closeable != null) {
      try {
        closeable.close();
      } catch (IOException ignored) {
        // best effort: a failure to close must not mask the result or the original exception
      }
    }
  }
}