blob: 298bf1c5335e5d85d23db50cddf15a9a87cea5a5 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.uima.ducc.sampleapps;
/*
* This sample Cas Consumer is designed to create an output zip file for each Work Item.
* The CAS compression format is selectable as either ZIP compressed XmiCas or UIMA
* compressed binary form 6 format. When compressed binary is used, each zip file also
* contains the full UIMA Type System in ZIP compressed text.
* CASes in UIMA compressed binary form 6 format have the same flexibility as an XmiCas
* in that they can be deserialized into a CAS with a different, but compatible Type System.
*
* For more information, see the DUCC Book chapters on sample applications.
*
*/
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.Iterator;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.FeatureStructure;
import org.apache.uima.cas.TypeSystem;
import org.apache.uima.cas.impl.Serialization;
import org.apache.uima.cas.impl.XmiCasSerializer;
import org.apache.uima.ducc.Workitem;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.metadata.TypeSystemDescription;
import org.apache.uima.util.Level;
import org.apache.uima.util.Logger;
import org.apache.uima.util.TypeSystemUtil;
import org.apache.uima.util.XMLSerializer;
public class DuccCasCC extends JCasAnnotator_ImplBase {
public static final String PARAM_XMICOMPRESSIONLEVEL = "XmiCompressionLevel";
public static final String PARAM_USEBINARYCOMPRESSION = "UseBinaryCompression";
private Logger logger;
private String outputFilename=null;
private File outFile;
private FileOutputStream fos;
private ZipOutputStream zos;
private boolean useBinaryCas;
private int zipCompLevel;
private String casExt;
public void initialize(UimaContext aContext) throws ResourceInitializationException {
super.initialize(aContext);
zipCompLevel = (Integer)getContext().getConfigParameterValue(PARAM_XMICOMPRESSIONLEVEL);
useBinaryCas = (null == getContext().getConfigParameterValue(PARAM_USEBINARYCOMPRESSION)) ? Boolean.FALSE :
(Boolean) getContext().getConfigParameterValue(PARAM_USEBINARYCOMPRESSION);
logger = aContext.getLogger();
if (useBinaryCas) {
zipCompLevel = 0;
casExt = "cas";
logger.log(Level.INFO, "Outputting CASes in UIMA compressed binary form 6");
}
else {
casExt = "xmi";
logger.log(Level.INFO, "Outputting CASes in XmiCas format, zip compressed at level="+zipCompLevel);
}
}
public void process(JCas jcas) throws AnalysisEngineProcessException {
Iterator<FeatureStructure> fsit = jcas.getIndexRepository().getAllIndexedFS(jcas.getCasType(Workitem.type));
if (fsit.hasNext()) {
Workitem wi = (Workitem) fsit.next();
if (outputFilename == null || !outputFilename.equals(wi.getOutputspec())) {
// this Work Item contained no documents. Create empty output file.
try {
outFile = new File(wi.getOutputspec());
File outDir = outFile.getParentFile();
if (outDir != null && !outDir.exists()) {
outDir.mkdirs();
}
zos = new ZipOutputStream(new FileOutputStream(outFile));
zos.close();
logger.log(Level.INFO, "DuccCasCC: Flushed empty "+wi.getOutputspec());
return;
} catch (Exception e) {
throw new AnalysisEngineProcessException(e);
}
}
try {
zos.close();
fos.close();
if (!outFile.renameTo(new File(outputFilename))) {
throw new IOException("Rename failed for "+outputFilename);
}
} catch (IOException e) {
throw new AnalysisEngineProcessException(e);
}
logger.log(Level.INFO, "DuccCasCC: Flushed "+wi.getOutputspec());
return;
}
fsit = jcas.getIndexRepository().getAllIndexedFS(jcas.getCasType(DuccDocumentInfo.type));
if (!fsit.hasNext()) {
throw new AnalysisEngineProcessException(new RuntimeException("No DuccDocumentInfo FS in CAS"));
}
DuccDocumentInfo di = (DuccDocumentInfo) fsit.next();
String outputfile = di.getOutputfile();
if (!outputfile.equals(outputFilename)) {
// create new output file
outputFilename = outputfile;
try {
outFile = new File(outputFilename+"_temp");
File outDir = outFile.getParentFile();
if (outDir != null && !outDir.exists()) {
outDir.mkdirs();
}
fos = new FileOutputStream(outFile);
zos = new ZipOutputStream(new BufferedOutputStream(fos,1024*100));
if (useBinaryCas) {
//put the output CAS typesystem in the output zipfile
ZipEntry ze = new ZipEntry("typesystem.xml");
ze.setMethod(ZipEntry.DEFLATED);
zos.setLevel(9);
zos.putNextEntry(ze);
TypeSystem ts = jcas.getTypeSystem();
TypeSystemDescription tsDesc = TypeSystemUtil.typeSystem2TypeSystemDescription(ts);
tsDesc.toXML(zos); // Capture type system in XML format
zos.closeEntry();
}
zos.setLevel(zipCompLevel);
} catch (Exception e) {
throw new AnalysisEngineProcessException(e);
}
}
ZipEntry ze = new ZipEntry("doc_"+di.getDocseq()+"."+casExt);
ze.setMethod(ZipEntry.DEFLATED);
try {
zos.putNextEntry(ze);
BufferedOutputStream bos = new BufferedOutputStream(zos,1024*10);
if (useBinaryCas) {
Serialization.serializeWithCompression(jcas.getCas(), bos, jcas.getTypeSystem());
}
else {
// write XMI
XmiCasSerializer ser = new XmiCasSerializer(jcas.getTypeSystem());
XMLSerializer xmlSer = new XMLSerializer(bos, false);
ser.serialize(jcas.getCas(), xmlSer.getContentHandler());
}
bos.flush();
zos.closeEntry();
} catch (Exception e) {
throw new AnalysisEngineProcessException(e);
}
}
}