// blob: c3cc562c5b6105c240b9e93326d02208b7fcdfc7 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.uima.fit.component;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.output.CloseShieldOutputStream;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.FeatureStructure;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.impl.FeatureStructureImpl;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.resource.ResourceInitializationException;
import org.springframework.util.DigestUtils;
/**
* Dumps CAS content to a text file. This is useful when setting up test cases which contain a
* reference output to which an actually produced CAS is compared. The format produced by this
* component is more easily comparable than a XCAS or XMI format.
*
*/
public class CasDumpWriter extends CasConsumer_ImplBase {
/**
* Pattern inclusion prefix.
*/
public static final String INCLUDE_PREFIX = "+|";
/**
* Pattern exclusion prefix.
*/
public static final String EXCLUDE_PREFIX = "-|";
/**
* Output file. If multiple CASes as processed, their contents are concatenated into this file.
* Mind that a test case using this consumer with multiple CASes requires a reader which produced
* the CASes always in the same order. When this file is set to "-", the dump does to
* {@link System#out} (default).
*/
public static final String PARAM_OUTPUT_FILE = "outputFile";
@ConfigurationParameter(name = PARAM_OUTPUT_FILE, mandatory = true, defaultValue = "-")
private File outputFile;
/**
* Whether to dump the content of the {@link CAS#getDocumentAnnotation()}.
*/
public static final String PARAM_WRITE_DOCUMENT_META_DATA = "writeDocumentMetaData";
@ConfigurationParameter(name = PARAM_WRITE_DOCUMENT_META_DATA, mandatory = true, defaultValue = "true")
private boolean writeDocumentMetaData;
/**
* Include/exclude features according to the following patterns. Mind that the patterns do not
* actually match feature names but lines produced by {@code FeatureStructure.toString()}.
*/
public static final String PARAM_FEATURE_PATTERNS = "featurePatterns";
@ConfigurationParameter(name = PARAM_FEATURE_PATTERNS, mandatory = true, defaultValue = { "+|.*",
"-|^.*documentUri:.*$", "-|^.*collectionId:.*$", "-|^.*documentBaseUri:.*$" })
private String[] featurePatterns;
private InExPattern[] cookedFeaturePatterns;
/**
* Include/exclude specified UIMA types in the output.
*/
public static final String PARAM_TYPE_PATTERNS = "typePatterns";
@ConfigurationParameter(name = PARAM_TYPE_PATTERNS, mandatory = true, defaultValue = { "+|.*" })
private String[] typePatterns;
/**
* Sort increasing by begin, decreasing by end, increasing by name instead of relying on index
* order.
*/
public static final String PARAM_SORT = "sort";
@ConfigurationParameter(name = PARAM_SORT, mandatory = true, defaultValue = "false")
private boolean sort;
private InExPattern[] cookedTypePatterns;
private PrintWriter out;
private int iCas;
@Override
public void initialize(UimaContext context) throws ResourceInitializationException {
super.initialize(context);
try {
if (out == null) {
if ("-".equals(outputFile.getName())) {
out = new PrintWriter(new CloseShieldOutputStream(System.out));
} else {
if (outputFile.getParentFile() != null) {
outputFile.getParentFile().mkdirs();
}
out = new PrintWriter(new OutputStreamWriter(new FileOutputStream(outputFile), "UTF-8"));
}
}
} catch (IOException e) {
throw new ResourceInitializationException(e);
}
cookedTypePatterns = compilePatterns(typePatterns);
cookedFeaturePatterns = compilePatterns(featurePatterns);
}
@Override
public void process(CAS aCAS) throws AnalysisEngineProcessException {
out.println("======== CAS " + iCas + " begin ==================================");
out.println();
Iterator<CAS> viewIt = aCAS.getViewIterator();
while (viewIt.hasNext()) {
CAS view = viewIt.next();
processView(view);
if (view.getDocumentText() == null && view.getSofaDataStream() != null) {
processSofaData(view);
}
}
out.println("======== CAS " + iCas + " end ==================================");
out.println();
out.println();
out.flush();
iCas++;
}
@Override
public void collectionProcessComplete() {
IOUtils.closeQuietly(out);
out = null;
}
private void processDocumentMetadata(CAS aCAS) {
if (!writeDocumentMetaData) {
return;
}
processFeatureStructure(aCAS.getDocumentAnnotation());
}
private void processDocumentText(CAS aCAS) {
out.println();
out.println("CAS-Text:");
out.println(aCAS.getDocumentText());
}
private void processFeatureStructures(CAS aCAS) {
Set<String> typesToPrint = getTypes(aCAS);
Iterator<AnnotationFS> annotationIterator = aCAS.getAnnotationIndex().iterator();
if (sort) {
List<AnnotationFS> sortedFS = new ArrayList<AnnotationFS>();
while (annotationIterator.hasNext()) {
sortedFS.add(annotationIterator.next());
}
Collections.sort(sortedFS, new Comparator<AnnotationFS>() {
@Override
public int compare(AnnotationFS aO1, AnnotationFS aO2) {
int begin = aO1.getBegin() - aO2.getBegin();
if (begin != 0) {
return begin;
}
int end = aO2.getEnd() - aO1.getEnd();
if (end != 0) {
return end;
}
int name = aO1.getType().getName().compareTo(aO2.getType().getName());
if (name != 0) {
return name;
}
// Last resort: try the address.
if (aO1 instanceof FeatureStructureImpl && aO2 instanceof FeatureStructureImpl) {
return ((FeatureStructureImpl) aO1).getAddress()
- ((FeatureStructureImpl) aO2).getAddress();
}
// Fall back to name.
return name;
}
});
annotationIterator = sortedFS.iterator();
}
while (annotationIterator.hasNext()) {
AnnotationFS annotation = annotationIterator.next();
if (!typesToPrint.contains(annotation.getType().getName())) {
continue;
}
try {
out.println("[" + annotation.getCoveredText() + "]");
} catch (IndexOutOfBoundsException e) {
out.println("<OFFSETS OUT OF BOUNDS>");
}
processFeatureStructure(annotation);
}
}
private void processFeatureStructure(FeatureStructure aFS) {
String meta = aFS.toString();
for (String line : meta.split("\n")) {
boolean print = false;
for (InExPattern p : cookedFeaturePatterns) {
p.matchter.reset(line);
if (p.matchter.matches()) {
print = p.includeInOutput;
}
}
if (print) {
out.println(line);
}
}
}
private void processView(CAS aCAS) {
out.println("-------- View " + aCAS.getViewName() + " begin ----------------------------------");
out.println();
processDocumentMetadata(aCAS);
processDocumentText(aCAS);
processFeatureStructures(aCAS);
out.println("-------- View " + aCAS.getViewName() + " end ----------------------------------");
out.println();
}
private void processSofaData(CAS aCAS) throws AnalysisEngineProcessException {
out.println("Sofa data:");
//
// Mime type
String mimeType = aCAS.getSofaMimeType();
if (mimeType != null) {
out.println(" mime type:\t" + mimeType);
}
// Data
byte[] bytes = null;
InputStream in = null;
try {
in = aCAS.getSofaDataStream();
bytes = IOUtils.toByteArray(in);
} catch (IOException e) {
throw new AnalysisEngineProcessException(e);
} finally {
IOUtils.closeQuietly(in);
}
if (bytes != null) {
// Data size
out.println(" size:\t" + bytes.length + " byte(s)");
// Hash value of the bytes
String hash = DigestUtils.md5DigestAsHex(bytes);
out.println(" hash value:\t" + hash);
}
out.println();
}
private static InExPattern[] compilePatterns(String[] aPatterns) {
InExPattern[] patterns = new InExPattern[aPatterns.length];
for (int i = 0; i < aPatterns.length; i++) {
if (aPatterns[i].startsWith(INCLUDE_PREFIX)) {
patterns[i] = new InExPattern(aPatterns[i].substring(INCLUDE_PREFIX.length()), true);
} else if (aPatterns[i].startsWith(EXCLUDE_PREFIX)) {
patterns[i] = new InExPattern(aPatterns[i].substring(EXCLUDE_PREFIX.length()), false);
} else {
patterns[i] = new InExPattern(aPatterns[i], false);
}
}
return patterns;
}
private Set<String> getTypes(CAS cas) {
Set<String> types = new HashSet<String>();
Iterator<Type> typeIt = cas.getTypeSystem().getTypeIterator();
nextType: while (typeIt.hasNext()) {
Type type = typeIt.next();
if (type.getName().equals(cas.getDocumentAnnotation().getType().getName())) {
continue;
}
for (InExPattern p : cookedTypePatterns) {
p.matchter.reset(type.getName());
if (p.matchter.matches()) {
if (p.includeInOutput) {
types.add(type.getName());
} else {
types.remove(type.getName());
}
continue nextType;
}
}
}
return types;
}
private static class InExPattern {
final boolean includeInOutput;
final Matcher matchter;
public InExPattern(String aPattern, boolean aInclude) {
includeInOutput = aInclude;
matchter = Pattern.compile(aPattern).matcher("");
}
}
}