blob: 9b28de65a74f4b81c2e229aa7f370ee31f142650 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.uima.examples;
import java.io.File;
import java.util.Iterator;
import org.apache.uima.UIMAException;
import org.apache.uima.UIMAFramework;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.cas.Feature;
import org.apache.uima.cas.FeatureStructure;
import org.apache.uima.cas.Type;
import org.apache.uima.collection.CollectionProcessingEngine;
import org.apache.uima.collection.EntityProcessStatus;
import org.apache.uima.collection.StatusCallbackListener;
import org.apache.uima.collection.impl.metadata.cpe.CpeDescriptorFactory;
import org.apache.uima.collection.metadata.CasProcessorConfigurationParameterSettings;
import org.apache.uima.collection.metadata.CpeCasProcessor;
import org.apache.uima.collection.metadata.CpeCollectionReader;
import org.apache.uima.collection.metadata.CpeComponentDescriptor;
import org.apache.uima.collection.metadata.CpeDescription;
import org.apache.uima.collection.metadata.CpeSofaMapping;
import org.apache.uima.collection.metadata.CpeSofaMappings;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.tools.components.FileSystemCollectionReader;
import org.apache.uima.tools.components.InlineXmlCasConsumer;
import org.apache.uima.tools.components.XmlDetagger;
import org.apache.uima.util.AnalysisEnginePerformanceReports;
/**
* An example application that reads documents from the file system, sends them through an Analysis
* Engine (AE), and produces XML files with inline annotations. This application uses a
* {@link CollectionProcessingEngine} to drive the processing. For a simpler introduction to using
* AEs in an application, see {@link ExampleApplication}.
* <p>
* <code>Usage: java org.apache.uima.examples.RunAE [OPTIONS]
* &lt;AE descriptor or JAR file name&gt; &lt;input dir&gt;
* [&lt;output dir&gt;]</code>
* <p>
* If <code>output dir</code> is not specified, the analysis results will not be output. This can
* be useful when only interested in performance statistics.
* <p>
* <u>OPTIONS</u>
* <p>
* -t &lt;TagName&gt; (XML Text Tag) - specifies the name of an XML tag, found within the input
* documents, that contains the text to be analyzed. The text will also be detagged. If this option
* is not specified, the entire document will be processed. <br>
* -l &lt;ISO code&gt; (Language) - specifies the ISO code for the language of the input documents.
* Some AEs (e.g. PersonTitleAnnotator) require this. <br>
* -e &lt;Encoding&gt; - specifies character encoding of the input documents. The default is UTF-8.
* <br>
* -q (Quiet) - suppresses progress messages that are normally printed as each document is processed.
* <br>
* -s&lt;x&gt; (Stats level) - determines the verboseness of performance statistics. s0=none,
* s1=brief, s2=full. The default is brief. <br>
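* -x - process input files as XCAS files. <br>
* -xmi - process input files as XmiCas files. <br>
* -lenient - ignore out-of-typesystem content when deserializing XML files.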
*/
public class RunAE implements StatusCallbackListener {

  // Values read from cmd line args

  /** AE descriptor (or JAR) file; first positional argument. */
  private File aeSpecifierFile = null;

  /** Directory the collection reader reads from; second positional argument. */
  private File inputDir = null;

  /** Directory the CAS consumer writes to; optional third positional argument. */
  private File outputDir = null;

  /** Value of the -t option; when non-empty an XmlDetagger is added to the pipeline. */
  private String xmlTagName = null;

  /** Value of the -l option; stays null when the option is not given. */
  private String language;

  /** Value of the -e option; set to "UTF-8" by default when arguments are parsed. */
  private String encoding;

  /** Cleared by -q; controls the per-document progress messages. */
  private boolean genProgressMessages = true;

  /** 0 = no stats, 1 = brief (default), 2 = full; set by -s0 / -s1 / -s2. */
  private int statsLevel = 1;

  /** Set by -x. */
  private boolean xcasInput = false;

  /** Set by -xmi. */
  private boolean xmiInput = false;

  /** Set by -lenient. */
  private boolean xLenient = false;

  /** Counter used to label documents in progress messages when no file name is available. */
  int docsProcessed;

  /** The CPE built by the constructor; also queried for the performance report. */
  private CollectionProcessingEngine mCPE;

  /**
   * Constructor. Sets up and runs an Analysis Engine.
   * <p>
   * Parses the command line, builds a CPE descriptor consisting of a FileSystemCollectionReader,
   * an optional XmlDetagger, the user's AE and an optional InlineXmlCasConsumer, then instantiates
   * the CPE, registers this object as status listener and starts processing. If the arguments are
   * invalid the usage message is printed and nothing is run. Exceptions raised during setup are
   * reported on the console rather than propagated.
   *
   * @param args
   *          the command line arguments (see the class comment for the supported options)
   */
  public RunAE(String[] args) {
    try {
      // Read and validate command line arguments
      if (!processCmdLineArgs(args)) {
        printUsageMessage();
        return;
      }

      // Enable schema validation (omit this to speed up initialization)
      // UIMAFramework.getXMLParser().enableSchemaValidation(true);

      // build a Collection Processing Engine descriptor that will drive processing
      CpeDescription cpeDesc = CpeDescriptorFactory.produceDescriptor();

      // add collection reader that will read input docs
      cpeDesc.addCollectionReader(FileSystemCollectionReader.getDescriptorURL().toString());

      // specify configuration parameters for collection reader
      CasProcessorConfigurationParameterSettings crSettings = CpeDescriptorFactory
              .produceCasProcessorConfigurationParameterSettings();
      CpeCollectionReader cpeCollRdr = cpeDesc.getAllCollectionCollectionReaders()[0];
      cpeCollRdr.setConfigurationParameterSettings(crSettings);
      crSettings.setParameterValue(FileSystemCollectionReader.PARAM_INPUTDIR, inputDir
              .getAbsolutePath());
      crSettings.setParameterValue(FileSystemCollectionReader.PARAM_ENCODING, encoding);
      crSettings.setParameterValue(FileSystemCollectionReader.PARAM_LANGUAGE, language);
      // -x takes precedence over -xmi when both were given
      if (xcasInput) {
        crSettings.setParameterValue(FileSystemCollectionReader.PARAM_XCAS, "XCAS");
      } else if (xmiInput) {
        crSettings.setParameterValue(FileSystemCollectionReader.PARAM_XCAS, "XMI");
      }
      if (xLenient) {
        crSettings.setParameterValue(FileSystemCollectionReader.PARAM_LENIENT, "true");
      }

      // if XML tag was specified, configure XmlDetagger annotator and add to CPE
      CpeCasProcessor xmlDetaggerCasProc = null;
      if (xmlTagName != null && xmlTagName.length() > 0) {
        xmlDetaggerCasProc = CpeDescriptorFactory.produceCasProcessor("XmlDetagger");
        CpeComponentDescriptor cpeComponentDescriptor =
                CpeDescriptorFactory.produceComponentDescriptor(XmlDetagger.getDescriptorURL().toString());
        xmlDetaggerCasProc.setCpeComponentDescriptor(cpeComponentDescriptor);
        CasProcessorConfigurationParameterSettings detaggerSettings = CpeDescriptorFactory
                .produceCasProcessorConfigurationParameterSettings();
        xmlDetaggerCasProc.setConfigurationParameterSettings(detaggerSettings);
        detaggerSettings.setParameterValue(XmlDetagger.PARAM_TEXT_TAG, xmlTagName);
        xmlDetaggerCasProc.setMaxErrorCount(0);
        cpeDesc.addCasProcessor(xmlDetaggerCasProc);
      }

      // add user's AE to CPE
      CpeCasProcessor casProc = CpeDescriptorFactory.produceCasProcessor("UserAE");
      CpeComponentDescriptor cpeComponentDescriptor =
              CpeDescriptorFactory.produceComponentDescriptor(aeSpecifierFile.getAbsolutePath());
      casProc.setCpeComponentDescriptor(cpeComponentDescriptor);
      casProc.setMaxErrorCount(0);
      cpeDesc.addCasProcessor(casProc);

      // create and configure CAS consumer that will write the output
      // (skipped entirely when no output directory was given)
      CpeCasProcessor casCon = null;
      if (outputDir != null) {
        casCon = CpeDescriptorFactory.produceCasProcessor("CasConsumer");
        cpeComponentDescriptor =
                CpeDescriptorFactory.produceComponentDescriptor(InlineXmlCasConsumer.getDescriptorURL().toString());
        casCon.setCpeComponentDescriptor(cpeComponentDescriptor);
        CasProcessorConfigurationParameterSettings consumerSettings = CpeDescriptorFactory
                .produceCasProcessorConfigurationParameterSettings();
        casCon.setConfigurationParameterSettings(consumerSettings);
        consumerSettings.setParameterValue(InlineXmlCasConsumer.PARAM_OUTPUTDIR, outputDir
                .getAbsolutePath());
        if (xcasInput) {
          consumerSettings.setParameterValue(InlineXmlCasConsumer.PARAM_XCAS, "XCAS");
        } else if (xmiInput) {
          consumerSettings.setParameterValue(InlineXmlCasConsumer.PARAM_XCAS, "XMI");
        }
        casCon.setMaxErrorCount(0);
        cpeDesc.addCasProcessor(casCon);
      }

      // if XML detagger is used, we need to configure sofa mappings for the CPE
      if (xmlDetaggerCasProc != null) {
        // For XML detagger map default sofa to "xmlDocument"
        CpeSofaMapping sofaMapping = CpeDescriptorFactory.produceSofaMapping();
        sofaMapping.setComponentSofaName("xmlDocument");
        sofaMapping.setCpeSofaName(CAS.NAME_DEFAULT_SOFA);
        CpeSofaMappings xmlDetaggerSofaMappings = CpeDescriptorFactory.produceSofaMappings();
        xmlDetaggerSofaMappings.setSofaNameMappings(new CpeSofaMapping[] { sofaMapping });
        xmlDetaggerCasProc.setSofaNameMappings(xmlDetaggerSofaMappings);

        // User AE and InlineXmlCasConsumer (if present) operate on the "plainTextDocument"
        // sofa produced by the XmlDetagger
        CpeSofaMapping aeSofaMapping = CpeDescriptorFactory.produceSofaMapping();
        aeSofaMapping.setCpeSofaName("plainTextDocument");
        CpeSofaMappings userAeSofaMappings = CpeDescriptorFactory.produceSofaMappings();
        userAeSofaMappings.setSofaNameMappings(new CpeSofaMapping[] { aeSofaMapping });
        casProc.setSofaNameMappings(userAeSofaMappings);

        if (casCon != null) {
          CpeSofaMapping casConSofaMapping = CpeDescriptorFactory.produceSofaMapping();
          casConSofaMapping.setCpeSofaName("plainTextDocument");
          CpeSofaMappings consumerSofaMappings = CpeDescriptorFactory.produceSofaMappings();
          consumerSofaMappings.setSofaNameMappings(new CpeSofaMapping[] { casConSofaMapping });
          casCon.setSofaNameMappings(consumerSofaMappings);
        }
      }

      // instantiate CPE
      mCPE = UIMAFramework.produceCollectionProcessingEngine(cpeDesc);

      // register callback listener
      mCPE.addStatusCallbackListener(this);

      // execute
      docsProcessed = 0;
      mCPE.process();
    } catch (Exception e) {
      // special check for using XML detagger with remotes, which will generate an error
      // since sofa mappings aren't supported for remotes
      if (xmlTagName != null && xmlTagName.length() > 0 && e instanceof UIMAException &&
              ((UIMAException)e).hasMessageKey(ResourceInitializationException.SOFA_MAPPING_NOT_SUPPORTED_FOR_REMOTE)) {
        System.err.println("The XML detagging feature (-t) is not supported for remote Analysis Engines or for Aggregates containing remotes.");
      }
      else {
        e.printStackTrace();
      }
    }
  }

  /**
   * Does nothing.
   *
   * @see org.apache.uima.collection.base_cpm.BaseStatusCallbackListener#initializationComplete()
   */
  public void initializationComplete() {
  }

  /**
   * Reports the outcome of processing one document. If the status carries exceptions, their stack
   * traces are printed. Otherwise, unless quiet mode is on, a progress message is printed: for
   * plain document input the message names the source file recorded in the CAS; for XCAS/XMI
   * input, or when no source file information can be found in the CAS, a running document number
   * is used instead.
   *
   * @param aCas
   *          the CAS that was processed
   * @param aStatus
   *          the processing status for that CAS
   *
   * @see org.apache.uima.collection.StatusCallbackListener#entityProcessComplete(org.apache.uima.cas.CAS,
   *      org.apache.uima.collection.EntityProcessStatus)
   */
  public void entityProcessComplete(CAS aCas, EntityProcessStatus aStatus) {
    if (aStatus.isException()) {
      Iterator iter = aStatus.getExceptions().iterator();
      while (iter.hasNext()) {
        ((Throwable) iter.next()).printStackTrace();
      }
    } else if (genProgressMessages) {
      String sourceUri = null;
      if (!(xcasInput || xmiInput)) {
        // retrieve the filename of the input file from the CAS
        // (it was put there by the FileSystemCollectionReader)
        sourceUri = findSourceDocumentUri(aCas);
      }
      if (sourceUri != null) {
        File inFile = new File(sourceUri);
        System.out.println("Processed Document " + inFile.getName());
      } else {
        System.out.println("doc" + docsProcessed++ + " processed successfully");
      }
    }
  }

  /**
   * Looks up the "uri" feature of the first SourceDocumentInformation annotation indexed in the
   * given CAS.
   *
   * @param aCas
   *          the CAS to inspect
   * @return the uri string, or null if the type, the feature, an indexed instance or the feature
   *         value is missing
   */
  private String findSourceDocumentUri(CAS aCas) {
    Type fileLocType = aCas.getTypeSystem().getType(
            "org.apache.uima.examples.SourceDocumentInformation");
    if (fileLocType == null) {
      return null;
    }
    Feature fileNameFeat = fileLocType.getFeatureByBaseName("uri");
    if (fileNameFeat == null) {
      return null;
    }
    FSIterator it = aCas.getAnnotationIndex(fileLocType).iterator();
    // get() on an invalid (e.g. empty) iterator would throw, so check first
    if (!it.isValid()) {
      return null;
    }
    FeatureStructure fileLoc = it.get();
    return fileLoc.getStringValue(fileNameFeat);
  }

  /**
   * Prints a notice that processing was aborted.
   *
   * @see org.apache.uima.collection.base_cpm.BaseStatusCallbackListener#aborted()
   */
  public void aborted() {
    System.out.println("Processing Aborted");
  }

  /**
   * Does nothing.
   *
   * @see org.apache.uima.collection.base_cpm.BaseStatusCallbackListener#batchProcessComplete()
   */
  public void batchProcessComplete() {
  }

  /**
   * Prints performance statistics according to the configured stats level: nothing at level 0,
   * the brief report at level 1, and the full report followed by the brief report at level 2.
   *
   * @see org.apache.uima.collection.base_cpm.BaseStatusCallbackListener#collectionProcessComplete()
   */
  public void collectionProcessComplete() {
    // output performance stats
    if (statsLevel > 0) {
      AnalysisEnginePerformanceReports performanceReports = new AnalysisEnginePerformanceReports(
              mCPE.getPerformanceReport());
      System.out.println("\n\nPERFORMANCE STATS\n-----------------\n\n");
      if (statsLevel > 1) {
        System.out.println(performanceReports.getFullReport());
        System.out.println();
      }
      System.out.println(performanceReports);
    }
  }

  /**
   * Does nothing.
   *
   * @see org.apache.uima.collection.base_cpm.BaseStatusCallbackListener#paused()
   */
  public void paused() {
  }

  /**
   * Does nothing.
   *
   * @see org.apache.uima.collection.base_cpm.BaseStatusCallbackListener#resumed()
   */
  public void resumed() {
  }

  /**
   * Prints usage message to standard error.
   */
  private void printUsageMessage() {
    System.err.println("\nUsage: java " + this.getClass().getName()
            + " [OPTIONS] <AE descriptor filename> <input dir> [<output dir>] ");
    System.err.println("\nIf <output dir> is not specified, the analysis "
            + "results will not be output. This can be useful when only interested "
            + "in performance statistics.");
    System.err.println("\nOPTIONS\n-------");
    System.err.println("-t <TagName> (XML Text Tag) - specifies the name of "
            + "an XML tag, found within the input documents, that contains the text "
            + "to be analyzed. The text will also be detagged. If this option is not "
            + "specified, the entire document will be processed.");
    System.err.println("-q (Quiet) - supresses progress messages that are "
            + "normally printed as each document is processed.");
    System.err.println("-s<x> (Stats level) - determines the verboseness of "
            + "performance statistics. s0=none, s1=brief, s2=full. The default is brief.");
    System.err.println("-x - process input files as XCAS files.");
    System.err.println("-xmi - process input files as XmiCas files.");
    System.err.println("-lenient - ignore out-of-typesystem content when deserializing XML files.");
    System.err.println("-l <ISO language> - specify the ISO Language code to set.");
    System.err.println("-e <encoding> - specify the character encoding to use.");
  }

  /**
   * Reads command line arguments and sets the corresponding instance fields.
   * <p>
   * Arguments that do not start with "-" are taken, in order, as the AE descriptor file, the input
   * directory and the output directory; any further positional arguments are ignored. If a
   * positional argument fails its file system check (descriptor missing or a directory, input
   * directory missing or not a directory, output directory missing and not creatable), a message
   * is printed and the JVM exits with status 1.
   *
   * @param args
   *          the command line arguments
   * @return true if command line args were valid, false if an option was unknown, an option was
   *         missing its value, or the AE descriptor or input directory was not given
   */
  private boolean processCmdLineArgs(String[] args) {
    encoding = "UTF-8"; // default
    int index = 0;
    while (index < args.length) {
      String arg = args[index++];
      if (arg.equals("-q")) { // quiet mode
        genProgressMessages = false;
      } else if (arg.equals("-s0")) { // no stats
        statsLevel = 0;
      } else if (arg.equals("-s1")) { // brief stats (the documented default, accepted explicitly)
        statsLevel = 1;
      } else if (arg.equals("-s2")) { // full stats
        statsLevel = 2;
      } else if (arg.equals("-t")) { // XML tag text
        // tag name is next argument
        if (index >= args.length) {
          return false;
        }
        xmlTagName = args[index++];
      } else if (arg.equals("-l")) { // Language
        // language ISO code is next argument
        if (index >= args.length) {
          return false;
        }
        language = args[index++];
      } else if (arg.equals("-e")) { // Encoding
        // encoding is next argument
        if (index >= args.length) {
          return false;
        }
        encoding = args[index++];
      } else if (arg.equals("-x")) { // XCAS file input
        xcasInput = true;
      } else if (arg.equals("-xmi")) { // XMI file input
        xmiInput = true;
      } else if (arg.equals("-lenient")) { // lenient XML deserialization
        xLenient = true;
      } else if (arg.startsWith("-")) { // invalid option
        System.err.println(arg + " is not a valid option");
        return false;
      } else { // one of the standard params - whichever we haven't read yet
        if (aeSpecifierFile == null) {
          aeSpecifierFile = new File(arg);
          if (!aeSpecifierFile.exists() || aeSpecifierFile.isDirectory()) {
            System.err.println(arg + " does not exist or is a directory");
            System.exit(1);
          }
        } else if (inputDir == null) {
          inputDir = new File(arg);
          if (!inputDir.exists() || !inputDir.isDirectory()) {
            System.err.println(arg + " does not exist or is not a directory");
            System.exit(1);
          }
        } else if (outputDir == null) {
          outputDir = new File(arg);
          if (!outputDir.exists() && !outputDir.mkdirs()) {
            System.err.println(arg + " does not exist and could not be created");
            System.exit(1);
          }
        }
      }
    }
    // make sure required values were specified
    return (aeSpecifierFile != null) && (inputDir != null);
  }

  /**
   * Command line entry point; all work is done by the constructor.
   *
   * @param args
   *          the command line arguments
   */
  public static void main(String[] args) {
    new RunAE(args);
  }
}