| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| package org.apache.uima.examples; |
| |
| import java.io.File; |
| import java.util.Iterator; |
| |
| import org.apache.uima.UIMAException; |
| import org.apache.uima.UIMAFramework; |
| import org.apache.uima.cas.CAS; |
| import org.apache.uima.cas.FSIterator; |
| import org.apache.uima.cas.Feature; |
| import org.apache.uima.cas.FeatureStructure; |
| import org.apache.uima.cas.Type; |
| import org.apache.uima.collection.CollectionProcessingEngine; |
| import org.apache.uima.collection.EntityProcessStatus; |
| import org.apache.uima.collection.StatusCallbackListener; |
| import org.apache.uima.collection.impl.metadata.cpe.CpeDescriptorFactory; |
| import org.apache.uima.collection.metadata.CasProcessorConfigurationParameterSettings; |
| import org.apache.uima.collection.metadata.CpeCasProcessor; |
| import org.apache.uima.collection.metadata.CpeCollectionReader; |
| import org.apache.uima.collection.metadata.CpeComponentDescriptor; |
| import org.apache.uima.collection.metadata.CpeDescription; |
| import org.apache.uima.collection.metadata.CpeSofaMapping; |
| import org.apache.uima.collection.metadata.CpeSofaMappings; |
| import org.apache.uima.resource.ResourceInitializationException; |
| import org.apache.uima.tools.components.FileSystemCollectionReader; |
| import org.apache.uima.tools.components.InlineXmlCasConsumer; |
| import org.apache.uima.tools.components.XmlDetagger; |
| import org.apache.uima.util.AnalysisEnginePerformanceReports; |
| |
| |
| /** |
| * An example application that reads documents from the file system, sends them though an Analysis |
| * Engine(AE), and produces XML files with inline annotations. This application uses a |
| * {@link CollectionProcessingEngine} to drive the processing. For a simpler introduction to using |
| * AEs in an application, see {@link ExampleApplication}. |
| * <p> |
| * <code>Usage: java org.apache.uima.examples.RunAE [OPTIONS] |
| * <AE descriptor or JAR file name> <input dir> |
| * [<output dir>]</code> |
| * <p> |
| * If <code>output dir</code> is not specified, the analysis results will not be output. This can |
| * be useful when only interested in performance statistics. |
| * <p> |
| * <u>OPTIONS</u> |
| * <p> |
| * -t <TagName> (XML Text Tag) - specifies the name of an XML tag, found within the input |
| * documents, that contains the text to be analyzed. The text will also be detagged. If this option |
| * is not specified, the entire document will be processed. <br> |
| * -l <ISO code> (Language) - specifies the ISO code for the language of the input documents. |
| * Some AEs (e.g. PersonTitleAnnotator) require this. <br> |
| * -e <Encoding> - specifies character encoding of the input documents. The default is UTF-8. |
| * <br> |
| * -q (Quiet) - supresses progress messages that are normally printed as each document is processed. |
| * <br> |
| * -s<x> (Stats level) - determines the verboseness of performance statistics. s0=none, |
| * s1=brief, s2=full. The default is brief. <br> |
| * -x - process input files as XCAS files. |
| */ |
| public class RunAE implements StatusCallbackListener { |
| |
| /** The ae specifier file. */ |
| // Values read from cmd line args |
| private File aeSpecifierFile = null; |
| |
| /** The input dir. */ |
| private File inputDir = null; |
| |
| /** The output dir. */ |
| private File outputDir = null; |
| |
| /** The xml tag name. */ |
| private String xmlTagName = null; |
| |
| /** The language. */ |
| private String language; |
| |
| /** The encoding. */ |
| private String encoding; |
| |
| /** The gen progress messages. */ |
| private boolean genProgressMessages = true; |
| |
| /** The stats level. */ |
| private int statsLevel = 1; |
| |
| /** The xcas input. */ |
| private boolean xcasInput = false; |
| |
| /** The xmi input. */ |
| private boolean xmiInput = false; |
| |
| /** The x lenient. */ |
| private boolean xLenient = false; |
| |
| /** The docs processed. */ |
| int docsProcessed; |
| |
| /** The m CPE. */ |
| private CollectionProcessingEngine mCPE; |
| |
| /** |
| * Constructor. Sets up and runs an Analysis Engine. |
| * |
| * @param args the args |
| */ |
| public RunAE(String[] args) { |
| try { |
| // Read and validate command line arguments |
| if (!processCmdLineArgs(args)) { |
| printUsageMessage(); |
| return; |
| } |
| |
| // Enable schema validation (omit this to speed up initialization) |
| // UIMAFramework.getXMLParser().enableSchemaValidation(true); |
| |
| // build a Collection Processing Engine descriptor that will drive processing |
| CpeDescription cpeDesc = CpeDescriptorFactory.produceDescriptor(); |
| |
| // add collection reader that will read input docs |
| cpeDesc.addCollectionReader(FileSystemCollectionReader.getDescriptorURL().toString()); |
| // specify configuration parameters for collection reader |
| CasProcessorConfigurationParameterSettings crSettings = CpeDescriptorFactory |
| .produceCasProcessorConfigurationParameterSettings(); |
| CpeCollectionReader cpeCollRdr = cpeDesc.getAllCollectionCollectionReaders()[0]; |
| cpeCollRdr.setConfigurationParameterSettings(crSettings); |
| crSettings.setParameterValue(FileSystemCollectionReader.PARAM_INPUTDIR, inputDir |
| .getAbsolutePath()); |
| crSettings.setParameterValue(FileSystemCollectionReader.PARAM_ENCODING, encoding); |
| crSettings.setParameterValue(FileSystemCollectionReader.PARAM_LANGUAGE, language); |
| if (xcasInput) { |
| crSettings.setParameterValue(FileSystemCollectionReader.PARAM_XCAS, "XCAS"); |
| } else if (xmiInput) { |
| crSettings.setParameterValue(FileSystemCollectionReader.PARAM_XCAS, "XMI"); |
| } |
| if (xLenient) { |
| crSettings.setParameterValue(FileSystemCollectionReader.PARAM_LENIENT, "true"); |
| } |
| |
| // if XML tag was specified, configure XmlDetagger annotator and add to CPE |
| CpeCasProcessor xmlDetaggerCasProc = null; |
| if (xmlTagName != null && xmlTagName.length() > 0) { |
| xmlDetaggerCasProc = CpeDescriptorFactory.produceCasProcessor("XmlDetagger"); |
| CpeComponentDescriptor cpeComponentDescriptor = |
| CpeDescriptorFactory.produceComponentDescriptor(XmlDetagger.getDescriptorURL().toString()); |
| xmlDetaggerCasProc.setCpeComponentDescriptor(cpeComponentDescriptor); |
| CasProcessorConfigurationParameterSettings detaggerSettings = CpeDescriptorFactory |
| .produceCasProcessorConfigurationParameterSettings(); |
| xmlDetaggerCasProc.setConfigurationParameterSettings(detaggerSettings); |
| detaggerSettings.setParameterValue(XmlDetagger.PARAM_TEXT_TAG, xmlTagName); |
| xmlDetaggerCasProc.setMaxErrorCount(0); |
| cpeDesc.addCasProcessor(xmlDetaggerCasProc); |
| } |
| |
| // add user's AE to CPE |
| CpeCasProcessor casProc = CpeDescriptorFactory.produceCasProcessor("UserAE"); |
| CpeComponentDescriptor cpeComponentDescriptor = |
| CpeDescriptorFactory.produceComponentDescriptor(aeSpecifierFile.getAbsolutePath()); |
| casProc.setCpeComponentDescriptor(cpeComponentDescriptor); |
| casProc.setMaxErrorCount(0); |
| cpeDesc.addCasProcessor(casProc); |
| |
| // add CAS Consumer that will write the output |
| // create and configure CAS consumer that will write the output |
| CpeCasProcessor casCon = null; |
| if (outputDir != null) { |
| casCon = CpeDescriptorFactory.produceCasProcessor("CasConsumer"); |
| cpeComponentDescriptor = |
| CpeDescriptorFactory.produceComponentDescriptor(InlineXmlCasConsumer.getDescriptorURL().toString()); |
| casCon.setCpeComponentDescriptor(cpeComponentDescriptor); |
| CasProcessorConfigurationParameterSettings consumerSettings = CpeDescriptorFactory |
| .produceCasProcessorConfigurationParameterSettings(); |
| casCon.setConfigurationParameterSettings(consumerSettings); |
| consumerSettings.setParameterValue(InlineXmlCasConsumer.PARAM_OUTPUTDIR, outputDir |
| .getAbsolutePath()); |
| if (xcasInput) { |
| consumerSettings.setParameterValue(InlineXmlCasConsumer.PARAM_XCAS, "XCAS"); |
| } else if (xmiInput) { |
| consumerSettings.setParameterValue(InlineXmlCasConsumer.PARAM_XCAS, "XMI"); |
| } |
| casCon.setMaxErrorCount(0); |
| cpeDesc.addCasProcessor(casCon); |
| } |
| |
| // if XML detagger is used, we need to configure sofa mappings for the CPE |
| if (xmlDetaggerCasProc != null) { |
| // For XML detagger map default sofa to "xmlDocument" |
| CpeSofaMapping sofaMapping = CpeDescriptorFactory.produceSofaMapping(); |
| sofaMapping.setComponentSofaName("xmlDocument"); |
| sofaMapping.setCpeSofaName(CAS.NAME_DEFAULT_SOFA); |
| CpeSofaMappings xmlDetaggerSofaMappings = CpeDescriptorFactory.produceSofaMappings(); |
| xmlDetaggerSofaMappings.setSofaNameMappings(new CpeSofaMapping[] { sofaMapping }); |
| xmlDetaggerCasProc.setSofaNameMappings(xmlDetaggerSofaMappings); |
| |
| // User AE and InlineXmlCasConsumer (if present) operate on the "plainTextDocument" |
| // sofa produced by the XmlDetagger |
| CpeSofaMapping aeSofaMapping = CpeDescriptorFactory.produceSofaMapping(); |
| aeSofaMapping.setCpeSofaName("plainTextDocument"); |
| CpeSofaMappings userAeSofaMappings = CpeDescriptorFactory.produceSofaMappings(); |
| userAeSofaMappings.setSofaNameMappings(new CpeSofaMapping[] { aeSofaMapping }); |
| casProc.setSofaNameMappings(userAeSofaMappings); |
| |
| if (casCon != null) { |
| CpeSofaMapping casConSofaMapping = CpeDescriptorFactory.produceSofaMapping(); |
| casConSofaMapping.setCpeSofaName("plainTextDocument"); |
| CpeSofaMappings consumerSofaMappings = CpeDescriptorFactory.produceSofaMappings(); |
| consumerSofaMappings.setSofaNameMappings(new CpeSofaMapping[] { casConSofaMapping }); |
| casCon.setSofaNameMappings(consumerSofaMappings); |
| } |
| } |
| |
| // instantiate CPE |
| mCPE = UIMAFramework.produceCollectionProcessingEngine(cpeDesc); |
| // register callback listener |
| mCPE.addStatusCallbackListener(this); |
| |
| // execute |
| docsProcessed = 0; |
| mCPE.process(); |
| } catch (Exception e) { |
| //special check for using XML detagger with remotes, which will generate an error |
| //since sofa mappings aren't supported for remotes |
| if (xmlTagName != null && xmlTagName.length() > 0 && e instanceof UIMAException && |
| ((UIMAException)e).hasMessageKey(ResourceInitializationException.SOFA_MAPPING_NOT_SUPPORTED_FOR_REMOTE)) { |
| System.err.println("The XML detagging feature (-t) is not supported for remote Analysis Engines or for Aggregates containing remotes."); |
| } |
| else { |
| e.printStackTrace(); |
| } |
| } |
| } |
| |
| |
| /** |
| * Initialization complete. |
| * |
| * @see org.apache.uima.collection.base_cpm.BaseStatusCallbackListener#initializationComplete() |
| */ |
| public void initializationComplete() { |
| } |
| |
| /** |
| * Entity process complete. |
| * |
| * @param aCas the a cas |
| * @param aStatus the a status |
| * @see org.apache.uima.collection.StatusCallbackListener#entityProcessComplete(org.apache.uima.cas.CAS, |
| * org.apache.uima.collection.EntityProcessStatus) |
| */ |
| public void entityProcessComplete(CAS aCas, EntityProcessStatus aStatus) { |
| if (aStatus.isException()) { |
| Iterator iter = aStatus.getExceptions().iterator(); |
| while (iter.hasNext()) { |
| ((Throwable) iter.next()).printStackTrace(); |
| } |
| } else if (genProgressMessages) { |
| // retrieve the filename of the input file from the CAS |
| // (it was put there by the FileSystemCollectionReader) |
| if (!(xcasInput || xmiInput)) { |
| Type fileLocType = aCas.getTypeSystem().getType( |
| "org.apache.uima.examples.SourceDocumentInformation"); |
| Feature fileNameFeat = fileLocType.getFeatureByBaseName("uri"); |
| FSIterator it = aCas.getAnnotationIndex(fileLocType).iterator(); |
| FeatureStructure fileLoc = it.get(); |
| File inFile = new File(fileLoc.getStringValue(fileNameFeat)); |
| System.out.println("Processed Document " + inFile.getName()); |
| } else { |
| System.out.println("doc" + docsProcessed++ + " processed successfully"); |
| } |
| } |
| } |
| |
| /** |
| * Aborted. |
| * |
| * @see org.apache.uima.collection.base_cpm.BaseStatusCallbackListener#aborted() |
| */ |
| public void aborted() { |
| System.out.println("Processing Aborted"); |
| |
| } |
| |
| /** |
| * Batch process complete. |
| * |
| * @see org.apache.uima.collection.base_cpm.BaseStatusCallbackListener#batchProcessComplete() |
| */ |
| public void batchProcessComplete() { |
| } |
| |
| /** |
| * Collection process complete. |
| * |
| * @see org.apache.uima.collection.base_cpm.BaseStatusCallbackListener#collectionProcessComplete() |
| */ |
| public void collectionProcessComplete() { |
| // output performance stats |
| if (statsLevel > 0) { |
| AnalysisEnginePerformanceReports performanceReports = new AnalysisEnginePerformanceReports( |
| mCPE.getPerformanceReport()); |
| System.out.println("\n\nPERFORMANCE STATS\n-----------------\n\n"); |
| if (statsLevel > 1) { |
| System.out.println(performanceReports.getFullReport()); |
| System.out.println(); |
| } |
| System.out.println(performanceReports); |
| } |
| } |
| |
| /** |
| * Paused. |
| * |
| * @see org.apache.uima.collection.base_cpm.BaseStatusCallbackListener#paused() |
| */ |
| public void paused() { |
| } |
| |
| /** |
| * Resumed. |
| * |
| * @see org.apache.uima.collection.base_cpm.BaseStatusCallbackListener#resumed() |
| */ |
| public void resumed() { |
| } |
| |
| /** |
| * Prints usage message. |
| */ |
| private void printUsageMessage() { |
| System.err.println("\nUsage: java " + this.getClass().getName() |
| + " [OPTIONS] <AE descriptor filename> <input dir> [<output dir>] "); |
| System.err.println("\nIf <output dir> is not specified, the analysis " |
| + "results will not be output. This can be useful when only interested " |
| + "in performance statistics."); |
| System.err.println("\nOPTIONS\n-------"); |
| System.err.println("-t <TagName> (XML Text Tag) - specifies the name of " |
| + "an XML tag, found within the input documents, that contains the text " |
| + "to be analyzed. The text will also be detagged. If this option is not " |
| + "specified, the entire document will be processed."); |
| System.err.println("-q (Quiet) - supresses progress messages that are " |
| + "normally printed as each document is processed."); |
| System.err.println("-s<x> (Stats level) - determines the verboseness of " |
| + "performance statistics. s0=none, s1=brief, s2=full. The default is brief."); |
| System.err.println("-x - process input files as XCAS files."); |
| System.err.println("-xmi - process input files as XmiCas files."); |
| System.err.println("-lenient - ignore out-of-typesystem content when deserializing XML files."); |
| System.err.println("-l <ISO language> - specify the ISO Language code to set."); |
| System.err.println("-e <encoding> - specify the character encoding to use."); |
| |
| } |
| |
| /** |
| * Reads command line arguments and sets static class variables appropriately. |
| * |
| * @param args the args |
| * @return true if command line args were valid, false if not |
| */ |
| private boolean processCmdLineArgs(String[] args) { |
| encoding = "UTF-8"; // default |
| int index = 0; |
| while (index < args.length) { |
| String arg = args[index++]; |
| if (arg.equals("-q")) // quiet mode |
| { |
| genProgressMessages = false; |
| } else if (arg.equals("-s0")) // no stats |
| { |
| statsLevel = 0; |
| } else if (arg.equals("-s2")) // full stats |
| { |
| statsLevel = 2; |
| } else if (arg.equals("-t")) // XML tag text |
| { |
| // tag name is next argument |
| if (index >= args.length) { |
| return false; |
| } |
| xmlTagName = args[index++]; |
| } else if (arg.equals("-l")) // Language |
| { |
| // language ISO code is next argument |
| if (index >= args.length) { |
| return false; |
| } |
| language = args[index++]; |
| } else if (arg.equals("-e")) // Encoding |
| { |
| // encoding is next argument |
| if (index >= args.length) { |
| return false; |
| } |
| encoding = args[index++]; |
| } else if (arg.equals("-x")) // XCAS file input |
| { |
| xcasInput = true; |
| } else if (arg.equals("-xmi")) // XMI file input |
| { |
| xmiInput = true; |
| } else if (arg.equals("-lenient")) // lenient XML deserialization |
| { |
| xLenient = true; |
| } else if (arg.startsWith("-")) // invalid option |
| { |
| System.err.println(arg + " is not a valid option"); |
| return false; |
| } else // one of the standard params - whichever we haven't read yet |
| { |
| if (aeSpecifierFile == null) { |
| aeSpecifierFile = new File(arg); |
| if (!aeSpecifierFile.exists() || aeSpecifierFile.isDirectory()) { |
| System.err.println(arg + " does not exist"); |
| System.exit(1); |
| } |
| } else if (inputDir == null) { |
| inputDir = new File(arg); |
| if (!inputDir.exists() || !inputDir.isDirectory()) { |
| System.err.println(arg + " does not exist or is not a directory"); |
| System.exit(1); |
| } |
| } else if (outputDir == null) { |
| outputDir = new File(arg); |
| if (!outputDir.exists() && !outputDir.mkdirs()) { |
| System.err.println(arg + " does not exist and could not be created"); |
| System.exit(1); |
| } |
| } |
| } |
| } |
| // make sure required values were specified |
| return (aeSpecifierFile != null) && (inputDir != null); |
| } |
| |
| /** |
| * The main method. |
| * |
| * @param args the arguments |
| */ |
| public static void main(String[] args) { |
| new RunAE(args); |
| } |
| } |