blob: 3a93618859b04f8efb9bbd52d814969b6f0a22c2 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.uima.ruta.engine;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.antlr.runtime.ANTLRFileStream;
import org.antlr.runtime.ANTLRInputStream;
import org.antlr.runtime.CharStream;
import org.antlr.runtime.CommonTokenStream;
import org.antlr.runtime.RecognitionException;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_component.AnalysisComponent;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.Feature;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.TypeSystem;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.cas.text.AnnotationIndex;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.factory.AnalysisEngineFactory;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceConfigurationException;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.ruta.FilterManager;
import org.apache.uima.ruta.RutaBlock;
import org.apache.uima.ruta.RutaModule;
import org.apache.uima.ruta.RutaStream;
import org.apache.uima.ruta.extensions.IEngineLoader;
import org.apache.uima.ruta.extensions.IRutaExtension;
import org.apache.uima.ruta.extensions.RutaEngineLoader;
import org.apache.uima.ruta.extensions.RutaExternalFactory;
import org.apache.uima.ruta.parser.RutaLexer;
import org.apache.uima.ruta.parser.RutaParser;
import org.apache.uima.ruta.seed.RutaAnnotationSeeder;
import org.apache.uima.ruta.verbalize.RutaVerbalizer;
import org.apache.uima.ruta.visitor.CreatedByVisitor;
import org.apache.uima.ruta.visitor.DebugInfoCollectorVisitor;
import org.apache.uima.ruta.visitor.InferenceCrowd;
import org.apache.uima.ruta.visitor.RutaInferenceVisitor;
import org.apache.uima.ruta.visitor.StatisticsVisitor;
import org.apache.uima.ruta.visitor.TimeProfilerVisitor;
import org.apache.uima.util.InvalidXMLException;
public class RutaEngine extends JCasAnnotator_ImplBase {
/** File extension of Ruta script files; appended when a script name is resolved to a location. */
public static final String SCRIPT_FILE_EXTENSION = ".ruta";
/** Fully qualified name of the SourceDocumentInformation annotation type. */
public static final String SOURCE_DOCUMENT_INFORMATION = "org.apache.uima.examples.SourceDocumentInformation";
/**
 * Fully qualified name of the RutaBasic annotation type. Annotations of this type are removed
 * from the indexes after processing when the parameter removeBasics is enabled.
 */
public static final String BASIC_TYPE = "org.apache.uima.ruta.type.RutaBasic";
/** Fully qualified name of the RutaOptional annotation type. */
public static final String OPTIONAL_TYPE = "org.apache.uima.ruta.type.RutaOptional";
/**
* Load script in Java notation, with "{@code .}" as package separator and no extension. File
* needs to be located in the path specified below with ending {@code .ruta}.
*/
public static final String PARAM_MAIN_SCRIPT = "mainScript";
@ConfigurationParameter(name = PARAM_MAIN_SCRIPT, mandatory = false)
private String mainScipt;
/**
* This parameter specifies the encoding of the rule files. Its default value is "UTF-8".
*/
public static final String PARAM_SCRIPT_ENCODING = "scriptEncoding";
@ConfigurationParameter(name = PARAM_SCRIPT_ENCODING, mandatory = false, defaultValue = "UTF-8")
private String scriptEncoding;
/**
* The parameter scriptPaths refers to a list of String values, which specify the possible
* locations of script files. The given locations are absolute paths. A typical value for this
* parameter is, for example, "C:/Ruta/MyProject/script/". If the parameter mainScript is set to
* org.apache.uima.Main, then the absolute path of the script file has to be
* "C:/Ruta/MyProject/script/org/apache/uima/Main.ruta". This parameter can contain multiple
* values, as the main script can refer to multiple projects similar to a class path in Java.
*/
public static final String PARAM_SCRIPT_PATHS = "scriptPaths";
@ConfigurationParameter(name = PARAM_SCRIPT_PATHS, mandatory = false)
private String[] scriptPaths;
/**
* This parameter specifies the possible locations for descriptors like analysis engines or type
* systems, similar to the parameter scriptPaths for the script files. A typical value for this
* parameter is for example "C:/Ruta/MyProject/descriptor/". The relative values of the parameter
* additionalEngines are resolved to these absolute locations. This parameter can contain multiple
* values, as the main script can refer to multiple projects similar to a class path in Java.
*/
public static final String PARAM_DESCRIPTOR_PATHS = "descriptorPaths";
@ConfigurationParameter(name = PARAM_DESCRIPTOR_PATHS, mandatory = false, defaultValue = {})
private String[] descriptorPaths;
/**
* This parameter specifies the possible locations of additional resources like word lists or CSV
* tables. The string values have to contain absolute locations, for example,
* "C:/Ruta/MyProject/resources/".
*/
public static final String PARAM_RESOURCE_PATHS = "resourcePaths";
@ConfigurationParameter(name = PARAM_RESOURCE_PATHS, mandatory = false, defaultValue = {})
private String[] resourcePaths;
/**
* The parameter additionalScripts is defined as a list of string values and contains script
* files, which are additionally loaded by the analysis engine. These script files are specified
* by their complete namespace, exactly like the value of the parameter mainScript and can be
* referred to by language elements, e.g., by executing the containing rules. An exemplary value of
* this parameter is "org.apache.uima.SecondaryScript". In this example, the main script could
* import this script file by the declaration "SCRIPT org.apache.uima.SecondaryScript;" and then
* could execute it with the rule "Document{-> CALL(SecondaryScript)};".
*/
public static final String PARAM_ADDITIONAL_SCRIPTS = "additionalScripts";
@ConfigurationParameter(name = PARAM_ADDITIONAL_SCRIPTS, mandatory = false, defaultValue = {})
private String[] additionalScripts;
/**
* This parameter contains a list of additional analysis engines, which can be executed by the
* UIMA Ruta rules. The single values are given by the name of the analysis engine with their
* complete namespace and have to be located relative to one value of the parameter
* descriptorPaths, the location where the analysis engine searches for the descriptor file. An
* example for one value of the parameter is "utils.HtmlAnnotator", which points to the descriptor
* "HtmlAnnotator.xml" in the folder "utils".
*/
public static final String PARAM_ADDITIONAL_ENGINES = "additionalEngines";
@ConfigurationParameter(name = PARAM_ADDITIONAL_ENGINES, mandatory = false, defaultValue = {})
private String[] additionalEngines;
/**
* List of additional uimaFIT analysis engines, which are loaded without descriptor.
*/
public static final String PARAM_ADDITIONAL_UIMAFIT_ENGINES = "additionalUimafitEngines";
@ConfigurationParameter(name = PARAM_ADDITIONAL_UIMAFIT_ENGINES, mandatory = false, defaultValue = {})
private String[] additionalUimafitEngines;
/**
* The parameter "additionalEngineLoaders" specifies a list of optional implementations of the
* interface "org.apache.uima.ruta.extensions.IEngineLoader", which can be used for
* application-specific configurations of additional analysis engines.
*/
public static final String PARAM_ADDITIONAL_ENGINE_LOADERS = "additionalEngineLoaders";
@ConfigurationParameter(name = PARAM_ADDITIONAL_ENGINE_LOADERS, mandatory = false, defaultValue = {})
private String[] additionalEngineLoaders;
/**
* This parameter specifies optional extensions of the UIMA Ruta language. The elements of the
* string list have to implement the interface "org.apache.uima.ruta.extensions.IRutaExtension".
* With these extensions, application-specific conditions and actions can be added to the set of
* provided ones.
*/
public static final String PARAM_ADDITIONAL_EXTENSIONS = "additionalExtensions";
@ConfigurationParameter(name = PARAM_ADDITIONAL_EXTENSIONS, mandatory = false, defaultValue = {})
private String[] additionalExtensions;
/**
* This boolean parameter indicates whether the script or resource files should be reloaded when
* processing a CAS. The default value is set to false. In this case, the script files are loaded
* when the analysis engine is initialized. If script files or resource files are extended, e.g.,
* a dictionary is filled while a collection of documents is processed, then the parameter
* needs to be set to true in order to include the changes.
*/
public static final String PARAM_RELOAD_SCRIPT = "reloadScript";
@ConfigurationParameter(name = PARAM_RELOAD_SCRIPT, mandatory = false, defaultValue = "false")
private Boolean reloadScript;
/**
* This list of string values refers to implementations of the interface
* "org.apache.uima.ruta.seed.RutaAnnotationSeeder", which can be used to automatically add
* annotations to the CAS. The default value of the parameter is a single seeder, namely
* "org.apache.uima.ruta.seed.DefaultSeeder" that adds annotations for token classes like CW,
* MARKUP or SEMICOLON. Remember that additional annotations can also be added with an additional
* engine that is executed by a UIMA Ruta rule.
*/
public static final String PARAM_SEEDERS = "seeders";
@ConfigurationParameter(name = PARAM_SEEDERS, mandatory = false, defaultValue = { "org.apache.uima.ruta.seed.DefaultSeeder" })
private String[] seeders;
/**
* This parameter specifies a list of types, which are filtered by default when executing a script
* file. Using the default values of this parameter, whitespaces, line breaks and markup elements
* are not visible to Ruta rules. The visibility of annotations and, therefore, the covered text
* can be changed using the actions FILTERTYPE and RETAINTYPE.
*/
public static final String PARAM_DEFAULT_FILTERED_TYPES = "defaultFilteredTypes";
@ConfigurationParameter(name = PARAM_DEFAULT_FILTERED_TYPES, mandatory = false, defaultValue = {
"org.apache.uima.ruta.type.SPACE", "org.apache.uima.ruta.type.NBSP",
"org.apache.uima.ruta.type.BREAK", "org.apache.uima.ruta.type.MARKUP" })
private String[] defaultFilteredTypes;
/**
* This parameter specifies whether the inference annotations created by the analysis engine
* should be removed after processing the CAS. The default value is set to true.
*/
public static final String PARAM_REMOVE_BASICS = "removeBasics";
@ConfigurationParameter(name = PARAM_REMOVE_BASICS, mandatory = false, defaultValue = "true")
private Boolean removeBasics;
/**
* If this parameter is set to true, then the Ruta rules are not forced to start to match with the
* first rule element. Rather, the rule element referring to the most rare type is chosen. This
* option can be utilized to optimize the performance. Please mind that the matching result can
* vary in some cases when greedy rule elements are applied. The default value is set to false.
*/
public static final String PARAM_DYNAMIC_ANCHORING = "dynamicAnchoring";
@ConfigurationParameter(name = PARAM_DYNAMIC_ANCHORING, mandatory = false, defaultValue = "false")
private Boolean dynamicAnchoring;
/**
* This parameter specifies whether the memory consumption should be reduced. This parameter
* should be set to true for very large CAS documents (e.g., > 500k tokens), but it also reduces
* the performance. The default value is set to false.
*/
public static final String PARAM_LOW_MEMORY_PROFILE = "lowMemoryProfile";
@ConfigurationParameter(name = PARAM_LOW_MEMORY_PROFILE, mandatory = false, defaultValue = "false")
private Boolean lowMemoryProfile;
/**
* This parameter specifies whether a different inference strategy for composed rule elements
* should be applied. This option is only necessary when the composed rule element is expected to
* match very often, e.g., a rule element like (ANY ANY)+. The default value of this parameter is
* set to false.
*/
public static final String PARAM_SIMPLE_GREEDY_FOR_COMPOSED = "simpleGreedyForComposed";
@ConfigurationParameter(name = PARAM_SIMPLE_GREEDY_FOR_COMPOSED, mandatory = false, defaultValue = "false")
private Boolean simpleGreedyForComposed;
/**
* If this parameter is set to true, then additional information about the execution of a rule
* script is added to the CAS. The actual information is specified by the following parameters.
* The default value of this parameter is set to false.
*/
public static final String PARAM_DEBUG = "debug";
@ConfigurationParameter(name = PARAM_DEBUG, mandatory = false, defaultValue = "false")
private Boolean debug;
/**
* This parameter specifies whether the match information (covered text) of the rules should be
* stored in the CAS. The default value of this parameter is set to false.
*/
public static final String PARAM_DEBUG_WITH_MATCHES = "debugWithMatches";
@ConfigurationParameter(name = PARAM_DEBUG_WITH_MATCHES, mandatory = false, defaultValue = "false")
private Boolean debugWithMatches;
/**
* This parameter specifies a list of rule-ids that enumerate the rule for which debug information
* should be created. No specific ids are given by default.
*/
public static final String PARAM_DEBUG_ONLY_FOR = "debugOnlyFor";
@ConfigurationParameter(name = PARAM_DEBUG_ONLY_FOR, mandatory = false, defaultValue = {})
private String[] debugOnlyFor;
/**
* If this parameter is set to true, then additional information about the runtime of applied
* rules is added to the CAS. The default value of this parameter is set to false.
*/
public static final String PARAM_PROFILE = "profile";
@ConfigurationParameter(name = PARAM_PROFILE, mandatory = false, defaultValue = "false")
private Boolean profile;
/**
* If this parameter is set to true, then additional information about the runtime of UIMA Ruta
* language elements like conditions and actions is added to the CAS. The default value of this
* parameter is set to false.
*/
public static final String PARAM_STATISTICS = "statistics";
@ConfigurationParameter(name = PARAM_STATISTICS, mandatory = false, defaultValue = "false")
private Boolean statistics;
/**
* If this parameter is set to true, then additional information about what annotation was created
* by which rule is added to the CAS. The default value of this parameter is set to false.
*/
public static final String PARAM_CREATED_BY = "createdBy";
@ConfigurationParameter(name = PARAM_CREATED_BY, mandatory = false, defaultValue = "false")
private Boolean createdBy;
/** The UIMA context stored by {@code initialize(UimaContext)}. */
private UimaContext context;
/** The parsed main script; assigned in {@code initializeScript} and applied in {@code process}. */
private RutaModule script;
/** Factory that is handed to the script parser and receives the configured language extensions. */
private RutaExternalFactory factory;
/** Loader used to create the additional analysis engines from their descriptors. */
private RutaEngineLoader engineLoader;
/** Name of the main script as read from the parameter mainScript during initialization. */
private String mainScript;
/** Verbalizer passed to the visitors; also receives the verbalizers of loaded extensions. */
private RutaVerbalizer verbalizer;
/** Set to true in {@code process} once the types of the script have been initialized. */
private boolean initialized = false;
/** Types returned by the seeders during the most recent {@code initializeStream} call. */
private List<Type> seedTypes;
/**
 * Reads every configuration parameter from the UIMA context, substitutes fallback values for
 * parameters that have no value, creates the helper objects (external factory, engine loader and
 * verbalizer) and, unless script reloading is enabled, loads the main script for the default
 * view.
 * <p>
 * Fallbacks applied when the context returns {@code null}: empty arrays for resourcePaths,
 * debugOnlyFor and defaultFilteredTypes, "UTF-8" for scriptEncoding, {@code true} for
 * debugWithMatches and {@code false} for all other boolean parameters (including removeBasics).
 * NOTE(review): the fallbacks for removeBasics and debugWithMatches differ from the default
 * values declared in the parameter annotations - confirm which one is intended.
 *
 * @param aContext
 *          the UIMA context that provides the configuration parameter values
 * @throws ResourceInitializationException
 *           if the main script cannot be initialized
 */
@Override
public void initialize(UimaContext aContext) throws ResourceInitializationException {
  super.initialize(aContext);
  if (aContext == null && context != null) {
    aContext = context;
  }

  // Step 1: copy the raw parameter values into the fields.
  seeders = (String[]) aContext.getConfigParameterValue(PARAM_SEEDERS);
  removeBasics = (Boolean) aContext.getConfigParameterValue(PARAM_REMOVE_BASICS);
  scriptPaths = (String[]) aContext.getConfigParameterValue(PARAM_SCRIPT_PATHS);
  descriptorPaths = (String[]) aContext.getConfigParameterValue(PARAM_DESCRIPTOR_PATHS);
  mainScript = (String) aContext.getConfigParameterValue(PARAM_MAIN_SCRIPT);
  additionalScripts = (String[]) aContext.getConfigParameterValue(PARAM_ADDITIONAL_SCRIPTS);
  additionalEngines = (String[]) aContext.getConfigParameterValue(PARAM_ADDITIONAL_ENGINES);
  additionalUimafitEngines = (String[]) aContext
          .getConfigParameterValue(PARAM_ADDITIONAL_UIMAFIT_ENGINES);
  additionalExtensions = (String[]) aContext.getConfigParameterValue(PARAM_ADDITIONAL_EXTENSIONS);
  additionalEngineLoaders = (String[]) aContext
          .getConfigParameterValue(PARAM_ADDITIONAL_ENGINE_LOADERS);
  debug = (Boolean) aContext.getConfigParameterValue(PARAM_DEBUG);
  debugOnlyFor = (String[]) aContext.getConfigParameterValue(PARAM_DEBUG_ONLY_FOR);
  profile = (Boolean) aContext.getConfigParameterValue(PARAM_PROFILE);
  statistics = (Boolean) aContext.getConfigParameterValue(PARAM_STATISTICS);
  createdBy = (Boolean) aContext.getConfigParameterValue(PARAM_CREATED_BY);
  debugWithMatches = (Boolean) aContext.getConfigParameterValue(PARAM_DEBUG_WITH_MATCHES);
  resourcePaths = (String[]) aContext.getConfigParameterValue(PARAM_RESOURCE_PATHS);
  scriptEncoding = (String) aContext.getConfigParameterValue(PARAM_SCRIPT_ENCODING);
  defaultFilteredTypes = (String[]) aContext
          .getConfigParameterValue(PARAM_DEFAULT_FILTERED_TYPES);
  dynamicAnchoring = (Boolean) aContext.getConfigParameterValue(PARAM_DYNAMIC_ANCHORING);
  reloadScript = (Boolean) aContext.getConfigParameterValue(PARAM_RELOAD_SCRIPT);
  lowMemoryProfile = (Boolean) aContext.getConfigParameterValue(PARAM_LOW_MEMORY_PROFILE);
  simpleGreedyForComposed = (Boolean) aContext
          .getConfigParameterValue(PARAM_SIMPLE_GREEDY_FOR_COMPOSED);

  // Step 2: replace missing values, so later code can unbox and iterate without null checks.
  if (resourcePaths == null) {
    resourcePaths = new String[0];
  }
  if (removeBasics == null) {
    removeBasics = Boolean.FALSE;
  }
  if (debug == null) {
    debug = Boolean.FALSE;
  }
  if (debugOnlyFor == null) {
    debugOnlyFor = new String[0];
  }
  if (profile == null) {
    profile = Boolean.FALSE;
  }
  if (statistics == null) {
    statistics = Boolean.FALSE;
  }
  if (createdBy == null) {
    createdBy = Boolean.FALSE;
  }
  if (debugWithMatches == null) {
    debugWithMatches = Boolean.TRUE;
  }
  if (scriptEncoding == null) {
    scriptEncoding = "UTF-8";
  }
  if (defaultFilteredTypes == null) {
    defaultFilteredTypes = new String[0];
  }
  if (dynamicAnchoring == null) {
    dynamicAnchoring = Boolean.FALSE;
  }
  if (reloadScript == null) {
    reloadScript = Boolean.FALSE;
  }
  if (lowMemoryProfile == null) {
    lowMemoryProfile = Boolean.FALSE;
  }
  if (simpleGreedyForComposed == null) {
    simpleGreedyForComposed = Boolean.FALSE;
  }

  // Step 3: create the helpers and register extensions and engine loaders.
  this.context = aContext;
  factory = new RutaExternalFactory();
  engineLoader = new RutaEngineLoader();
  verbalizer = new RutaVerbalizer();
  if (!factory.isInitialized()) {
    initializeExtensionWithClassPath();
  }
  if (!engineLoader.isInitialized()) {
    initializeEngineLoaderWithClassPath();
  }

  // Step 4: with reloading enabled the script is loaded for each CAS in process() instead.
  if (reloadScript) {
    return;
  }
  try {
    initializeScript(CAS.NAME_DEFAULT_SOFA);
  } catch (AnalysisEngineProcessException e) {
    throw new ResourceInitializationException(e);
  }
}
/**
 * Applies the script to the given CAS.
 * <p>
 * The script is (re)loaded if reloading is enabled, or if this is the first CAS and its view is
 * not the default view; otherwise the environments of the already loaded script are reset. After
 * the rules have been applied and the visitors have been notified, the RutaBasic annotations and
 * the annotations of all seed types are removed from the indexes if removeBasics is enabled.
 *
 * @param jcas
 *          the CAS to process
 * @throws AnalysisEngineProcessException
 *           if loading the script fails or applying it throws anything
 */
@Override
public void process(JCas jcas) throws AnalysisEngineProcessException {
  CAS cas = jcas.getCas();

  boolean loadScriptNow = reloadScript;
  if (!loadScriptNow && !initialized) {
    loadScriptNow = !cas.getViewName().equals(CAS.NAME_DEFAULT_SOFA);
  }
  if (loadScriptNow) {
    initializeScript(cas.getViewName());
  } else {
    resetEnvironments(cas);
  }

  if (reloadScript || !initialized) {
    initializeTypes(script, cas);
    initialized = true;
  }

  InferenceCrowd crowd = initializeCrowd();
  RutaStream stream = initializeStream(cas, crowd);
  stream.setDynamicAnchoring(dynamicAnchoring);
  try {
    script.apply(stream, crowd);
  } catch (Throwable e) {
    throw new AnalysisEngineProcessException(AnalysisEngineProcessException.ANNOTATOR_EXCEPTION,
            new Object[] {}, e);
  }
  crowd.finished(stream);

  if (!removeBasics) {
    return;
  }

  // Collect first and remove afterwards, so no index is modified while it is iterated.
  List<Type> typesToClear = new ArrayList<Type>();
  typesToClear.add(cas.getTypeSystem().getType(BASIC_TYPE));
  typesToClear.addAll(seedTypes);

  List<AnnotationFS> obsolete = new ArrayList<AnnotationFS>();
  for (Type type : typesToClear) {
    AnnotationIndex<AnnotationFS> index = cas.getAnnotationIndex(type);
    for (AnnotationFS annotation : index) {
      obsolete.add(annotation);
    }
  }
  for (AnnotationFS annotation : obsolete) {
    cas.removeFsFromIndexes(annotation);
  }
}
/**
 * Resets the environments of the main script and of every script it references.
 *
 * @param cas
 *          the CAS the environments are reset for
 */
private void resetEnvironments(CAS cas) {
  resetEnvironment(script, cas);
  for (RutaModule referenced : script.getScripts().values()) {
    resetEnvironment(referenced, cas);
  }
}
/**
 * Resets the environment of the block returned by {@code getBlock(null)} and of all blocks
 * registered in the given module.
 *
 * @param module
 *          the module whose block environments are reset
 * @param cas
 *          the CAS passed to each environment reset
 */
private void resetEnvironment(RutaModule module, CAS cas) {
  module.getBlock(null).getEnvironment().reset(cas);
  for (RutaBlock registered : module.getBlocks().values()) {
    registered.getEnvironment().reset(cas);
  }
}
/**
 * Initializes the types in the environment of the main script's root block and makes that root
 * block the parent of the root blocks of all directly or indirectly referenced scripts.
 *
 * @param script
 *          the main script
 * @param cas
 *          the CAS whose types are made known to the environment
 */
private void initializeTypes(RutaModule script, CAS cas) {
  // TODO find a better solution for telling everyone about the types!
  RutaBlock rootOfMainScript = script.getBlock(null);
  rootOfMainScript.getEnvironment().initializeTypes(cas);
  for (RutaModule referenced : script.getScripts().values()) {
    // every directly referenced script starts with its own, empty set of visited modules
    Collection<RutaModule> visited = new ArrayList<RutaModule>();
    relinkEnvironments(referenced, rootOfMainScript, visited);
  }
}
/**
 * Sets the given root block as parent of the script's root block and recurses into the scripts
 * referenced by it. Scripts already contained in {@code processed} are skipped, which stops the
 * recursion on cyclic references.
 *
 * @param script
 *          the script to relink
 * @param mainRootBlock
 *          the root block of the main script
 * @param processed
 *          the scripts visited so far; extended by this call
 */
private void relinkEnvironments(RutaModule script, RutaBlock mainRootBlock,
        Collection<RutaModule> processed) {
  if (processed.contains(script)) {
    return;
  }
  processed.add(script);
  script.getBlock(null).setParent(mainRootBlock);
  for (RutaModule referenced : script.getScripts().values()) {
    relinkEnvironments(referenced, mainRootBlock, processed);
  }
}
/**
 * Instantiates the classes named in the parameter additionalExtensions and registers each of
 * them with the verbalizer and, for every extension name it knows, with the external factory.
 * <p>
 * Loading is best effort: a class that cannot be loaded or instantiated, or that does not
 * implement {@link IRutaExtension}, is skipped. Skipped entries are reported as warnings instead
 * of being dropped silently, so a misconfigured extension can be diagnosed.
 */
private void initializeExtensionWithClassPath() {
  if (additionalExtensions == null) {
    return;
  }
  for (String each : additionalExtensions) {
    try {
      Class<?> extensionClass = Class.forName(each);
      if (!IRutaExtension.class.isAssignableFrom(extensionClass)) {
        java.util.logging.Logger.getLogger(getClass().getName()).log(
                java.util.logging.Level.WARNING,
                "Ignoring additional extension [" + each + "]: it does not implement "
                        + IRutaExtension.class.getName());
        continue;
      }
      IRutaExtension extension = (IRutaExtension) extensionClass.newInstance();
      verbalizer.addExternalVerbalizers(extension);
      for (String name : extension.getKnownExtensions()) {
        factory.addExtension(name, extension);
      }
    } catch (Exception e) {
      // Keep going with the remaining extensions, but leave a trace of what went wrong.
      java.util.logging.Logger.getLogger(getClass().getName()).log(
              java.util.logging.Level.WARNING,
              "Unable to load additional extension [" + each + "]", e);
    }
  }
}
/**
 * Instantiates the classes named in the parameter additionalEngineLoaders and registers each of
 * them with the engine loader for every engine name it knows.
 * <p>
 * Loading is best effort: a class that cannot be loaded or instantiated, or that does not
 * implement {@link IEngineLoader}, is skipped. Skipped entries are reported as warnings instead
 * of being dropped silently, so a misconfigured loader can be diagnosed.
 */
private void initializeEngineLoaderWithClassPath() {
  if (additionalEngineLoaders == null) {
    return;
  }
  for (String each : additionalEngineLoaders) {
    try {
      Class<?> loaderClass = Class.forName(each);
      if (!IEngineLoader.class.isAssignableFrom(loaderClass)) {
        java.util.logging.Logger.getLogger(getClass().getName()).log(
                java.util.logging.Level.WARNING,
                "Ignoring additional engine loader [" + each + "]: it does not implement "
                        + IEngineLoader.class.getName());
        continue;
      }
      IEngineLoader loader = (IEngineLoader) loaderClass.newInstance();
      for (String name : loader.getKnownEngines()) {
        engineLoader.addLoader(name, loader);
      }
    } catch (Exception e) {
      // Keep going with the remaining loaders, but leave a trace of what went wrong.
      java.util.logging.Logger.getLogger(getClass().getName()).log(
              java.util.logging.Level.WARNING,
              "Unable to load additional engine loader [" + each + "]", e);
    }
  }
}
/**
 * Creates the crowd of inference visitors selected by the parameters debug, profile, statistics
 * and createdBy. The visitors are added in exactly that order.
 *
 * @return a new crowd containing one visitor per enabled parameter
 */
private InferenceCrowd initializeCrowd() {
  List<RutaInferenceVisitor> enabled = new ArrayList<RutaInferenceVisitor>();
  if (debug) {
    List<String> onlyFor = Arrays.asList(debugOnlyFor);
    enabled.add(new DebugInfoCollectorVisitor(debug, debugWithMatches, onlyFor, verbalizer));
  }
  if (profile) {
    enabled.add(new TimeProfilerVisitor());
  }
  if (statistics) {
    enabled.add(new StatisticsVisitor(verbalizer));
  }
  if (createdBy) {
    enabled.add(new CreatedByVisitor(verbalizer));
  }
  return new InferenceCrowd(enabled);
}
/**
 * Prepares the stream the rules are applied on: resolves the default filtered types that exist
 * in the type system, runs the seeders (remembering the returned types in {@code seedTypes})
 * and initializes the basics of the new stream.
 *
 * @param cas
 *          the CAS to process
 * @param crowd
 *          the visitors handed to the stream
 * @return the initialized stream
 * @throws AnalysisEngineProcessException
 *           if a seeder cannot be created or fails
 */
private RutaStream initializeStream(CAS cas, InferenceCrowd crowd)
        throws AnalysisEngineProcessException {
  TypeSystem typeSystem = cas.getTypeSystem();
  Collection<Type> hiddenTypes = new ArrayList<Type>();
  for (int i = 0; i < defaultFilteredTypes.length; i++) {
    Type resolved = typeSystem.getType(defaultFilteredTypes[i]);
    if (resolved == null) {
      // type names unknown to this type system are ignored
      continue;
    }
    hiddenTypes.add(resolved);
  }
  FilterManager filter = new FilterManager(hiddenTypes, cas);
  Type basicType = typeSystem.getType(BASIC_TYPE);
  seedTypes = seedAnnotations(cas);
  RutaStream stream = new RutaStream(cas, basicType, filter, lowMemoryProfile,
          simpleGreedyForComposed, crowd);
  stream.initalizeBasics();
  return stream;
}
/**
 * Instantiates every configured seeder class and lets it seed the CAS with the document text.
 *
 * @param cas
 *          the CAS that receives the seed annotations
 * @return the values returned by the seeders, in configuration order; empty if no seeders are
 *         configured
 * @throws AnalysisEngineProcessException
 *           if a seeder class cannot be found or instantiated, or if seeding fails
 */
private List<Type> seedAnnotations(CAS cas) throws AnalysisEngineProcessException {
  List<Type> seeded = new ArrayList<Type>();
  if (seeders == null) {
    return seeded;
  }
  for (String seederClass : seeders) {
    Class<?> seederType;
    try {
      seederType = Class.forName(seederClass);
    } catch (ClassNotFoundException e) {
      throw new AnalysisEngineProcessException(e);
    }
    try {
      RutaAnnotationSeeder seeder = (RutaAnnotationSeeder) seederType.newInstance();
      seeded.add(seeder.seed(cas.getDocumentText(), cas));
    } catch (Exception e) {
      throw new AnalysisEngineProcessException(e);
    }
  }
  return seeded;
}
/**
 * Loads the main script together with the additional uimaFIT engines, descriptor-based engines
 * and scripts, and wires the loaded scripts and engines into every loaded module.
 * <p>
 * Scripts and descriptors are first looked up in the configured paths on the file system; if
 * nothing is found there, they are loaded from the classpath. Does nothing if no main script is
 * configured.
 * <p>
 * Whenever a failure is reported as a "cannot be found" {@link FileNotFoundException}, the
 * original exception is attached as its cause, so that, for example, a parse error or an invalid
 * descriptor is not mistaken for a missing file.
 *
 * @param viewName
 *          the view name passed on to the engine loader
 * @throws AnalysisEngineProcessException
 *           if the main script, an additional script or an additional engine cannot be loaded
 */
private void initializeScript(String viewName) throws AnalysisEngineProcessException {
  if (mainScript == null) {
    return;
  }

  // --- main script -------------------------------------------------------------------------
  String scriptLocation = locate(mainScript, scriptPaths, SCRIPT_FILE_EXTENSION);
  if (scriptLocation == null) {
    // not found in any script path: fall back to the classpath
    Exception loadFailure = null;
    try {
      String mainScriptPath = mainScript.replaceAll("\\.", "/") + SCRIPT_FILE_EXTENSION;
      script = loadScriptIS(mainScriptPath);
    } catch (IOException e) {
      loadFailure = e;
    } catch (RecognitionException e) {
      loadFailure = e;
    }
    if (loadFailure != null) {
      FileNotFoundException notFound = new FileNotFoundException("Script [" + mainScript
              + "] cannot be found at [" + collectionToString(scriptPaths)
              + "] or classpath with extension .ruta");
      // FileNotFoundException has no cause constructor, so attach the real reason explicitly
      notFound.initCause(loadFailure);
      throw new AnalysisEngineProcessException(notFound);
    }
  } else {
    try {
      script = loadScript(scriptLocation);
    } catch (Exception e) {
      throw new AnalysisEngineProcessException(e);
    }
  }

  Map<String, RutaModule> additionalScriptsMap = new HashMap<String, RutaModule>();
  Map<String, AnalysisEngine> additionalEnginesMap = new HashMap<String, AnalysisEngine>();

  // --- uimaFIT engines (created from the class, no descriptor) --------------------------------
  if (additionalUimafitEngines != null) {
    for (String eachUimafitEngine : additionalUimafitEngines) {
      AnalysisEngine eachEngine;
      try {
        @SuppressWarnings("unchecked")
        Class<? extends AnalysisComponent> uimafitClass = (Class<? extends AnalysisComponent>) Class
                .forName(eachUimafitEngine);
        eachEngine = AnalysisEngineFactory.createEngine(uimafitClass);
      } catch (ClassNotFoundException e) {
        throw new AnalysisEngineProcessException(e);
      } catch (ResourceInitializationException e) {
        throw new AnalysisEngineProcessException(e);
      }
      // register under the complete name and, if the name is qualified, under its last segment
      additionalEnginesMap.put(eachUimafitEngine, eachEngine);
      String[] nameParts = eachUimafitEngine.split("\\.");
      if (nameParts.length > 1) {
        additionalEnginesMap.put(nameParts[nameParts.length - 1], eachEngine);
      }
    }
  }

  // --- descriptor-based engines -----------------------------------------------------------------
  if (additionalEngines != null) {
    for (String eachEngineLocation : additionalEngines) {
      AnalysisEngine eachEngine = null;
      String location = locate(eachEngineLocation, descriptorPaths, ".xml");
      if (location == null) {
        // not found in any descriptor path: fall back to the classpath
        String locationIS = locateIS(eachEngineLocation, descriptorPaths, ".xml");
        Exception loadFailure = null;
        try {
          eachEngine = engineLoader.loadEngineIS(locationIS, viewName);
        } catch (InvalidXMLException e) {
          loadFailure = e;
        } catch (ResourceInitializationException e) {
          loadFailure = e;
        } catch (IOException e) {
          loadFailure = e;
        } catch (ResourceConfigurationException e) {
          throw new AnalysisEngineProcessException(e);
        } catch (URISyntaxException e) {
          throw new AnalysisEngineProcessException(e);
        }
        if (loadFailure != null) {
          FileNotFoundException notFound = new FileNotFoundException("Engine at ["
                  + eachEngineLocation + "] cannot be found in ["
                  + collectionToString(descriptorPaths)
                  + "] with extension .xml (from mainScript=" + mainScript + " in "
                  + collectionToString(scriptPaths) + ")");
          notFound.initCause(loadFailure);
          throw new AnalysisEngineProcessException(notFound);
        }
      } else {
        try {
          eachEngine = engineLoader.loadEngine(location, viewName);
        } catch (Exception e) {
          throw new AnalysisEngineProcessException(e);
        }
      }
      // register under the complete name and, if the name is qualified, under its last segment
      additionalEnginesMap.put(eachEngineLocation, eachEngine);
      String[] nameParts = eachEngineLocation.split("\\.");
      if (nameParts.length > 1) {
        additionalEnginesMap.put(nameParts[nameParts.length - 1], eachEngine);
      }
    }
  }

  // --- additional scripts (may add further scripts and engines transitively) ----------------
  if (additionalScripts != null) {
    for (String add : additionalScripts) {
      recursiveLoadScript(add, additionalScriptsMap, additionalEnginesMap, viewName);
    }
  }

  // --- make all scripts and engines known to every loaded module -----------------------------
  for (RutaModule each : additionalScriptsMap.values()) {
    each.setScriptDependencies(additionalScriptsMap);
  }
  script.setScriptDependencies(additionalScriptsMap);
  for (RutaModule each : additionalScriptsMap.values()) {
    each.setEngineDependencies(additionalEnginesMap);
  }
  script.setEngineDependencies(additionalEnginesMap);
}
/**
 * Adds a SourceDocumentInformation annotation spanning the document annotation and carrying the
 * URI path of the given file. Nothing happens if the type is not part of the type system or if
 * such an annotation is already indexed.
 *
 * @param cas
 *          the CAS to add the annotation to
 * @param each
 *          the file whose URI path is stored in the feature "uri"
 */
public static void addSourceDocumentInformation(CAS cas, File each) {
  Type sdiType = cas.getTypeSystem()
          .getType("org.apache.uima.examples.SourceDocumentInformation");
  if (sdiType == null) {
    return;
  }
  if (cas.getAnnotationIndex(sdiType).size() != 0) {
    return;
  }
  int begin = cas.getDocumentAnnotation().getBegin();
  int end = cas.getDocumentAnnotation().getEnd();
  AnnotationFS sdi = cas.createAnnotation(sdiType, begin, end);
  Feature uriFeature = sdiType.getFeatureByBaseName("uri");
  String uriPath = each.toURI().getPath();
  sdi.setStringValue(uriFeature, uriPath);
  cas.addFsToIndexes(sdi);
}
/**
 * Removes all SourceDocumentInformation annotations from the indexes of the given CAS. Nothing
 * happens if the type is not part of the type system.
 *
 * @param cas
 *          the CAS to clean up
 */
public static void removeSourceDocumentInformation(CAS cas) {
  Type sdiType = cas.getTypeSystem()
          .getType("org.apache.uima.examples.SourceDocumentInformation");
  if (sdiType == null) {
    return;
  }
  // copy first, so the index is not modified while it is being iterated
  List<AnnotationFS> indexed = new ArrayList<AnnotationFS>();
  AnnotationIndex<AnnotationFS> sdiIndex = cas.getAnnotationIndex(sdiType);
  for (AnnotationFS sdi : sdiIndex) {
    indexed.add(sdi);
  }
  for (AnnotationFS sdi : indexed) {
    cas.removeFsFromIndexes(sdi);
  }
}
/**
 * Resolves a dotted name to the absolute path of an existing file below one of the given paths.
 * Shorthand for the four-argument variant with {@code mustExist} set to {@code true}.
 *
 * @param name
 *          the name with "." as separator
 * @param paths
 *          the base directories to search
 * @param suffix
 *          the file extension appended to the name
 * @return the absolute path of the first existing candidate, or {@code null}
 */
public static String locate(String name, String[] paths, String suffix) {
  final boolean mustExist = true;
  return locate(name, paths, suffix, mustExist);
}
/**
 * Converts a dotted name to a resource path. Shorthand for the four-argument variant with
 * {@code mustExist} set to {@code true}.
 *
 * @param name
 *          the name with "." as separator
 * @param paths
 *          passed through to the four-argument variant
 * @param suffix
 *          the file extension appended to the name
 * @return the result of the four-argument variant
 */
public static String locateIS(String name, String[] paths, String suffix) {
  final boolean mustExist = true;
  return locateIS(name, paths, suffix, mustExist);
}
/**
 * Resolves a dotted name against the given base paths. Every "." in the name is replaced by "/"
 * and the suffix is appended; the paths are then tried in order.
 *
 * @param name
 *          the name with "." as separator
 * @param paths
 *          the base directories to search
 * @param suffix
 *          the file extension appended to the name
 * @param mustExist
 *          if {@code true}, only a candidate that exists on the file system is returned; if
 *          {@code false}, the candidate below the first path is returned
 * @return the absolute path of the selected candidate, or {@code null} if the name or the paths
 *         are {@code null} or no candidate qualifies
 */
public static String locate(String name, String[] paths, String suffix, boolean mustExist) {
  if (name == null || paths == null) {
    return null;
  }
  String relativePath = name.replace('.', '/') + suffix;
  for (int i = 0; i < paths.length; i++) {
    File candidate = new File(paths[i], relativePath);
    if (mustExist && !candidate.exists()) {
      continue;
    }
    return candidate.getAbsolutePath();
  }
  return null;
}
/**
 * Converts a dotted name to a resource path by replacing every "." with "/" and appending the
 * suffix. The arguments {@code paths} and {@code mustExist} are not evaluated.
 *
 * @param name
 *          the name with "." as separator
 * @param paths
 *          ignored
 * @param suffix
 *          the file extension appended to the name
 * @param mustExist
 *          ignored
 * @return the resource path, or {@code null} if the name is {@code null}
 */
public static String locateIS(String name, String[] paths, String suffix, boolean mustExist) {
  return name == null ? null : name.replace('.', '/') + suffix;
}
/**
 * Loads the given script, registers it in {@code additionalScripts} and then loads, recursively,
 * every script and engine it references that is not registered yet.
 * <p>
 * Scripts and engine descriptors are first looked up on the file system and then on the
 * classpath. An engine that cannot be loaded from a classpath descriptor is finally tried as a
 * uimaFIT engine, using its name as class name.
 * <p>
 * When a script cannot be loaded, the reported {@link FileNotFoundException} carries the
 * original exception as its cause, so a parse error is not mistaken for a missing file.
 *
 * @param toLoad
 *          the name of the script with "." as separator
 * @param additionalScripts
 *          the scripts loaded so far; extended by this call
 * @param additionalEngines
 *          the engines loaded so far; extended by this call
 * @param viewName
 *          the view name passed on to the engine loader
 * @throws AnalysisEngineProcessException
 *           if the script or one of its engines cannot be loaded
 */
private void recursiveLoadScript(String toLoad, Map<String, RutaModule> additionalScripts,
        Map<String, AnalysisEngine> additionalEngines, String viewName)
        throws AnalysisEngineProcessException {
  String location = locate(toLoad, scriptPaths, SCRIPT_FILE_EXTENSION);
  RutaModule eachScript = null;
  Exception loadFailure = null;
  try {
    if (location == null) {
      // not found in any script path: fall back to the classpath
      String scriptPath = toLoad.replaceAll("\\.", "/") + SCRIPT_FILE_EXTENSION;
      eachScript = loadScriptIS(scriptPath);
    } else {
      eachScript = loadScript(location);
    }
  } catch (IOException e) {
    loadFailure = e;
  } catch (RecognitionException e) {
    loadFailure = e;
  }
  if (loadFailure != null) {
    FileNotFoundException notFound = new FileNotFoundException("Script [" + toLoad
            + "] cannot be found at [" + collectionToString(scriptPaths)
            + "] with extension .ruta");
    // FileNotFoundException has no cause constructor, so attach the real reason explicitly
    notFound.initCause(loadFailure);
    throw new AnalysisEngineProcessException(notFound);
  }

  // register before recursing, so cyclic script references terminate
  additionalScripts.put(toLoad, eachScript);
  for (String add : eachScript.getScripts().keySet()) {
    if (!additionalScripts.containsKey(add)) {
      recursiveLoadScript(add, additionalScripts, additionalEngines, viewName);
    }
  }

  Set<String> engineKeySet = eachScript.getEngines().keySet();
  for (String eachEngineLocation : engineKeySet) {
    if (additionalEngines.containsKey(eachEngineLocation)) {
      continue;
    }
    String engineLocation = locate(eachEngineLocation, descriptorPaths, ".xml");
    if (engineLocation != null) {
      try {
        AnalysisEngine eachEngine = engineLoader.loadEngine(engineLocation, viewName);
        additionalEngines.put(eachEngineLocation, eachEngine);
      } catch (Exception e) {
        throw new AnalysisEngineProcessException(e);
      }
      continue;
    }
    String engineLocationIS = locateIS(eachEngineLocation, descriptorPaths, ".xml");
    try {
      AnalysisEngine eachEngine = engineLoader.loadEngineIS(engineLocationIS, viewName);
      additionalEngines.put(eachEngineLocation, eachEngine);
    } catch (Exception e) {
      // no usable descriptor on the classpath: the name may denote a uimaFIT engine class
      try {
        @SuppressWarnings("unchecked")
        Class<? extends AnalysisComponent> uimafitClass = (Class<? extends AnalysisComponent>) Class
                .forName(eachEngineLocation);
        AnalysisEngine eachEngine = AnalysisEngineFactory.createEngine(uimafitClass);
        additionalEngines.put(eachEngineLocation, eachEngine);
      } catch (Exception e1) {
        throw new AnalysisEngineProcessException(e1);
      }
    }
  }
}
/**
 * Parses the script file at the given location.
 * <p>
 * The file is read with the configured script encoding. The module name passed to the parser
 * is the file name cut off at the last occurrence of the script file extension, or the
 * complete file name if the extension does not occur in it.
 *
 * @param scriptLocation
 *          path of the script file
 * @return the module produced by the parser
 * @throws IOException
 *           if the file cannot be read
 * @throws RecognitionException
 *           if the parser reports a recognition error
 */
private RutaModule loadScript(String scriptLocation) throws IOException, RecognitionException {
  String fileName = new File(scriptLocation).getName();
  CharStream input = new ANTLRFileStream(scriptLocation, scriptEncoding);

  RutaParser parser = new RutaParser(new CommonTokenStream(new RutaLexer(input)));
  parser.setExternalFactory(factory);
  parser.setResourcePaths(resourcePaths);

  int extensionStart = fileName.lastIndexOf(SCRIPT_FILE_EXTENSION);
  String moduleName = extensionStart == -1 ? fileName : fileName.substring(0, extensionStart);
  return parser.file_input(moduleName);
}
/**
 * Parses a script that is obtained as a resource from the class loader of this class.
 * <p>
 * The module name passed to the parser is the last path segment of the location (segments
 * are separated by {@code /}), cut off at the last occurrence of the script file extension.
 *
 * @param scriptLocation
 *          resource path of the script
 * @return the module produced by the parser
 * @throws IOException
 *           if no resource exists at the location ({@link FileNotFoundException}) or the
 *           resource cannot be read
 * @throws RecognitionException
 *           if the parser reports a recognition error
 */
private RutaModule loadScriptIS(String scriptLocation) throws IOException, RecognitionException {
  InputStream scriptInputStream = getClass().getClassLoader().getResourceAsStream(scriptLocation);
  if (scriptInputStream == null) {
    throw new FileNotFoundException("No script found in location [" + scriptLocation + "]");
  }
  try {
    CharStream st = new ANTLRInputStream(scriptInputStream, scriptEncoding);
    RutaLexer lexer = new RutaLexer(st);
    CommonTokenStream tokens = new CommonTokenStream(lexer);
    RutaParser parser = new RutaParser(tokens);
    parser.setExternalFactory(factory);
    parser.setResourcePaths(resourcePaths);
    String name = scriptLocation;
    if (scriptLocation.indexOf('/') != -1) {
      String[] split = scriptLocation.split("[/]");
      name = split[split.length - 1];
    }
    int lastIndexOf = name.lastIndexOf(SCRIPT_FILE_EXTENSION);
    if (lastIndexOf != -1) {
      name = name.substring(0, lastIndexOf);
    }
    return parser.file_input(name);
  } finally {
    // Release the resource stream on every path, including a failure while the character
    // stream is being set up (e.g. an unsupported encoding).
    try {
      scriptInputStream.close();
    } catch (IOException ignored) {
      // a failure to close a stream that was only read must not hide the parse result or
      // the exception that is already propagating
    }
  }
}
/**
 * @return the external factory that this engine hands to the script parsers it creates
 */
public RutaExternalFactory getFactory() {
  return this.factory;
}
/**
 * @return the loader this engine uses to create the analysis engines referenced by scripts
 */
public RutaEngineLoader getEngineLoader() {
  return this.engineLoader;
}
/**
 * Renders a collection for use in messages: every element is wrapped in square brackets and
 * the whole sequence in curly braces, e.g. <code>{[a][b]}</code>.
 * <p>
 * A <code>null</code> element is rendered as <code>[null]</code>. A <code>null</code>
 * collection yields the empty string, matching the array overload of this method.
 *
 * @param collection
 *          the elements to render, may be <code>null</code>
 * @return the rendered text, never <code>null</code>
 */
private String collectionToString(Collection<?> collection) {
  if (collection == null) {
    return "";
  }
  StringBuilder collectionSB = new StringBuilder("{");
  for (Object element : collection) {
    // append(Object) is null-safe, unlike calling element.toString() directly
    collectionSB.append('[').append(element).append(']');
  }
  return collectionSB.append('}').toString();
}
/**
 * Array variant of the collection rendering used in messages.
 *
 * @param collection
 *          the elements to render, may be <code>null</code>
 * @return the empty string for a <code>null</code> array, otherwise the rendering of the
 *         array's elements as produced by the collection overload
 */
private String collectionToString(Object[] collection) {
  return collection == null ? "" : collectionToString(Arrays.asList(collection));
}
/**
 * Calls the superclass implementation and then forwards the notification to every engine
 * registered in the main script.
 *
 * @throws AnalysisEngineProcessException
 *           if the superclass or one of the engines throws it
 */
@Override
public void batchProcessComplete() throws AnalysisEngineProcessException {
  super.batchProcessComplete();
  for (AnalysisEngine engine : script.getEngines().values()) {
    engine.batchProcessComplete();
  }
}
/**
 * Calls the superclass implementation and then forwards the notification to every engine
 * registered in the main script.
 *
 * @throws AnalysisEngineProcessException
 *           if the superclass or one of the engines throws it
 */
@Override
public void collectionProcessComplete() throws AnalysisEngineProcessException {
  super.collectionProcessComplete();
  for (AnalysisEngine engine : script.getEngines().values()) {
    engine.collectionProcessComplete();
  }
}
}