blob: c1658957b66b9282ccd0a0c23ec816c02fb1fc62 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.ctakes;
import java.io.IOException;
import java.io.OutputStream;
import java.net.URISyntaxException;
import org.apache.ctakes.typesystem.type.refsem.UmlsConcept;
import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
import org.apache.uima.UIMAFramework;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.cas.impl.XCASSerializer;
import org.apache.uima.cas.impl.XmiCasSerializer;
import org.apache.uima.cas.impl.XmiSerializationSharedData;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.cas.FSArray;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.ResourceSpecifier;
import org.apache.uima.util.InvalidXMLException;
import org.apache.uima.util.XMLInputSource;
import org.apache.uima.util.XmlCasSerializer;
import org.xml.sax.SAXException;
/**
* This class provides methods to extract biomedical information from plain text
* using {@link CTAKESContentHandler} that relies on Apache cTAKES.
*
* <p>
* Apache cTAKES is built on top of <a href="https://uima.apache.org/">Apache
* UIMA</a> framework and <a href="https://opennlp.apache.org/">OpenNLP</a>
* toolkit.
* </p>
*/
public class CTAKESUtils {
    // System property read by cTAKES for the UMLS username
    private static final String CTAKES_UMLS_USER = "ctakes.umlsuser";
    // System property read by cTAKES for the UMLS password
    private static final String CTAKES_UMLS_PASS = "ctakes.umlspw";

    /**
     * Creates a new UIMA Analysis Engine (AE) from a descriptor found on the
     * classpath. A fresh engine is produced on every call; callers that want
     * to reuse an engine must keep the returned reference themselves.
     *
     * <p>
     * An Analysis Engine is a component responsible for analyzing unstructured
     * information, discovering and representing semantic content. Unstructured
     * information includes, but is not restricted to, text documents.
     * </p>
     *
     * <p>
     * If both {@code umlsUser} and {@code umlsPass} are non-empty they are
     * published as the JVM-wide system properties {@code ctakes.umlsuser} and
     * {@code ctakes.umlspw} before the engine is produced. It is highly
     * recommended to configure UMLS credentials in the XML descriptor instead.
     * </p>
     *
     * @param aeDescriptor classpath resource name of the XML file including an
     *                     AnalysisEngineDescription that contains all of the
     *                     information needed to instantiate and use an
     *                     AnalysisEngine.
     * @param umlsUser     UMLS username for NLM database (may be {@code null} or empty)
     * @param umlsPass     UMLS password for NLM database (may be {@code null} or empty)
     * @return an Analysis Engine for analyzing unstructured information.
     * @throws IOException                     if the descriptor cannot be found on the
     *                                         classpath or any I/O error occurs.
     * @throws InvalidXMLException             if the input XML is not valid or does not
     *                                         specify a valid ResourceSpecifier.
     * @throws ResourceInitializationException if a failure occurred during production of the
     *                                         resource.
     * @throws URISyntaxException              declared for backward compatibility with
     *                                         existing callers; this implementation reads the
     *                                         descriptor through its URL and does not convert
     *                                         it to a URI.
     */
    public static AnalysisEngine getAnalysisEngine(String aeDescriptor, String umlsUser,
                                                   String umlsPass)
            throws IOException, InvalidXMLException, ResourceInitializationException,
            URISyntaxException {
        // Locate the descriptor; getResource() returns null when it is missing.
        java.net.URL aeDescriptorUrl =
                (aeDescriptor == null) ? null : CTAKESUtils.class.getResource(aeDescriptor);
        if (aeDescriptorUrl == null) {
            throw new IOException(
                    "Analysis Engine descriptor not found on classpath: " + aeDescriptor);
        }

        // Get the Resource Specifier from the XML. Reading through the URL (rather
        // than a file-system path) also works when the descriptor lives inside a JAR.
        ResourceSpecifier aeSpecifier;
        XMLInputSource aeInputSource = new XMLInputSource(aeDescriptorUrl);
        try {
            aeSpecifier = UIMAFramework.getXMLParser().parseResourceSpecifier(aeInputSource);
        } finally {
            aeInputSource.close();
        }

        // UMLS user ID and password: only published when both values are present.
        boolean hasUser = (umlsUser != null) && !umlsUser.isEmpty();
        boolean hasPass = (umlsPass != null) && !umlsPass.isEmpty();
        if (hasUser && hasPass) {
            /*
             * It is highly recommended that you change UMLS credentials in the
             * XML configuration file instead of giving user and password using
             * CTAKESConfig.
             */
            System.setProperty(CTAKES_UMLS_USER, umlsUser);
            System.setProperty(CTAKES_UMLS_PASS, umlsPass);
        }

        // Create the AE
        return UIMAFramework.produceAnalysisEngine(aeSpecifier);
    }

    /**
     * Creates a new JCas appropriate for the given Analysis Engine. A JCas is
     * a Java Cover Classes based Object-oriented CAS (Common Analysis System)
     * API. A new JCas is created on every call.
     *
     * <p>
     * Important: It is highly recommended that you reuse CAS objects rather
     * than creating new CAS objects prior to each analysis. This is because CAS
     * objects may be expensive to create and may consume a significant amount
     * of memory.
     * </p>
     *
     * @param ae AnalysisEngine used to create an appropriate JCas object.
     * @return a JCas object appropriate for the given AnalysisEngine.
     * @throws ResourceInitializationException if a CAS could not be created because this
     *                                         AnalysisEngine's CAS metadata (type system, type
     *                                         priorities, or FS indexes) are invalid.
     */
    public static JCas getJCas(AnalysisEngine ae) throws ResourceInitializationException {
        return ae.newJCas();
    }

    /**
     * Serializes a CAS in the given format.
     *
     * <p>
     * {@link CTAKESSerializer#XCAS} and {@link CTAKESSerializer#XMI} select the
     * matching UIMA serializer; any other value (including {@code null}) falls
     * back to {@link XmlCasSerializer}. Note that {@code prettyPrint} is not
     * passed to the fallback serializer.
     * </p>
     *
     * @param jcas        CAS (Common Analysis System) to be serialized.
     * @param type        type of cTAKES (UIMA) serializer used to write CAS.
     * @param prettyPrint {@code true} to do pretty printing of output.
     * @param stream      {@link OutputStream} object used to print out information
     *                    extracted by using cTAKES.
     * @throws SAXException if there was a SAX exception.
     * @throws IOException  if any I/O error occurs.
     */
    public static void serialize(JCas jcas, CTAKESSerializer type, boolean prettyPrint,
                                 OutputStream stream) throws SAXException, IOException {
        if (type == CTAKESSerializer.XCAS) {
            XCASSerializer.serialize(jcas.getCas(), stream, prettyPrint);
        } else if (type == CTAKESSerializer.XMI) {
            XmiCasSerializer.serialize(jcas.getCas(), jcas.getTypeSystem(), stream, prettyPrint,
                    new XmiSerializationSharedData());
        } else {
            XmlCasSerializer.serialize(jcas.getCas(), jcas.getTypeSystem(), stream);
        }
    }

    /**
     * Returns the annotation value based on the given annotation type.
     *
     * @param annotation {@link IdentifiedAnnotation} object.
     * @param property   {@link CTAKESAnnotationProperty} enum used to identify the
     *                   annotation type.
     * @return the annotation value rendered as a string, or {@code null} if
     *         {@code property} is {@code null} or not handled by this method.
     */
    public static String getAnnotationProperty(IdentifiedAnnotation annotation,
                                               CTAKESAnnotationProperty property) {
        if (property == null) {
            // switch on a null enum would throw; the contract is to return null.
            return null;
        }
        switch (property) {
            case BEGIN:
                return Integer.toString(annotation.getBegin());
            case END:
                return Integer.toString(annotation.getEnd());
            case CONDITIONAL:
                return Boolean.toString(annotation.getConditional());
            case CONFIDENCE:
                return Float.toString(annotation.getConfidence());
            case DISCOVERY_TECNIQUE:
                return Integer.toString(annotation.getDiscoveryTechnique());
            case GENERIC:
                return Boolean.toString(annotation.getGeneric());
            case HISTORY_OF:
                return Integer.toString(annotation.getHistoryOf());
            case ID:
                return Integer.toString(annotation.getId());
            case ONTOLOGY_CONCEPT_ARR:
                return formatOntologyConcepts(annotation.getOntologyConceptArr());
            case POLARITY:
                return "POLARITY=" + annotation.getPolarity();
            default:
                return null;
        }
    }

    /**
     * Renders the UMLS concepts of an ontology concept array as a
     * comma-separated list of {@code cui=<cui>,<codingScheme>=<code>} entries.
     * Entries that are not {@link UmlsConcept} instances are skipped.
     *
     * @param concepts ontology concept array, may be {@code null}.
     * @return the rendered list; empty if there is no UMLS concept.
     */
    private static String formatOntologyConcepts(FSArray concepts) {
        StringBuilder sb = new StringBuilder();
        if (concepts == null) {
            return sb.toString();
        }
        for (int i = 0; i < concepts.size(); i++) {
            Object entry = concepts.get(i);
            if (!(entry instanceof UmlsConcept)) {
                continue;
            }
            UmlsConcept concept = (UmlsConcept) entry;
            // Separate from the previous concept only once something has been
            // written, so skipped entries never leave a dangling comma.
            if (sb.length() > 0) {
                sb.append(",");
            }
            sb.append("cui=").append(concept.getCui()).append(",")
                    .append(concept.getCodingScheme()).append("=")
                    .append(concept.getCode());
        }
        return sb.toString();
    }

    /**
     * Resets the given cTAKES objects, if present: the Analysis Engine is
     * destroyed and the JCas is emptied of all content.
     *
     * <p>
     * This class keeps no reference to either object, so callers holding a
     * destroyed Analysis Engine are responsible for discarding their own
     * reference and obtaining a new engine before the next analysis.
     * </p>
     *
     * @param ae   UIMA Analysis Engine, may be {@code null}
     * @param jcas JCas object, may be {@code null}
     */
    public static void reset(AnalysisEngine ae, JCas jcas) {
        // Analysis Engine
        resetAE(ae);
        // JCas
        resetCAS(jcas);
    }

    /**
     * Resets the CAS (Common Analysis System), emptying it of all content.
     * Does nothing when {@code jcas} is {@code null}.
     *
     * @param jcas JCas object, may be {@code null}
     */
    public static void resetCAS(JCas jcas) {
        if (jcas != null) {
            jcas.reset();
        }
    }

    /**
     * Resets the AE (AnalysisEngine), releasing all resources held by the
     * current AE. Does nothing when {@code ae} is {@code null}.
     *
     * @param ae UIMA Analysis Engine, may be {@code null}
     */
    public static void resetAE(AnalysisEngine ae) {
        if (ae != null) {
            ae.destroy();
        }
    }
}