blob: 73f65f88af934ac48d6db186a61f4dce407b0f1e [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.uima.examples;
import java.io.InputStream;
import java.util.Iterator;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_component.CasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.FeatureStructure;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.TypeSystem;
import org.apache.uima.resource.ResourceInitializationException;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
/**
* A multi-sofa annotator that does XML detagging. Reads XML data from the input Sofa (named
* "xmlDocument"); this data can be stored in the CAS as a string or array, or it can be a URI to a
* remote file. The XML is parsed using the JVM's default parser, and the plain-text content is
* written to a new sofa called "plainTextDocument".
*/
public class XmlDetagger extends CasAnnotator_ImplBase {
/**
* Name of optional configuration parameter that contains the name of an XML tag that appears in
* the input file. Only text that falls within this XML tag will be considered part of the
* "document" that it is added to the CAS by this CAS Initializer. If not specified, the entire
* file will be considered the document.
*/
public static final String PARAM_XMLTAG = "XmlTagContainingText";
private SAXParserFactory parserFactory = SAXParserFactory.newInstance();
private Type sourceDocInfoType;
private String mXmlTagContainingText = null;
public void initialize(UimaContext aContext) throws ResourceInitializationException {
super.initialize(aContext);
// Get config param setting
mXmlTagContainingText = (String) getContext().getConfigParameterValue(PARAM_XMLTAG);
}
public void typeSystemInit(TypeSystem aTypeSystem) throws AnalysisEngineProcessException {
sourceDocInfoType = aTypeSystem.getType("org.apache.uima.examples.SourceDocumentInformation");
}
public void process(CAS aCAS) throws AnalysisEngineProcessException {
// get handle to CAS view containing XML document
CAS xmlCas = aCAS.getView("xmlDocument");
InputStream xmlStream = xmlCas.getSofa().getSofaDataStream();
// parse with detag handler
DetagHandler handler = new DetagHandler();
try {
SAXParser parser = parserFactory.newSAXParser();
parser.parse(xmlStream, handler);
} catch (Exception e) {
throw new AnalysisEngineProcessException(e);
}
// create the plain text view and set its document text
CAS plainTextView = aCAS.createView("plainTextDocument");
plainTextView.setDocumentText(handler.getDetaggedText());
// Index the SourceDocumentInformation object, if there is one, in the new sofa.
// This is needed by the SemanticSearchCasIndexer
Iterator iter = xmlCas.getAnnotationIndex(sourceDocInfoType).iterator();
if (iter.hasNext()) {
FeatureStructure sourceDocInfoFs = (FeatureStructure) iter.next();
plainTextView.getIndexRepository().addFS(sourceDocInfoFs);
}
}
class DetagHandler extends DefaultHandler {
private StringBuffer detaggedText = new StringBuffer();
private boolean insideTextTag;
public DetagHandler() {
insideTextTag = (mXmlTagContainingText == null);
}
public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
if (qName.equalsIgnoreCase(mXmlTagContainingText)) {
insideTextTag = true;
}
}
public void endElement(String uri, String localName, String qName) throws SAXException {
if (qName.equalsIgnoreCase(mXmlTagContainingText)) {
insideTextTag = false;
}
}
public void characters(char[] ch, int start, int length) throws SAXException {
if (insideTextTag) {
detaggedText.append(ch, start, length);
}
}
public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
if (insideTextTag) {
detaggedText.append(ch, start, length);
}
}
String getDetaggedText() {
return detaggedText.toString();
}
}
}