blob: 74a328cf5f61ce8b9be455782117de055e1b66d8 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.uima.tika;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.cas.Type;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.cas.FSArray;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
/*******************************************************************************
* SAX Handler which gets events from the Tika parser events and create UIMA
* annotations accordingly.
*
******************************************************************************/
public class MarkupHandler implements ContentHandler {
private StringBuffer textBuffer;
private List<ProtoAnnotation> protoAnnotations;
private LinkedList<ProtoAnnotation> startedAnnotations;
public MarkupHandler() {
textBuffer = new StringBuffer();
protoAnnotations = new LinkedList<ProtoAnnotation>();
startedAnnotations = new LinkedList<ProtoAnnotation>();
}
public void characters(char[] ch, int start, int length)
throws SAXException {
// MS doc spits out funny characters
// we replace them with ' '
for (int c = start;c<start+length;c++){
if (!Character.isISOControl(ch[c])) continue;
if (Character.isWhitespace(ch[c])) continue;
ch[c] = ' ';
}
// store the characters in the textBuffer
textBuffer.append(ch, start, length);
}
public void startDocument() throws SAXException {
}
public void endDocument() throws SAXException {
// there should be no annotation left at this stage
if (startedAnnotations.size() != 0) {
// TODO log + error message
}
}
public void startElement(String uri, String localName, String qName,
Attributes atts) throws SAXException {
int startOffset = textBuffer.length();
ProtoAnnotation proto = new ProtoAnnotation(uri,localName, qName,atts, startOffset);
this.startedAnnotations.addLast(proto);
}
public void endElement(String uri, String localName, String qName)
throws SAXException {
int endOffset = textBuffer.length();
// try to get the corresponding annotation
// we start from the last temporary
// and go up the stack
Iterator<ProtoAnnotation> iter = startedAnnotations.iterator();
ProtoAnnotation startedAnnot = null;
while (iter.hasNext()){
ProtoAnnotation temp = iter.next();
if (temp.getLocalName().equals(localName)){
startedAnnot = temp;
break;
}
}
// found something?
if (startedAnnot==null){
// TODO log etc...
return;
}
startedAnnot.setEnd(endOffset);
startedAnnotations.remove(startedAnnot);
protoAnnotations.add(startedAnnot);
// add a \n otherwise we get everything
// on a single line
textBuffer.append("\n");
}
public void ignorableWhitespace(char[] ch, int start, int length)
throws SAXException {
}
// the following methods are simply ignored
public void startPrefixMapping(String prefix, String uri)
throws SAXException {
}
public void endPrefixMapping(String prefix) throws SAXException {
}
public void setDocumentLocator(Locator locator) {
}
public void skippedEntity(String name) throws SAXException {
}
public void processingInstruction(String target, String data)
throws SAXException {
}
public void populateCAS(CAS cas){
// set the text
cas.setDocumentText(this.textBuffer.toString());
Type markupType = cas.getTypeSystem().getType("org.apache.uima.MarkupAnnotation");
Type attributeType = cas.getTypeSystem().getType("org.apache.uima.AttributeFS");
JCas jcas;
try {
jcas = cas.getJCas();
} catch (CASException e) {
throw new RuntimeException(e);
}
// now convert the proto annotations into real ones
for (ProtoAnnotation proto : protoAnnotations) {
MarkupAnnotation markup = new MarkupAnnotation(jcas);
markup.setBegin(proto.getStart());
markup.setEnd(proto.getEnd());
// generate attributes
Attributes protoAttributes = proto.getAtts();
FSArray attribs = (FSArray) cas.createArrayFS(protoAttributes.getLength());
for (int index=0; index< protoAttributes.getLength();index++){
org.apache.uima.tika.AttributeFS afs = (AttributeFS) cas.createFS(attributeType);
afs.setLocalName(protoAttributes.getLocalName(index));
afs.setQualifiedName(protoAttributes.getQName(index));
afs.setUri(protoAttributes.getURI(index));
afs.setValue(protoAttributes.getValue(index));
afs.addToIndexes();
attribs.set(index, afs);
}
markup.setAttributes(attribs);
markup.setUri(proto.getUri());
markup.setName(proto.getLocalName());
markup.setQualifiedName(proto.getQName());
markup.addToIndexes();
}
}
}