blob: f5cbf16e17b62f53904b401afb3eba1965909221 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.uima.ruta.engine;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.ListIterator;
import java.util.Locale;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.Feature;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.cas.StringArray;
import org.htmlparser.Attribute;
import org.htmlparser.Node;
import org.htmlparser.Remark;
import org.htmlparser.Tag;
import org.htmlparser.visitors.NodeVisitor;
/**
 * Node visitor that creates one CAS annotation per visited HTML tag and per HTML remark.
 * <p>
 * The visitor only <em>creates</em> annotations; it never adds them to an index itself. Finished
 * annotations are collected in {@link #getAnnotations()}, while annotations for tags whose end
 * tag has not been seen yet are kept in {@link #getAnnotationStack()}.
 * <p>
 * When {@code onlyContent} is {@code true} the offsets are swapped: an annotation begins at the
 * end position of the opening tag and ends at the start position of the closing tag. For a
 * self-closed tag this yields a begin offset greater than the end offset.
 * NOTE(review): callers presumably filter such annotations before indexing them - confirm.
 */
public class HtmlVisitor extends NodeVisitor {

  /** Name of the string feature that stores the upper-cased tag name. */
  private static final String FEATURE_NAME = "name";

  private final CAS cas;

  /** Annotations whose extent is final. */
  private final List<AnnotationFS> annotations = new ArrayList<AnnotationFS>();

  /** Annotations of tags that were opened but not closed yet; the most recent one is last. */
  private final LinkedList<AnnotationFS> annotationStack = new LinkedList<AnnotationFS>();

  private final JCas jcas;

  private final boolean onlyContent;

  /**
   * @param jcas
   *          the JCas in which the annotations and their attribute arrays are created
   * @param onlyContent
   *          if {@code true}, offsets are taken from the inner edges of the tags (end of the
   *          opening tag, start of the closing tag) instead of the outer edges
   */
  public HtmlVisitor(JCas jcas, boolean onlyContent) {
    super();
    this.jcas = jcas;
    this.cas = jcas.getCas();
    this.onlyContent = onlyContent;
  }

  /**
   * Creates an annotation for an opening (or self-closed) tag. If the tag is neither self-closed
   * nor linked to an end tag, the annotation is parked on the annotation stack until a matching
   * end tag is visited; otherwise it is added to the finished annotations right away.
   *
   * @param tag
   *          the visited tag
   */
  public void visitTag(Tag tag) {
    String name = getName(tag);
    Type type = getType(name, cas);
    int begin = getBeginOffset(tag);
    int end;
    boolean tagStillOpen = false;
    if (isTagClosed(tag)) {
      end = getEndOffset(tag);
    } else {
      Tag endTag = tag.getEndTag();
      if (endTag != null) {
        end = getEndOffset(endTag);
      } else {
        // No end tag known yet: use a provisional end, corrected later in visitEndTag.
        end = getEndOffset(tag);
        tagStillOpen = true;
      }
    }
    AnnotationFS annotation = cas.createAnnotation(type, begin, end);
    processAttributes(annotation, tag);
    Feature nameFeature = annotation.getType().getFeatureByBaseName(FEATURE_NAME);
    annotation.setStringValue(nameFeature, name);
    if (tagStillOpen) {
      annotationStack.add(annotation);
    } else {
      annotations.add(annotation);
    }
  }

  /**
   * Creates a "REMARK" annotation for an HTML remark and stores the remark text in its
   * {@code comment} feature.
   *
   * @param node
   *          the visited remark node
   */
  public void visitRemarkNode(Remark node) {
    Type type = getType("REMARK", cas);
    int begin = getBeginOffset(node);
    int end = getEndOffset(node);
    AnnotationFS annotation = cas.createAnnotation(type, begin, end);
    Feature feature = type.getFeatureByBaseName("comment");
    annotation.setStringValue(feature, node.getText());
    annotations.add(annotation);
  }

  /**
   * Closes the most recently opened, still pending annotation that has the same name as the
   * given end tag: its {@code end} feature is set to the end offset of the end tag and it is
   * moved from the annotation stack to the finished annotations. End tags without a pending
   * counterpart are ignored.
   *
   * @param tag
   *          the visited end tag
   */
  public void visitEndTag(Tag tag) {
    String name = getName(tag);
    // Walk the stack from the newest entry backwards; a list iterator keeps each step O(1) on
    // the linked list and allows removing the match in place.
    ListIterator<AnnotationFS> openTags = annotationStack.listIterator(annotationStack.size());
    while (openTags.hasPrevious()) {
      AnnotationFS candidate = openTags.previous();
      Feature nameFeature = candidate.getType().getFeatureByBaseName(FEATURE_NAME);
      if (name.equals(candidate.getStringValue(nameFeature))) {
        Feature endFeature = candidate.getType().getFeatureByBaseName("end");
        candidate.setIntValue(endFeature, getEndOffset(tag));
        openTags.remove();
        annotations.add(candidate);
        return;
      }
    }
  }

  /**
   * @return {@code true} if the raw tag name ends with a slash, i.e. the tag is written in
   *         self-closed form; {@code false} otherwise, including when no raw name is available
   */
  private boolean isTagClosed(Tag tag) {
    String rawName = tag.getRawTagName();
    return rawName != null && rawName.endsWith("/");
  }

  /**
   * Returns the upper-cased tag name without a leading or trailing slash. A missing raw name is
   * mapped to the empty string.
   */
  private String getName(Tag tag) {
    String rawName = tag.getRawTagName();
    if (rawName == null) {
      return "";
    }
    // Fixed locale: with the default locale, e.g. Turkish, "title" would become "T\u0130TLE"
    // and the type lookup by name would no longer find the tag-specific type.
    String result = rawName.toUpperCase(Locale.ENGLISH);
    if (result.endsWith("/")) {
      result = result.substring(0, result.length() - 1);
    }
    if (result.startsWith("/")) {
      result = result.substring(1);
    }
    return result;
  }

  /**
   * Looks up the type {@code HtmlAnnotator.NAMESPACE + name} and falls back to the generic
   * {@code HtmlAnnotator.NAMESPACE + "TAG"} type if no such type exists in the type system.
   */
  private Type getType(String name, CAS cas) {
    Type type = cas.getTypeSystem().getType(HtmlAnnotator.NAMESPACE + name);
    if (type == null) {
      type = cas.getTypeSystem().getType(HtmlAnnotator.NAMESPACE + "TAG");
    }
    return type;
  }

  /**
   * @return the node's end position if only the content should be covered, otherwise its start
   *         position
   */
  private int getBeginOffset(Node tag) {
    return onlyContent ? tag.getEndPosition() : tag.getStartPosition();
  }

  /**
   * @return the node's start position if only the content should be covered, otherwise its end
   *         position
   */
  private int getEndOffset(Node tag) {
    return onlyContent ? tag.getStartPosition() : tag.getEndPosition();
  }

  /**
   * Copies the names and values of the tag's attributes into two parallel string arrays and
   * stores them in the {@code attributeName} and {@code attributeValue} features of the
   * annotation. The first element of the attribute list is skipped (presumably it holds the tag
   * name itself - see the htmlparser documentation of {@code getAttributesEx()}).
   */
  private void processAttributes(AnnotationFS annotation, Tag tag) {
    List<?> attributes = tag.getAttributesEx();
    // Clamp to zero: an empty (or missing) attribute list must not lead to a negative array size.
    int size = attributes == null ? 0 : Math.max(0, attributes.size() - 1);
    StringArray attributeName = new StringArray(jcas, size);
    StringArray attributeValue = new StringArray(jcas, size);
    for (int i = 0; i < size; i++) {
      Attribute attribute = (Attribute) attributes.get(i + 1);
      attributeName.set(i, attribute.getName());
      attributeValue.set(i, attribute.getValue());
    }
    Feature nameArrayFeature = annotation.getType().getFeatureByBaseName("attributeName");
    annotation.setFeatureValue(nameArrayFeature, attributeName);
    Feature valueArrayFeature = annotation.getType().getFeatureByBaseName("attributeValue");
    annotation.setFeatureValue(valueArrayFeature, attributeValue);
  }

  /**
   * @return the internal (live, mutable) list of annotations whose tags were self-closed, had a
   *         known end tag, or were closed by a visited end tag, plus all remark annotations
   */
  public List<AnnotationFS> getAnnotations() {
    return annotations;
  }

  /**
   * @return the internal (live, mutable) list of annotations for tags that have not been closed
   *         so far, oldest first
   */
  public LinkedList<AnnotationFS> getAnnotationStack() {
    return annotationStack;
  }
}