// blob: 995db0dc30b9e941b99724a763cb8e9c288be6f0 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.any23.extractor.html;
import org.jsoup.nodes.Attribute;
import org.jsoup.select.NodeTraversor;
import org.jsoup.select.NodeVisitor;
import org.w3c.dom.Comment;
import org.w3c.dom.Document;
import org.w3c.dom.Text;
import java.io.IOException;
import java.io.InputStream;
/**
 * The parsing configuration for a {@link TagSoupParser}.
 *
 * <p>A configuration turns an HTML input stream into a W3C DOM {@link Document}. The default
 * configuration parses the input with jsoup and then copies the resulting tree, node by node,
 * into a W3C document.
 *
 * @author Hans Brende
 */
abstract class TagSoupParsingConfiguration {

    /**
     * Returns the name of this configuration.
     *
     * @return the simple class name of the runtime type of this configuration
     */
    String name() {
        return getClass().getSimpleName();
    }

    /**
     * Parses the given input into a W3C DOM document.
     *
     * @param input the stream to read the markup from
     * @param documentIRI the IRI of the document being parsed
     * @param encoding the character encoding to hand to the underlying parser
     * @return the parsed document
     * @throws IOException if the underlying parser fails to read the input
     */
    abstract Document parse(InputStream input, String documentIRI, String encoding) throws IOException;

    /**
     * Returns the default parsing configuration, which is the shared jsoup-backed instance.
     *
     * @return the default configuration
     */
    static TagSoupParsingConfiguration getDefault() {
        return JsoupConfig.instance;
    }

    /**
     * Configuration that parses with jsoup and converts the jsoup tree into a W3C DOM tree.
     */
    private static class JsoupConfig extends TagSoupParsingConfiguration {

        private static final JsoupConfig instance = new JsoupConfig();

        private static final String CDATA_START = "<![CDATA[";
        private static final String CDATA_END = "]]>";

        @Override
        Document parse(InputStream input, String documentIRI, String encoding) throws IOException {
            org.jsoup.nodes.Document document = JsoupUtils.parse(input, documentIRI, encoding);
            return convert(document);
        }

        /**
         * Copies a jsoup document into a freshly created W3C HTML document.
         *
         * <p>Only the first child element of the jsoup document and its descendants are copied;
         * if the document has no child element the returned document is empty.
         */
        private static Document convert(org.jsoup.nodes.Document document) {
            Document w3cDoc = new org.apache.html.dom.HTMLDocumentImpl();
            org.jsoup.nodes.Element rootEl = document.children().first();
            if (rootEl != null) {
                NodeTraversor.traverse(new DocumentConverter(w3cDoc), rootEl);
            }
            return w3cDoc;
        }

        /**
         * Node visitor that mirrors every visited jsoup node into the target W3C document.
         *
         * <p>{@code dest} always points at the W3C element that newly created nodes are
         * appended to.
         */
        private static class DocumentConverter implements NodeVisitor {

            // valid xml attribute names are: ^[a-zA-Z_:][-a-zA-Z0-9_:.]
            // Compiled once instead of once per attribute.
            private static final java.util.regex.Pattern INVALID_ATTRIBUTE_CHARS =
                    java.util.regex.Pattern.compile("[^-a-zA-Z0-9_:.]");
            private static final java.util.regex.Pattern VALID_ATTRIBUTE_NAME =
                    java.util.regex.Pattern.compile("[a-zA-Z_:][-a-zA-Z0-9_:.]*");

            private final Document doc;
            private org.w3c.dom.Element dest;

            DocumentConverter(Document doc) {
                this.doc = doc;
            }

            /**
             * Creates the W3C counterpart of {@code source} and appends it to the current
             * destination element. Elements additionally become the new destination.
             */
            @Override
            public void head(org.jsoup.nodes.Node source, int depth) {
                if (source instanceof org.jsoup.nodes.Element) {
                    org.jsoup.nodes.Element sourceEl = (org.jsoup.nodes.Element) source;
                    org.w3c.dom.Element el = doc.createElement(sourceEl.tagName());
                    copyAttributes(sourceEl, el);
                    if (dest == null) {
                        // First element visited: it becomes the document element.
                        doc.appendChild(el);
                    } else {
                        dest.appendChild(el);
                    }
                    dest = el;
                } else if (source instanceof org.jsoup.nodes.TextNode) {
                    // NOTE: dest is expected to be non-null here because convert() starts the
                    // traversal at an element.
                    org.jsoup.nodes.TextNode sourceText = (org.jsoup.nodes.TextNode) source;
                    Text text = doc.createTextNode(sourceText.getWholeText());
                    dest.appendChild(text);
                } else if (source instanceof org.jsoup.nodes.Comment) {
                    org.jsoup.nodes.Comment sourceComment = (org.jsoup.nodes.Comment) source;
                    Comment comment = doc.createComment(sourceComment.getData());
                    dest.appendChild(comment);
                } else if (source instanceof org.jsoup.nodes.DataNode) {
                    // Data nodes are copied as plain text, without any wrapping CDATA markers.
                    org.jsoup.nodes.DataNode sourceData = (org.jsoup.nodes.DataNode) source;
                    Text node = doc.createTextNode(stripCDATA(sourceData.getWholeData()));
                    dest.appendChild(node);
                }
            }

            /**
             * Moves the destination back up to the parent element once an element has been
             * fully visited. The destination stays put when its parent is not an element
             * (i.e. when it is attached directly to the document).
             */
            @Override
            public void tail(org.jsoup.nodes.Node source, int depth) {
                if (source instanceof org.jsoup.nodes.Element
                        && dest.getParentNode() instanceof org.w3c.dom.Element) {
                    dest = (org.w3c.dom.Element) dest.getParentNode();
                }
            }

            /**
             * Copies the attributes of {@code source} onto {@code el}.
             *
             * <p>Characters that are not allowed in an XML attribute name are removed from each
             * key; attributes whose cleaned key is still not a valid name (for example because
             * it is empty or starts with a digit) are dropped.
             */
            private void copyAttributes(org.jsoup.nodes.Node source, org.w3c.dom.Element el) {
                for (Attribute attribute : source.attributes()) {
                    String key = INVALID_ATTRIBUTE_CHARS.matcher(attribute.getKey()).replaceAll("");
                    if (VALID_ATTRIBUTE_NAME.matcher(key).matches()) {
                        el.setAttribute(key, attribute.getValue());
                    }
                }
            }
        }

        /**
         * Removes a wrapping {@code <![CDATA[ ... ]]>} section from the given string, if present.
         *
         * @param string the text to unwrap
         * @return the CDATA content, or {@code string} unchanged if it is not wrapped
         */
        private static String stripCDATA(String string) {
            return reduceToContent(string, CDATA_START, CDATA_END);
        }

        /**
         * Returns the text between {@code startMarker} and {@code endMarker} when the string,
         * ignoring leading and trailing whitespace, begins with the start marker and ends with
         * the end marker. Otherwise the string is returned unchanged.
         *
         * <p>The content between the markers may be of any length, including empty, and is
         * returned verbatim (inner whitespace is preserved).
         *
         * @param string the text to unwrap
         * @param startMarker the marker expected as the first non-whitespace text
         * @param endMarker the marker expected as the last non-whitespace text
         * @return the enclosed content, or {@code string} itself if the markers do not wrap it
         */
        private static String reduceToContent(String string, String startMarker, String endMarker) {
            final int length = string.length();

            // The first non-whitespace text must be the start marker.
            int start = 0;
            while (start < length && Character.isWhitespace(string.charAt(start))) {
                start++;
            }
            if (!string.startsWith(startMarker, start)) {
                return string;
            }
            final int startContent = start + startMarker.length();

            // The last non-whitespace text must be the end marker. Never scan back into the
            // start marker so that the two markers cannot overlap.
            int end = length;
            while (end > startContent && Character.isWhitespace(string.charAt(end - 1))) {
                end--;
            }
            final int endContent = end - endMarker.length();
            if (endContent < startContent || !string.startsWith(endMarker, endContent)) {
                return string;
            }

            return string.substring(startContent, endContent);
        }
    }
}