core/src/main/java/org/apache/any23/extractor/html/TagSoupParsingConfiguration.java - any23 - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *  http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.any23.extractor.html;

 import org.jsoup.nodes.Attribute;
 import org.jsoup.select.NodeTraversor;
 import org.jsoup.select.NodeVisitor;
 import org.w3c.dom.Comment;
 import org.w3c.dom.Document;
 import org.w3c.dom.Text;

 import java.io.IOException;
 import java.io.InputStream;

 /**
  * The parsing configuration for a {@link TagSoupParser}.
  * <p>
  * A configuration turns a raw input stream into a W3C DOM {@link Document}.
  * The default configuration ({@link #getDefault()}) parses with jsoup and
  * then copies the resulting jsoup tree into a W3C DOM document.
  *
  * @author Hans Brende
  */
 abstract class TagSoupParsingConfiguration {

     /**
      * Returns the name of this configuration.
      *
      * @return the simple class name of the concrete configuration class
      */
     String name() {
         return getClass().getSimpleName();
     }

     /**
      * Parses the given input into a W3C DOM document.
      *
      * @param input       the stream to read the document from
      * @param documentIRI the IRI of the document being parsed
      * @param encoding    the character encoding to use for the stream
      * @return the parsed document
      * @throws IOException if reading or parsing the input fails
      */
     abstract Document parse(InputStream input, String documentIRI, String encoding) throws IOException;

     /**
      * Returns the default parsing configuration.
      *
      * @return the shared jsoup-based configuration instance
      */
     static TagSoupParsingConfiguration getDefault() {
         return JsoupConfig.instance;
     }

     /**
      * Configuration that parses with jsoup (via {@code JsoupUtils.parse}) and
      * converts the jsoup document into a W3C DOM document.
      */
     private static class JsoupConfig extends TagSoupParsingConfiguration {

         private static final JsoupConfig instance = new JsoupConfig();

         @Override
         Document parse(InputStream input, String documentIRI, String encoding) throws IOException {

             org.jsoup.nodes.Document document = JsoupUtils.parse(input, documentIRI, encoding);

             return convert(document);
         }

         /**
          * Copies the first child element of the jsoup document (and everything
          * below it) into a freshly created W3C DOM document.
          *
          * @param document the jsoup document to convert
          * @return the new W3C document; it has no children if the jsoup
          *         document has no child element
          */
         private static Document convert(org.jsoup.nodes.Document document) {
             Document w3cDoc = new org.apache.html.dom.HTMLDocumentImpl();

             org.jsoup.nodes.Element rootEl = document.children().first();
             if (rootEl != null) {
                 NodeTraversor.traverse(new DocumentConverter(w3cDoc), rootEl);
             }

             return w3cDoc;
         }

         /**
          * Node visitor that mirrors each visited jsoup node into the target
          * W3C document. {@code dest} tracks the W3C element that currently
          * receives children: {@link #head} descends into each new element and
          * {@link #tail} climbs back to its parent.
          */
         private static class DocumentConverter implements NodeVisitor {

             // Compiled once instead of on every attribute (String.replaceAll/matches recompile each call).
             private static final java.util.regex.Pattern INVALID_NAME_CHARS =
                     java.util.regex.Pattern.compile("[^-a-zA-Z0-9_:.]");
             private static final java.util.regex.Pattern VALID_NAME =
                     java.util.regex.Pattern.compile("[a-zA-Z_:][-a-zA-Z0-9_:.]*");

             private final Document doc;
             private org.w3c.dom.Element dest;

             DocumentConverter(Document doc) {
                 this.doc = doc;
             }

             /**
              * Creates the W3C counterpart of {@code source} and appends it to
              * the current destination element. Elements become the new
              * destination; text, comment and data nodes are appended as leaves.
              * Data nodes are copied as text with a surrounding CDATA wrapper
              * removed. Other node types are ignored.
              */
             @Override
             public void head(org.jsoup.nodes.Node source, int depth) {
                 if (source instanceof org.jsoup.nodes.Element) {
                     org.jsoup.nodes.Element sourceEl = (org.jsoup.nodes.Element) source;

                     org.w3c.dom.Element el = doc.createElement(sourceEl.tagName());
                     copyAttributes(sourceEl, el);
                     if (dest == null) {
                         // first element visited: attach it directly to the document
                         doc.appendChild(el);
                     } else {
                         dest.appendChild(el);
                     }
                     // descend: following nodes are appended under this element until tail() climbs out
                     dest = el;
                 } else if (source instanceof org.jsoup.nodes.TextNode) {
                     org.jsoup.nodes.TextNode sourceText = (org.jsoup.nodes.TextNode) source;
                     Text text = doc.createTextNode(sourceText.getWholeText());
                     dest.appendChild(text);
                 } else if (source instanceof org.jsoup.nodes.Comment) {
                     org.jsoup.nodes.Comment sourceComment = (org.jsoup.nodes.Comment) source;
                     Comment comment = doc.createComment(sourceComment.getData());
                     dest.appendChild(comment);
                 } else if (source instanceof org.jsoup.nodes.DataNode) {
                     org.jsoup.nodes.DataNode sourceData = (org.jsoup.nodes.DataNode) source;
                     Text node = doc.createTextNode(stripCDATA(sourceData.getWholeData()));
                     dest.appendChild(node);
                 }
             }

             /**
              * Climbs back to the parent element once an element has been
              * fully visited. The destination is left unchanged when its parent
              * is not an element (i.e. at the top of the tree).
              */
             @Override
             public void tail(org.jsoup.nodes.Node source, int depth) {
                 if (source instanceof org.jsoup.nodes.Element && dest.getParentNode() instanceof org.w3c.dom.Element) {
                     dest = (org.w3c.dom.Element) dest.getParentNode();
                 }
             }

             /**
              * Copies the attributes of {@code source} onto {@code el}.
              * Characters that are not allowed in the attribute name are
              * removed first; attributes whose cleaned name is still not a
              * valid name (e.g. empty, or starting with a digit) are skipped.
              */
             private void copyAttributes(org.jsoup.nodes.Node source, org.w3c.dom.Element el) {
                 for (Attribute attribute : source.attributes()) {
                     // valid xml attribute names are: ^[a-zA-Z_:][-a-zA-Z0-9_:.]
                     String key = INVALID_NAME_CHARS.matcher(attribute.getKey()).replaceAll("");
                     if (VALID_NAME.matcher(key).matches()) {
                         el.setAttribute(key, attribute.getValue());
                     }
                 }
             }
         }

         /**
          * Removes a surrounding {@code <![CDATA[ ... ]]>} wrapper, if present.
          *
          * @param string the raw data
          * @return the wrapped content, or {@code string} itself if it is not wrapped
          */
         private static String stripCDATA(String string) {
             return reduceToContent(string, "<![CDATA[", "]]>");
         }

         /**
          * Returns the text between {@code startMarker} and {@code endMarker}
          * when {@code string} consists of exactly: optional whitespace, the
          * start marker, content, the end marker, optional whitespace.
          * Otherwise returns {@code string} unchanged. Whitespace inside the
          * markers is preserved.
          *
          * @param string      the text to inspect
          * @param startMarker the marker the text must begin with (after whitespace)
          * @param endMarker   the marker the text must end with (before whitespace)
          * @return the enclosed content (possibly empty), or the original string
          */
         private static String reduceToContent(String string, String startMarker, String endMarker) {
             int length = string.length();

             // skip leading whitespace; the first significant text must be the start marker
             int start = 0;
             while (start < length && Character.isWhitespace(string.charAt(start))) {
                 start++;
             }
             if (!string.startsWith(startMarker, start)) {
                 return string;
             }
             int contentStart = start + startMarker.length();

             // skip trailing whitespace, never scanning back past the start marker
             int end = length;
             while (end > contentStart && Character.isWhitespace(string.charAt(end - 1))) {
                 end--;
             }

             // the end marker must sit entirely after the start marker (empty content is allowed)
             int contentEnd = end - endMarker.length();
             if (contentEnd < contentStart || !string.startsWith(endMarker, contentEnd)) {
                 return string;
             }
             return string.substring(contentStart, contentEnd);
         }

     }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.any23.extractor.html;

	import org.jsoup.nodes.Attribute;
	import org.jsoup.select.NodeTraversor;
	import org.jsoup.select.NodeVisitor;
	import org.w3c.dom.Comment;
	import org.w3c.dom.Document;
	import org.w3c.dom.Text;

	import java.io.IOException;
	import java.io.InputStream;

	/**
	 * The parsing configuration for a {@link TagSoupParser}.
	 * <p>
	 * A configuration turns a raw input stream into a W3C DOM {@link Document}.
	 * The default configuration ({@link #getDefault()}) parses with jsoup and
	 * then copies the resulting jsoup tree into a W3C DOM document.
	 *
	 * @author Hans Brende
	 */
	abstract class TagSoupParsingConfiguration {

	    /**
	     * Returns the name of this configuration.
	     *
	     * @return the simple class name of the concrete configuration class
	     */
	    String name() {
	        return getClass().getSimpleName();
	    }

	    /**
	     * Parses the given input into a W3C DOM document.
	     *
	     * @param input       the stream to read the document from
	     * @param documentIRI the IRI of the document being parsed
	     * @param encoding    the character encoding to use for the stream
	     * @return the parsed document
	     * @throws IOException if reading or parsing the input fails
	     */
	    abstract Document parse(InputStream input, String documentIRI, String encoding) throws IOException;

	    /**
	     * Returns the default parsing configuration.
	     *
	     * @return the shared jsoup-based configuration instance
	     */
	    static TagSoupParsingConfiguration getDefault() {
	        return JsoupConfig.instance;
	    }

	    /**
	     * Configuration that parses with jsoup (via {@code JsoupUtils.parse}) and
	     * converts the jsoup document into a W3C DOM document.
	     */
	    private static class JsoupConfig extends TagSoupParsingConfiguration {

	        private static final JsoupConfig instance = new JsoupConfig();

	        @Override
	        Document parse(InputStream input, String documentIRI, String encoding) throws IOException {

	            org.jsoup.nodes.Document document = JsoupUtils.parse(input, documentIRI, encoding);

	            return convert(document);
	        }

	        /**
	         * Copies the first child element of the jsoup document (and everything
	         * below it) into a freshly created W3C DOM document.
	         *
	         * @param document the jsoup document to convert
	         * @return the new W3C document; it has no children if the jsoup
	         *         document has no child element
	         */
	        private static Document convert(org.jsoup.nodes.Document document) {
	            Document w3cDoc = new org.apache.html.dom.HTMLDocumentImpl();

	            org.jsoup.nodes.Element rootEl = document.children().first();
	            if (rootEl != null) {
	                NodeTraversor.traverse(new DocumentConverter(w3cDoc), rootEl);
	            }

	            return w3cDoc;
	        }

	        /**
	         * Node visitor that mirrors each visited jsoup node into the target
	         * W3C document. {@code dest} tracks the W3C element that currently
	         * receives children: {@link #head} descends into each new element and
	         * {@link #tail} climbs back to its parent.
	         */
	        private static class DocumentConverter implements NodeVisitor {

	            // Compiled once instead of on every attribute (String.replaceAll/matches recompile each call).
	            private static final java.util.regex.Pattern INVALID_NAME_CHARS =
	                    java.util.regex.Pattern.compile("[^-a-zA-Z0-9_:.]");
	            private static final java.util.regex.Pattern VALID_NAME =
	                    java.util.regex.Pattern.compile("[a-zA-Z_:][-a-zA-Z0-9_:.]*");

	            private final Document doc;
	            private org.w3c.dom.Element dest;

	            DocumentConverter(Document doc) {
	                this.doc = doc;
	            }

	            /**
	             * Creates the W3C counterpart of {@code source} and appends it to
	             * the current destination element. Elements become the new
	             * destination; text, comment and data nodes are appended as leaves.
	             * Data nodes are copied as text with a surrounding CDATA wrapper
	             * removed. Other node types are ignored.
	             */
	            @Override
	            public void head(org.jsoup.nodes.Node source, int depth) {
	                if (source instanceof org.jsoup.nodes.Element) {
	                    org.jsoup.nodes.Element sourceEl = (org.jsoup.nodes.Element) source;

	                    org.w3c.dom.Element el = doc.createElement(sourceEl.tagName());
	                    copyAttributes(sourceEl, el);
	                    if (dest == null) {
	                        // first element visited: attach it directly to the document
	                        doc.appendChild(el);
	                    } else {
	                        dest.appendChild(el);
	                    }
	                    // descend: following nodes are appended under this element until tail() climbs out
	                    dest = el;
	                } else if (source instanceof org.jsoup.nodes.TextNode) {
	                    org.jsoup.nodes.TextNode sourceText = (org.jsoup.nodes.TextNode) source;
	                    Text text = doc.createTextNode(sourceText.getWholeText());
	                    dest.appendChild(text);
	                } else if (source instanceof org.jsoup.nodes.Comment) {
	                    org.jsoup.nodes.Comment sourceComment = (org.jsoup.nodes.Comment) source;
	                    Comment comment = doc.createComment(sourceComment.getData());
	                    dest.appendChild(comment);
	                } else if (source instanceof org.jsoup.nodes.DataNode) {
	                    org.jsoup.nodes.DataNode sourceData = (org.jsoup.nodes.DataNode) source;
	                    Text node = doc.createTextNode(stripCDATA(sourceData.getWholeData()));
	                    dest.appendChild(node);
	                }
	            }

	            /**
	             * Climbs back to the parent element once an element has been
	             * fully visited. The destination is left unchanged when its parent
	             * is not an element (i.e. at the top of the tree).
	             */
	            @Override
	            public void tail(org.jsoup.nodes.Node source, int depth) {
	                if (source instanceof org.jsoup.nodes.Element && dest.getParentNode() instanceof org.w3c.dom.Element) {
	                    dest = (org.w3c.dom.Element) dest.getParentNode();
	                }
	            }

	            /**
	             * Copies the attributes of {@code source} onto {@code el}.
	             * Characters that are not allowed in the attribute name are
	             * removed first; attributes whose cleaned name is still not a
	             * valid name (e.g. empty, or starting with a digit) are skipped.
	             */
	            private void copyAttributes(org.jsoup.nodes.Node source, org.w3c.dom.Element el) {
	                for (Attribute attribute : source.attributes()) {
	                    // valid xml attribute names are: ^[a-zA-Z_:][-a-zA-Z0-9_:.]
	                    String key = INVALID_NAME_CHARS.matcher(attribute.getKey()).replaceAll("");
	                    if (VALID_NAME.matcher(key).matches()) {
	                        el.setAttribute(key, attribute.getValue());
	                    }
	                }
	            }
	        }

	        /**
	         * Removes a surrounding {@code <![CDATA[ ... ]]>} wrapper, if present.
	         *
	         * @param string the raw data
	         * @return the wrapped content, or {@code string} itself if it is not wrapped
	         */
	        private static String stripCDATA(String string) {
	            return reduceToContent(string, "<![CDATA[", "]]>");
	        }

	        /**
	         * Returns the text between {@code startMarker} and {@code endMarker}
	         * when {@code string} consists of exactly: optional whitespace, the
	         * start marker, content, the end marker, optional whitespace.
	         * Otherwise returns {@code string} unchanged. Whitespace inside the
	         * markers is preserved.
	         *
	         * @param string      the text to inspect
	         * @param startMarker the marker the text must begin with (after whitespace)
	         * @param endMarker   the marker the text must end with (before whitespace)
	         * @return the enclosed content (possibly empty), or the original string
	         */
	        private static String reduceToContent(String string, String startMarker, String endMarker) {
	            int length = string.length();

	            // skip leading whitespace; the first significant text must be the start marker
	            int start = 0;
	            while (start < length && Character.isWhitespace(string.charAt(start))) {
	                start++;
	            }
	            if (!string.startsWith(startMarker, start)) {
	                return string;
	            }
	            int contentStart = start + startMarker.length();

	            // skip trailing whitespace, never scanning back past the start marker
	            int end = length;
	            while (end > contentStart && Character.isWhitespace(string.charAt(end - 1))) {
	                end--;
	            }

	            // the end marker must sit entirely after the start marker (empty content is allowed)
	            int contentEnd = end - endMarker.length();
	            if (contentEnd < contentStart || !string.startsWith(endMarker, contentEnd)) {
	                return string;
	            }
	            return string.substring(contentStart, contentEnd);
	        }

	    }

	}