blob: cbc9830cc5d7de0621cffe011b894950f3c32442 [file] [log] [blame]
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.camel.dataformat.tagsoup;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.StringWriter;
import java.io.Writer;
import java.util.Map;
import java.util.Map.Entry;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMResult;
import javax.xml.transform.sax.SAXSource;
import org.w3c.dom.Node;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.XMLReader;
import org.apache.camel.CamelException;
import org.apache.camel.Exchange;
import org.apache.camel.spi.DataFormat;
import org.apache.camel.util.ObjectHelper;
import org.ccil.cowan.tagsoup.HTMLSchema;
import org.ccil.cowan.tagsoup.Parser;
import org.ccil.cowan.tagsoup.Schema;
import org.ccil.cowan.tagsoup.XMLWriter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Data format for "tidy markup": possibly malformed HTML, e.g. from a web site
 * or a file, is parsed with TagSoup and returned as well formed markup (which
 * may or may not be XHTML) so that it can be sent to XSLT or queried with
 * XPath.
 * <p>
 * Only unmarshalling is supported. The result is either a {@link String} or a
 * {@link org.w3c.dom.Node}, selected through
 * {@link #setDataObjectType(Class)}.
 * </p>
 */
public class TidyMarkupDataFormat implements DataFormat {

    private static final Logger LOG = LoggerFactory.getLogger(TidyMarkupDataFormat.class);

    private static final String NO = "no";
    private static final String YES = "yes";
    private static final String XML = "xml";

    /**
     * When returning a String, do we omit the XML declaration?
     */
    private boolean omitXmlDeclaration;

    /**
     * String or Node to return
     */
    private Class<?> dataObjectType;

    /**
     * The output method handed to the XMLWriter ("xml" is used when unset)
     */
    private String method;

    /**
     * The Schema which we are parsing (default HTMLSchema)
     */
    private Schema parsingSchema;

    /**
     * User supplied Parser features
     * <p>
     * See <a href="http://home.ccil.org/~cowan/XML/tagsoup/#properties">TagSoup</a> and
     * <a href="http://www.saxproject.org/apidoc/org/xml/sax/package-summary.html">SAX</a>.
     * </p>
     */
    private Map<String, Boolean> parserFeatures;

    /**
     * User supplied Parser properties. The misspelled name is kept because it
     * is part of the public accessor names.
     * <p>
     * See <a href="http://home.ccil.org/~cowan/XML/tagsoup/#properties">TagSoup</a> and
     * <a href="http://www.saxproject.org/apidoc/org/xml/sax/package-summary.html">SAX</a>.
     * </p>
     */
    private Map<String, Object> parserPropeties;

    /**
     * Unsupported operation. We cannot create ugly HTML.
     *
     * @throws CamelException always
     */
    public void marshal(Exchange exchange, Object object, OutputStream outputStream) throws Exception {
        throw new CamelException("Marshalling from Well Formed HTML to ugly HTML is not supported."
                + " Only unmarshal is supported");
    }

    /**
     * Unmarshal the data into the configured {@link #getDataObjectType() data object type}.
     *
     * @param exchange the current exchange (not used by this implementation)
     * @param inputStream the HTML to tidy
     * @return a {@link Node} or a {@link String}, depending on the configured type
     * @throws IllegalArgumentException if the configured type can hold neither a Node nor a String
     * @throws Exception if the conversion fails
     */
    public Object unmarshal(Exchange exchange, InputStream inputStream) throws Exception {
        ObjectHelper.notNull(dataObjectType, "dataObjectType", this);

        // The requested type must be able to hold what we produce, hence
        // requested.isAssignableFrom(produced). Node is tested first.
        if (dataObjectType.isAssignableFrom(Node.class)) {
            return asNodeTidyMarkup(inputStream);
        } else if (dataObjectType.isAssignableFrom(String.class)) {
            return asStringTidyMarkup(inputStream);
        } else {
            throw new IllegalArgumentException("The return type [" + dataObjectType.getCanonicalName()
                    + "] is unsupported");
        }
    }

    /**
     * Return the tidy markup as a string. The input stream is closed before
     * this method returns, whether or not the conversion succeeded.
     *
     * @param inputStream the HTML to tidy
     * @return String of XML
     * @throws CamelException if parsing fails
     */
    public String asStringTidyMarkup(InputStream inputStream) throws CamelException {
        XMLReader parser = createTagSoupParser();
        StringWriter w = new StringWriter();
        parser.setContentHandler(createContentHandler(w));

        try {
            parser.parse(new InputSource(inputStream));
            return w.toString();
        } catch (Exception e) {
            throw new CamelException("Failed to convert the HTML to tidy Markup", e);
        } finally {
            closeQuietly(inputStream);
        }
    }

    /**
     * Return the HTML Markup as an {@link org.w3c.dom.Node}. The input stream
     * is closed before this method returns, whether or not the conversion
     * succeeded.
     *
     * @param inputStream the input stream to convert
     * @return the HTML Markup as a DOM Node
     * @throws CamelException if parsing or the transformation fails
     */
    public Node asNodeTidyMarkup(InputStream inputStream) throws CamelException {
        // No XMLWriter content handler is installed here: the transformer
        // registers its own handler on the reader to build the DOM result.
        XMLReader parser = createTagSoupParser();

        try {
            Transformer transformer = TransformerFactory.newInstance().newTransformer();
            DOMResult result = new DOMResult();
            transformer.transform(new SAXSource(parser, new InputSource(inputStream)), result);
            return result.getNode();
        } catch (Exception e) {
            throw new CamelException("Failed to convert the HTML to tidy Markup", e);
        } finally {
            closeQuietly(inputStream);
        }
    }

    /**
     * Create and configure the TagSoup parser.
     * <p>
     * Namespace handling is switched off first, then the user supplied
     * features and properties are applied, and finally the parsing schema is
     * set. Because the schema is applied last it takes precedence over a
     * schema supplied through the parser properties.
     * </p>
     *
     * @return the configured reader
     * @throws IllegalArgumentException if a feature or property is rejected by the parser
     * @throws CamelException declared for overriding implementations; not thrown here
     */
    protected XMLReader createTagSoupParser() throws CamelException {
        XMLReader reader = new Parser();
        try {
            reader.setFeature(Parser.namespacesFeature, false);
            reader.setFeature(Parser.namespacePrefixesFeature, false);

            /*
             * set each parser feature that the user may have supplied.
             * http://www.saxproject.org/apidoc/org/xml/sax/package-summary.html
             * http://home.ccil.org/~cowan/XML/tagsoup/#properties
             */
            Map<String, Boolean> features = getParserFeatures();
            if (features != null) {
                for (Entry<String, Boolean> e : features.entrySet()) {
                    reader.setFeature(e.getKey(), e.getValue());
                }
            }

            /*
             * set each parser property that the user may have supplied.
             * http://home.ccil.org/~cowan/XML/tagsoup/#properties
             */
            Map<String, Object> properties = getParserPropeties();
            if (properties != null) {
                for (Entry<String, Object> e : properties.entrySet()) {
                    reader.setProperty(e.getKey(), e.getValue());
                }
            }

            /*
             * getParsingSchema() never returns null; it defaults to HTMLSchema
             */
            reader.setProperty(Parser.schemaProperty, getParsingSchema());
        } catch (Exception e) {
            throw new IllegalArgumentException("Problem configuring the parser", e);
        }
        return reader;
    }

    /**
     * @param schema the parsing schema to set; <code>null</code> restores the
     *            default on the next call to {@link #getParsingSchema()}
     */
    public void setParsingSchema(Schema schema) {
        this.parsingSchema = schema;
    }

    /**
     * @return the parsing schema; a new {@link HTMLSchema} is created and
     *         remembered when none has been set
     */
    public Schema getParsingSchema() {
        if (parsingSchema == null) {
            this.parsingSchema = new HTMLSchema();
        }
        return parsingSchema;
    }

    /**
     * Create the content handler that serializes the parser events to the
     * given writer.
     *
     * @param w the writer receiving the markup
     * @return an {@link XMLWriter} configured with the output method and the
     *         XML declaration setting of this data format
     */
    protected ContentHandler createContentHandler(Writer w) {
        XMLWriter xmlWriter = new XMLWriter(w);

        // we might need to expose more than these two but that is pretty good
        // for a default well formed Html generator
        String outputMethod = getMethod() != null ? getMethod() : XML;
        xmlWriter.setOutputProperty(XMLWriter.METHOD, outputMethod);
        xmlWriter.setOutputProperty(XMLWriter.OMIT_XML_DECLARATION, omitXmlDeclaration ? YES : NO);

        return xmlWriter;
    }

    /**
     * Close the stream, logging instead of propagating any failure so that a
     * close problem never hides the result or the exception of the conversion.
     */
    private static void closeQuietly(InputStream inputStream) {
        if (inputStream == null) {
            return;
        }
        try {
            inputStream.close();
        } catch (Exception e) {
            LOG.warn("Failed to close the inputStream", e);
        }
    }

    /**
     * @param parserFeatures the parserFeatures to set
     */
    public void setParserFeatures(Map<String, Boolean> parserFeatures) {
        this.parserFeatures = parserFeatures;
    }

    /**
     * @return the parserFeatures
     */
    public Map<String, Boolean> getParserFeatures() {
        return parserFeatures;
    }

    /**
     * @param parserPropeties the parserPropeties to set
     */
    public void setParserPropeties(Map<String, Object> parserPropeties) {
        this.parserPropeties = parserPropeties;
    }

    /**
     * @return the parserPropeties
     */
    public Map<String, Object> getParserPropeties() {
        return parserPropeties;
    }

    /**
     * @param method the output method to set
     */
    public void setMethod(String method) {
        this.method = method;
    }

    /**
     * @return the output method, or <code>null</code> when unset
     */
    public String getMethod() {
        return method;
    }

    /**
     * @return whether the XML declaration is omitted from String output
     */
    public boolean isOmitXmlDeclaration() {
        return omitXmlDeclaration;
    }

    /**
     * @param omitXmlDeclaration <code>true</code> to omit the XML declaration
     *            from String output
     */
    public void setOmitXmlDeclaration(boolean omitXmlDeclaration) {
        this.omitXmlDeclaration = omitXmlDeclaration;
    }

    /**
     * @return the dataObjectType
     */
    public Class<?> getDataObjectType() {
        return dataObjectType;
    }

    /**
     * @param dataObjectType the dataObjectType to set
     */
    public void setDataObjectType(Class<?> dataObjectType) {
        this.dataObjectType = dataObjectType;
    }
}