| <?php |
| /** |
| * File containing the ezcDocumentXhtml class |
| * |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| * |
| * @package Document |
| * @version //autogen// |
| * @license http://www.apache.org/licenses/LICENSE-2.0 Apache License, Version 2.0 |
| */ |
| |
/**
 * The document handler for XHTML document markup.
 *
 * Input markup is parsed with the DOM HTML parser, annotated by a
 * configurable list of filter objects and can then be converted into a
 * docbook document. The reverse direction (docbook to XHTML) is delegated to
 * ezcDocumentDocbookToHtmlConverter. Validation is performed against the
 * bundled XHTML 1.0 transitional XML schema.
 *
 * @package Document
 * @version //autogen//
 */
class ezcDocumentXhtml extends ezcDocumentXmlBase implements ezcDocumentValidation
{
    /**
     * Array with filter objects for the input HTML document.
     *
     * Every filter is applied to the loaded DOM document, in array order,
     * before the document is converted to docbook.
     *
     * @var array(ezcDocumentXhtmlFilter)
     */
    protected $filters;

    /**
     * Construct the XHTML document handler.
     *
     * Uses a default ezcDocumentXhtmlOptions object if no options are given
     * and registers the default filters (element filter followed by the
     * metadata filter).
     *
     * @ignore
     * @param ezcDocumentXhtmlOptions $options
     * @return void
     */
    public function __construct( ezcDocumentXhtmlOptions $options = null )
    {
        parent::__construct( $options === null ?
            new ezcDocumentXhtmlOptions() :
            $options );

        $this->filters = array(
            new ezcDocumentXhtmlElementFilter(),
            new ezcDocumentXhtmlMetadataFilter(),
        );
    }

    /**
     * Create document from input string
     *
     * Create a document of the current type handler class and parse it into a
     * usable internal structure.
     *
     * @param string $string
     * @return void
     *
     * @throws ezcDocumentErroneousXmlException
     *         If the failOnError option is enabled and libxml reported
     *         errors while parsing.
     */
    public function loadString( $string )
    {
        // Use internal error handling to handle XML errors manually.
        $oldXmlErrorHandling = libxml_use_internal_errors( true );
        libxml_clear_errors();

        // Load XML document
        $this->document = new DOMDocument();
        $this->document->registerNodeClass( 'DOMElement', 'ezcDocumentPropertyContainerDomElement' );

        // Use the loadHTML method here, as it for example converts tag names
        // and attribute names to lower case, and handles some more errors
        // common in HTML documents.
        $this->document->loadHTML( $string );

        // Only collect the errors if they are going to be reported.
        $errors = ( $this->options->failOnError ?
            libxml_get_errors() :
            null );

        // Restore the libxml error state found on entry.
        libxml_clear_errors();
        libxml_use_internal_errors( $oldXmlErrorHandling );

        // If there are errors and the error handling is activated throw an
        // exception with the occurred errors.
        if ( $errors )
        {
            throw new ezcDocumentErroneousXmlException( $errors );
        }
    }

    /**
     * Set filters
     *
     * Set an array with filter objects, which extract the semantic
     * information from the given XHtml document. The given array replaces
     * all previously registered filters.
     *
     * @param array $filters
     * @return void
     */
    public function setFilters( array $filters )
    {
        $this->filters = $filters;
    }

    /**
     * Build docbook document out of annotated XHtml document
     *
     * Creates a new docbook DOM document with an article root element and
     * transforms the root html element of the given document into it.
     *
     * @param DOMDocument $document
     * @return DOMDocument
     */
    protected function buildDocbookDocument( DOMDocument $document )
    {
        $docbook = new DOMDocument( '1.0', 'utf-8' );
        $docbook->preserveWhiteSpace = false;
        $docbook->formatOutput = true;

        $root = $docbook->createElementNs( 'http://docbook.org/ns/docbook', 'article' );
        $docbook->appendChild( $root );

        $xpath = new DOMXPath( $document );
        $html = $xpath->query( '/*[local-name() = "html"]' )->item( 0 );

        // Without a root html element there is nothing to transform. Return
        // the empty article in that case instead of passing null to a
        // parameter which requires a DOMElement.
        if ( $html instanceof DOMElement )
        {
            $this->transformToDocbook( $html, $root );
        }

        return $docbook;
    }

    /**
     * Check if the current node is an inline element
     *
     * Textual content is only allowed in inline element. This method returns
     * true if the tag name of the given element is in the list of elements
     * which may directly contain text, and false otherwise.
     *
     * @param DOMElement $element
     * @return bool
     */
    protected function isInlineElement( DOMElement $element )
    {
        return in_array(
            $element->tagName,
            array(
                'abbrev',
                'abstract',
                'acronym',
                'anchor',
                'attribution',
                'author',
                'authors',
                'citation',
                'contrib',
                'copyright',
                'date',
                'email',
                'emphasis',
                'footnote',
                'footnoteref',
                'inlinemediaobject',
                'link',
                'literal',
                'literallayout',
                'para',
                'pubdate',
                'publisher',
                'quote',
                'releaseinfo',
                'subscript',
                'subtitle',
                'superscript',
                'term',
                'title',
                'ulink',
            ),
            true
        );
    }

    /**
     * Recursively transform annotated XHtml elements to docbook
     *
     * If the XHtml element carries a 'type' property, an element with that
     * name is created below the current docbook node and becomes the target
     * for all children of the XHtml element. Text nodes are copied verbatim
     * where whitespace is significant, whitespace-normalized inside inline
     * elements, and wrapped into a paragraph everywhere else. CDATA sections,
     * entities and comments are not transferred.
     *
     * @param DOMElement $xhtml
     * @param DOMElement $docbook
     * @param bool $significantWhitespace
     * @return void
     */
    protected function transformToDocbook( DOMElement $xhtml, DOMElement $docbook, $significantWhitespace = false )
    {
        if ( ( $tagName = $xhtml->getProperty( 'type' ) ) !== false )
        {
            // The node has to be attached to the target document before
            // attributes can be set on it.
            $node = new DOMElement( $tagName );
            $docbook->appendChild( $node );
            $docbook = $node;

            if ( ( $attributes = $xhtml->getProperty( 'attributes' ) ) !== false )
            {
                foreach ( $attributes as $name => $value )
                {
                    // NOTE(review): DOM escapes attribute values itself on
                    // serialization, so htmlspecialchars() may double-escape
                    // here. Kept, since existing output may depend on it -
                    // confirm.
                    $node->setAttribute( $name, htmlspecialchars( $value ) );
                }
            }
        }

        // Whitespace is significant if an ancestor requested it or if the
        // current element is annotated accordingly. This is the same for all
        // children, so it is evaluated only once.
        $keepWhitespace = $significantWhitespace ||
            ( $xhtml->getProperty( 'whitespace' ) === 'significant' );

        foreach ( $xhtml->childNodes as $child )
        {
            switch ( $child->nodeType )
            {
                case XML_ELEMENT_NODE:
                    $this->transformToDocbook( $child, $docbook, $keepWhitespace );
                    break;

                case XML_TEXT_NODE:
                    $text = $child->data;

                    if ( $keepWhitespace )
                    {
                        // Don't normalize inside nodes with significant
                        // whitespaces. This also keeps pure whitespace text
                        // nodes, for example from intentionally converted
                        // <br> elements.
                        $docbook->appendChild( new DOMText( $text ) );
                        break;
                    }

                    if ( trim( $text ) === '' )
                    {
                        // Skip pure whitespace text nodes. "break" is used
                        // on purpose: "continue" inside a switch behaves
                        // like "break", but raises a warning in current PHP
                        // versions.
                        break;
                    }

                    if ( $this->isInlineElement( $docbook ) )
                    {
                        $docbook->appendChild(
                            new DOMText( preg_replace( '(\s+)', ' ', $text ) )
                        );
                    }
                    else
                    {
                        // Wrap contents into a paragraph, if we are yet
                        // outside of an inline element.
                        $para = $docbook->ownerDocument->createElement( 'para' );
                        $para->appendChild(
                            new DOMText( trim( preg_replace( '(\s+)', ' ', $text ) ) )
                        );
                        $docbook->appendChild( $para );
                    }
                    break;

                case XML_CDATA_SECTION_NODE:
                    // CDATA sections are not transferred to the docbook
                    // document.
                    break;

                case XML_ENTITY_NODE:
                    // Seems not required, as entities in the source document
                    // are automatically transformed back to their text
                    // targets.
                    break;

                case XML_COMMENT_NODE:
                    // Ignore comments
                    break;
            }
        }
    }

    /**
     * Return document compiled to the docbook format
     *
     * The internal document structure is compiled to the docbook format and
     * the resulting docbook document is returned.
     *
     * This method is required for all formats to have one central format, so
     * that each format can be compiled into each other format using docbook as
     * an intermediate format.
     *
     * All registered filters are applied to the loaded document before the
     * docbook document is built from it.
     *
     * @return ezcDocumentDocbook
     */
    public function getAsDocbook()
    {
        foreach ( $this->filters as $filter )
        {
            $filter->filter( $this->document );
        }

        $docbook = new ezcDocumentDocbook();
        $docbook->setDomDocument(
            $this->buildDocbookDocument( $this->document )
        );
        $docbook->setPath( $this->path );
        return $docbook;
    }

    /**
     * Create document from docbook document
     *
     * A document of the docbook format is provided and the internal document
     * structure should be created out of this.
     *
     * This method is required for all formats to have one central format, so
     * that each format can be compiled into each other format using docbook as
     * an intermediate format.
     *
     * If the validate option is enabled and the docbook document does not
     * validate, a warning is triggered and the conversion is still
     * performed.
     *
     * @param ezcDocumentDocbook $document
     * @return void
     */
    public function createFromDocbook( ezcDocumentDocbook $document )
    {
        // NOTE(review): the docbook document object itself is handed to
        // validateString(); this presumably relies on the string conversion
        // of the document object - confirm.
        if ( $this->options->validate &&
             $document->validateString( $document ) !== true )
        {
            $this->triggerError( E_WARNING, "You try to convert an invalid docbook document. This may lead to invalid output." );
        }

        $this->path = $document->getPath();

        $converter = new ezcDocumentDocbookToHtmlConverter();
        $converter->options->errorReporting = $this->options->errorReporting;
        $doc = $converter->convert( $document );
        $this->document = $doc->getDomDocument();
    }

    /**
     * Return document as string
     *
     * Serialize the document to a string and return it. The leading XML
     * declaration is only kept if the xmlHeader option is enabled.
     *
     * @return string
     */
    public function save()
    {
        $source = $this->document->saveXml( $this->document, LIBXML_NOEMPTYTAG );

        // The serialized document starts with an XML declaration. Keep it
        // (replace it by itself) or strip it together with the following
        // line break, depending on the xmlHeader option.
        return preg_replace(
            '(^<\\?xml[^>]*>(?:\r\n|\r|\n)?)',
            ( $this->options->xmlHeader ? "\\0" : '' ),
            $source
        );
    }

    /**
     * Validate the input file
     *
     * Validate the input file against the specification of the current
     * document format.
     *
     * Returns true, if the validation succeeded, and an array with
     * ezcDocumentValidationError objects otherwise.
     *
     * @param string $file
     * @return mixed
     */
    public function validateFile( $file )
    {
        return $this->validateAgainstXhtmlSchema( $file, true );
    }

    /**
     * Validate the input string
     *
     * Validate the input string against the specification of the current
     * document format.
     *
     * Returns true, if the validation succeeded, and an array with
     * ezcDocumentValidationError objects otherwise.
     *
     * @param string $string
     * @return mixed
     */
    public function validateString( $string )
    {
        return $this->validateAgainstXhtmlSchema( $string, false );
    }

    /**
     * Load a document and validate it against the XHTML schema
     *
     * Shared implementation of validateFile() and validateString(), so that
     * both report their errors in exactly the same way. All libxml errors
     * raised while loading and validating are converted using
     * ezcDocumentValidationError::createFromLibXmlError(). The libxml error
     * handling state found on entry is restored before returning.
     *
     * @param string $source File name or XML string, depending on $isFile
     * @param bool $isFile Whether $source is a file name
     * @return mixed True on success, array of ezcDocumentValidationError
     *               objects otherwise
     */
    private function validateAgainstXhtmlSchema( $source, $isFile )
    {
        $oldSetting = libxml_use_internal_errors( true );
        libxml_clear_errors();

        $document = new DOMDocument();
        if ( $isFile )
        {
            $document->load( $source );
        }
        else
        {
            $document->loadXML( $source );
        }
        $document->schemaValidate( dirname( __FILE__ ) . '/xhtml/schema/xhtml1-transitional.xsd' );

        // Get all errors
        $errors = array();
        foreach ( libxml_get_errors() as $error )
        {
            $errors[] = ezcDocumentValidationError::createFromLibXmlError( $error );
        }

        libxml_clear_errors();
        libxml_use_internal_errors( $oldSetting );

        return ( count( $errors ) ? $errors : true );
    }
}
| |
| ?> |