Document/src/document/xml/xhtml.php - zetacomponents - Git at Google

 <?php
 /**
  * File containing the ezcDocumentXhtml class
  *
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  *
  * @package Document
  * @version //autogen//
  * @license http://www.apache.org/licenses/LICENSE-2.0 Apache License, Version 2.0
  */

 /**
  * The document handler for XHTML document markup.
  *
  * @package Document
  * @version //autogen//
  */
 class ezcDocumentXhtml extends ezcDocumentXmlBase implements ezcDocumentValidation
 {
     /**
      * Array with filter objects for the input HTML document.
      *
      * @var array(ezcDocumentXhtmlFilter)
      */
     protected $filters;

     /**
      * Construct document xml base.
      *
      * @ignore
      * @param ezcDocumentXhtmlOptions $options
      * @return void
      */
     public function __construct( ezcDocumentXhtmlOptions $options = null )
     {
         parent::__construct( $options === null ?
             new ezcDocumentXhtmlOptions() :
             $options );

         $this->filters = array(
             new ezcDocumentXhtmlElementFilter(),
             new ezcDocumentXhtmlMetadataFilter(),
         );
     }

     /**
      * Create document from input string
      *
      * Create a document of the current type handler class and parse it into a
      * usable internal structure.
      *
      * @param string $string
      * @return void
      */
     public function loadString( $string )
     {
         // Use internal error handling to handle XML errors manually.
         $oldXmlErrorHandling = libxml_use_internal_errors( true );
         libxml_clear_errors();

         // Load XML document
         $this->document = new DOMDocument();
         $this->document->registerNodeClass( 'DOMElement', 'ezcDocumentPropertyContainerDomElement' );

         // Use the loadHtml method here, as it for example convers tag names
         // and attribute names to lower case, and handles some more errors
         // common in HTML documents.
         $this->document->loadHtml( $string );

         $errors = ( $this->options->failOnError ?
             libxml_get_errors() :
             null );

         libxml_clear_errors();
         libxml_use_internal_errors( $oldXmlErrorHandling );

         // If there are errors and the error handling is activated throw an
         // exception with the occured errors.
         if ( $errors )
         {
             throw new ezcDocumentErroneousXmlException( $errors );
         }
     }

     /**
      * Set filters
      *
      * Set an array with filter objects, which extract the sematic
      * information from the given XHtml document.
      *
      * @param array $filters
      * @return void
      */
     public function setFilters( array $filters )
     {
         $this->filters = $filters;
     }

     /**
      * Build docbook document out of annotated XHtml document
      *
      * @param DOMDocument $document
      * @return DOMDocument
      */
     protected function buildDocbookDocument( DOMDocument $document )
     {
         $docbook = new DOMDocument( '1.0', 'utf-8' );
         $docbook->preserveWhiteSpace = false;
         $docbook->formatOutput = true;

         $root = $docbook->createElementNs( 'http://docbook.org/ns/docbook', 'article' );
         $docbook->appendChild( $root );

         $xpath = new DOMXPath( $document );
         $html = $xpath->query( '/*[local-name() = "html"]' )->item( 0 );
         $this->transformToDocbook( $html, $root );

         return $docbook;
     }

     /**
      * Check if the current node is an inline element
      *
      * Textual content is only allowed in inline element. This method returns
      * true if the current element is an inline element, otherwise text
      * contents might be ignored in the output.
      *
      * @param DOMElement $element
      * @return void
      */
     protected function isInlineElement( DOMElement $element )
     {
         return in_array( $element->tagName, array(
             'abbrev',
             'abstract',
             'acronym',
             'anchor',
             'attribution',
             'author',
             'authors',
             'citation',
             'contrib',
             'copyright',
             'date',
             'email',
             'emphasis',
             'footnote',
             'footnoteref',
             'inlinemediaobject',
             'link',
             'literal',
             'literallayout',
             'para',
             'pubdate',
             'publisher',
             'quote',
             'releaseinfo',
             'subscript',
             'subtitle',
             'superscript',
             'term',
             'title',
             'ulink',
         ) );
     }

     /**
      * Recursively transform annotated XHtml elements to docbook
      *
      * @param DOMElement $xhtml
      * @param DOMElement $docbook
      * @param bool $significantWhitespace
      * @return void
      */
     protected function transformToDocbook( DOMElement $xhtml, DOMElement $docbook, $significantWhitespace = false )
     {
         if ( ( $tagName = $xhtml->getProperty( 'type' ) ) !== false )
         {
             $node = new DOMElement( $tagName );
             $docbook->appendChild( $node );
             $docbook = $node;

             if ( ( $attributes = $xhtml->getProperty( 'attributes' ) ) !== false )
             {
                 foreach ( $attributes as $name => $value )
                 {
                     $node->setAttribute( $name, htmlspecialchars( $value ) );
                 }
             }
         }

         foreach ( $xhtml->childNodes as $child )
         {
             switch ( $child->nodeType )
             {
                 case XML_ELEMENT_NODE:
                     $this->transformToDocbook( $child, $docbook, $significantWhitespace || $xhtml->getProperty( 'whitespace' ) === 'significant' );
                     break;

                 case XML_TEXT_NODE:
                     // Skip pure whitespace text nodes, except for
                     // intentionally converted <br> elements.
                     if ( ( trim( $text = $child->data ) === '' ) &&
                          ( !$significantWhitespace ) &&
                          ( $xhtml->getProperty( 'whitespace' ) !== 'significant' ) )
                     {
                         continue;
                     }

                     if ( ( $xhtml->getProperty( 'whitespace' ) === 'significant' ) ||
                          ( $significantWhitespace ) )
                     {
                         // Don't normalize inside nodes with significant whitespaces.
                         $text = new DOMText( $text );
                         $docbook->appendChild( $text );
                     }
                     else if ( $this->isInlineElement( $docbook ) )
                     {
                         $text = new DOMText( preg_replace( '(\s+)', ' ', $text ) );
                         $docbook->appendChild( $text );
                     }
                     else
                     {
                         // Wrap contents into a paragraph, if we are yet
                         // outside of an inline element.
                         $text = new DOMText( trim( preg_replace( '(\s+)', ' ', $text ) ) );
                         $para = $docbook->ownerDocument->createElement( 'para' );
                         $para->appendChild( $text );
                         $docbook->appendChild( $para );
                     }
                     break;

                 case XML_CDATA_SECTION_NODE:
 //                    $data = new DOMCharacterData();
 //                    $data->appendData( $child->data );
 //                    $docbook->appendChild( $data );
                     break;

                 case XML_ENTITY_NODE:
                     // Seems not required, as entities in the source document
                     // are automatically transformed back to their text
                     // targets.
                     break;

                 case XML_COMMENT_NODE:
                     // Ignore comments
                     break;

                     $comment = new DOMElement( 'comment', $child->data );
                     $docbook->appendChild( $comment );
                     break;
             }
         }
     }

     /**
      * Return document compiled to the docbook format
      *
      * The internal document structure is compiled to the docbook format and
      * the resulting docbook document is returned.
      *
      * This method is required for all formats to have one central format, so
      * that each format can be compiled into each other format using docbook as
      * an intermediate format.
      *
      * You may of course just call an existing converter for this conversion.
      *
      * @return ezcDocumentDocbook
      */
     public function getAsDocbook()
     {
         foreach ( $this->filters as $filter )
         {
             $filter->filter( $this->document );
         }

         $docbook = new ezcDocumentDocbook();
         $docbook->setDomDocument(
             $this->buildDocbookDocument( $this->document )
         );
         $docbook->setPath( $this->path );
         return $docbook;
     }

     /**
      * Create document from docbook document
      *
      * A document of the docbook format is provided and the internal document
      * structure should be created out of this.
      *
      * This method is required for all formats to have one central format, so
      * that each format can be compiled into each other format using docbook as
      * an intermediate format.
      *
      * You may of course just call an existing converter for this conversion.
      *
      * @param ezcDocumentDocbook $document
      * @return void
      */
     public function createFromDocbook( ezcDocumentDocbook $document )
     {
         if ( $this->options->validate &&
              $document->validateString( $document ) !== true )
         {
             $this->triggerError( E_WARNING, "You try to convert an invalid docbook document. This may lead to invalid output." );
         }

         $this->path = $document->getPath();

         $converter = new ezcDocumentDocbookToHtmlConverter();
         $converter->options->errorReporting = $this->options->errorReporting;
         $doc = $converter->convert( $document );
         $this->document = $doc->getDomDocument();
     }

     /**
      * Return document as string
      *
      * Serialize the document to a string an return it.
      *
      * @return string
      */
     public function save()
     {
         $source = $this->document->saveXml( $this->document, LIBXML_NOEMPTYTAG );

         // Append DOCTYPE to document, as this is not possible using the DOM
         // API we do this with a regular expression hack.
         return preg_replace(
             '(^<\\?xml[^>]*>(?:\r\n|\r|\n)?)',
             ( $this->options->xmlHeader ? "\\0" : '' ),
             $source
         );
     }

     /**
      * Validate the input file
      *
      * Validate the input file against the specification of the current
      * document format.
      *
      * Returns true, if the validation succeded, and an array with
      * ezcDocumentValidationError objects otherwise.
      *
      * @param string $file
      * @return mixed
      */
     public function validateFile( $file )
     {
         $oldSetting = libxml_use_internal_errors( true );
         libxml_clear_errors();
         $document = new DOMDocument();
         $document->load( $file );
         $document->schemaValidate( dirname( __FILE__ ) . '/xhtml/schema/xhtml1-transitional.xsd' );

         // Get all errors
         $xmlErrors = libxml_get_errors();
         $errors = array();
         foreach ( $xmlErrors as $error )
         {
             $errors[] = new ezcDocumentValidationError( $error );
         }
         libxml_clear_errors();
         libxml_use_internal_errors( $oldSetting );

         return ( count( $errors ) ? $errors : true );
     }

     /**
      * Validate the input string
      *
      * Validate the input string against the specification of the current
      * document format.
      *
      * Returns true, if the validation succeded, and an array with
      * ezcDocumentValidationError objects otherwise.
      *
      * @param string $string
      * @return mixed
      */
     public function validateString( $string )
     {
         $oldSetting = libxml_use_internal_errors( true );
         libxml_clear_errors();
         $document = new DOMDocument();
         $document->loadXml( $string );
         $document->schemaValidate( dirname( __FILE__ ) . '/xhtml/schema/xhtml1-transitional.xsd' );

         // Get all errors
         $xmlErrors = libxml_get_errors();
         $errors = array();
         foreach ( $xmlErrors as $error )
         {
             $errors[] = ezcDocumentValidationError::createFromLibXmlError( $error );
         }
         libxml_clear_errors();
         libxml_use_internal_errors( $oldSetting );

         return ( count( $errors ) ? $errors : true );
     }
 }

 ?>
	<?php
	/**
	* File containing the ezcDocumentXhtml class
	*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*
	* @package Document
	* @version //autogen//
	* @license http://www.apache.org/licenses/LICENSE-2.0 Apache License, Version 2.0
	*/

	/**
	* The document handler for XHTML document markup.
	*
	* @package Document
	* @version //autogen//
	*/
	class ezcDocumentXhtml extends ezcDocumentXmlBase implements ezcDocumentValidation
	{
	/**
	* Array with filter objects for the input HTML document.
	*
	* @var array(ezcDocumentXhtmlFilter)
	*/
	protected $filters;

	/**
	* Construct document xml base.
	*
	* @ignore
	* @param ezcDocumentXhtmlOptions $options
	* @return void
	*/
	public function __construct( ezcDocumentXhtmlOptions $options = null )
	{
	parent::__construct( $options === null ?
	new ezcDocumentXhtmlOptions() :
	$options );

	$this->filters = array(
	new ezcDocumentXhtmlElementFilter(),
	new ezcDocumentXhtmlMetadataFilter(),
	);
	}

	/**
	* Create document from input string
	*
	* Create a document of the current type handler class and parse it into a
	* usable internal structure.
	*
	* @param string $string
	* @return void
	*/
	public function loadString( $string )
	{
	// Use internal error handling to handle XML errors manually.
	$oldXmlErrorHandling = libxml_use_internal_errors( true );
	libxml_clear_errors();

	// Load XML document
	$this->document = new DOMDocument();
	$this->document->registerNodeClass( 'DOMElement', 'ezcDocumentPropertyContainerDomElement' );

	// Use the loadHtml method here, as it for example convers tag names
	// and attribute names to lower case, and handles some more errors
	// common in HTML documents.
	$this->document->loadHtml( $string );

	$errors = ( $this->options->failOnError ?
	libxml_get_errors() :
	null );

	libxml_clear_errors();
	libxml_use_internal_errors( $oldXmlErrorHandling );

	// If there are errors and the error handling is activated throw an
	// exception with the occured errors.
	if ( $errors )
	{
	throw new ezcDocumentErroneousXmlException( $errors );
	}
	}

	/**
	* Set filters
	*
	* Set an array with filter objects, which extract the sematic
	* information from the given XHtml document.
	*
	* @param array $filters
	* @return void
	*/
	public function setFilters( array $filters )
	{
	$this->filters = $filters;
	}

	/**
	* Build docbook document out of annotated XHtml document
	*
	* @param DOMDocument $document
	* @return DOMDocument
	*/
	protected function buildDocbookDocument( DOMDocument $document )
	{
	$docbook = new DOMDocument( '1.0', 'utf-8' );
	$docbook->preserveWhiteSpace = false;
	$docbook->formatOutput = true;

	$root = $docbook->createElementNs( 'http://docbook.org/ns/docbook', 'article' );
	$docbook->appendChild( $root );

	$xpath = new DOMXPath( $document );
	$html = $xpath->query( '/*[local-name() = "html"]' )->item( 0 );
	$this->transformToDocbook( $html, $root );

	return $docbook;
	}

	/**
	* Check if the current node is an inline element
	*
	* Textual content is only allowed in inline element. This method returns
	* true if the current element is an inline element, otherwise text
	* contents might be ignored in the output.
	*
	* @param DOMElement $element
	* @return void
	*/
	protected function isInlineElement( DOMElement $element )
	{
	return in_array( $element->tagName, array(
	'abbrev',
	'abstract',
	'acronym',
	'anchor',
	'attribution',
	'author',
	'authors',
	'citation',
	'contrib',
	'copyright',
	'date',
	'email',
	'emphasis',
	'footnote',
	'footnoteref',
	'inlinemediaobject',
	'link',
	'literal',
	'literallayout',
	'para',
	'pubdate',
	'publisher',
	'quote',
	'releaseinfo',
	'subscript',
	'subtitle',
	'superscript',
	'term',
	'title',
	'ulink',
	) );
	}

	/**
	* Recursively transform annotated XHtml elements to docbook
	*
	* @param DOMElement $xhtml
	* @param DOMElement $docbook
	* @param bool $significantWhitespace
	* @return void
	*/
	protected function transformToDocbook( DOMElement $xhtml, DOMElement $docbook, $significantWhitespace = false )
	{
	if ( ( $tagName = $xhtml->getProperty( 'type' ) ) !== false )
	{
	$node = new DOMElement( $tagName );
	$docbook->appendChild( $node );
	$docbook = $node;

	if ( ( $attributes = $xhtml->getProperty( 'attributes' ) ) !== false )
	{
	foreach ( $attributes as $name => $value )
	{
	$node->setAttribute( $name, htmlspecialchars( $value ) );
	}
	}
	}

	foreach ( $xhtml->childNodes as $child )
	{
	switch ( $child->nodeType )
	{
	case XML_ELEMENT_NODE:
	$this->transformToDocbook( $child, $docbook, $significantWhitespace \|\| $xhtml->getProperty( 'whitespace' ) === 'significant' );
	break;

	case XML_TEXT_NODE:
	// Skip pure whitespace text nodes, except for
	// intentionally converted <br> elements.
	if ( ( trim( $text = $child->data ) === '' ) &&
	( !$significantWhitespace ) &&
	( $xhtml->getProperty( 'whitespace' ) !== 'significant' ) )
	{
	continue;
	}

	if ( ( $xhtml->getProperty( 'whitespace' ) === 'significant' ) \|\|
	( $significantWhitespace ) )
	{
	// Don't normalize inside nodes with significant whitespaces.
	$text = new DOMText( $text );
	$docbook->appendChild( $text );
	}
	else if ( $this->isInlineElement( $docbook ) )
	{
	$text = new DOMText( preg_replace( '(\s+)', ' ', $text ) );
	$docbook->appendChild( $text );
	}
	else
	{
	// Wrap contents into a paragraph, if we are yet
	// outside of an inline element.
	$text = new DOMText( trim( preg_replace( '(\s+)', ' ', $text ) ) );
	$para = $docbook->ownerDocument->createElement( 'para' );
	$para->appendChild( $text );
	$docbook->appendChild( $para );
	}
	break;

	case XML_CDATA_SECTION_NODE:
	// $data = new DOMCharacterData();
	// $data->appendData( $child->data );
	// $docbook->appendChild( $data );
	break;

	case XML_ENTITY_NODE:
	// Seems not required, as entities in the source document
	// are automatically transformed back to their text
	// targets.
	break;

	case XML_COMMENT_NODE:
	// Ignore comments
	break;

	$comment = new DOMElement( 'comment', $child->data );
	$docbook->appendChild( $comment );
	break;
	}
	}
	}

	/**
	* Return document compiled to the docbook format
	*
	* The internal document structure is compiled to the docbook format and
	* the resulting docbook document is returned.
	*
	* This method is required for all formats to have one central format, so
	* that each format can be compiled into each other format using docbook as
	* an intermediate format.
	*
	* You may of course just call an existing converter for this conversion.
	*
	* @return ezcDocumentDocbook
	*/
	public function getAsDocbook()
	{
	foreach ( $this->filters as $filter )
	{
	$filter->filter( $this->document );
	}

	$docbook = new ezcDocumentDocbook();
	$docbook->setDomDocument(
	$this->buildDocbookDocument( $this->document )
	);
	$docbook->setPath( $this->path );
	return $docbook;
	}

	/**
	* Create document from docbook document
	*
	* A document of the docbook format is provided and the internal document
	* structure should be created out of this.
	*
	* This method is required for all formats to have one central format, so
	* that each format can be compiled into each other format using docbook as
	* an intermediate format.
	*
	* You may of course just call an existing converter for this conversion.
	*
	* @param ezcDocumentDocbook $document
	* @return void
	*/
	public function createFromDocbook( ezcDocumentDocbook $document )
	{
	if ( $this->options->validate &&
	$document->validateString( $document ) !== true )
	{
	$this->triggerError( E_WARNING, "You try to convert an invalid docbook document. This may lead to invalid output." );
	}

	$this->path = $document->getPath();

	$converter = new ezcDocumentDocbookToHtmlConverter();
	$converter->options->errorReporting = $this->options->errorReporting;
	$doc = $converter->convert( $document );
	$this->document = $doc->getDomDocument();
	}

	/**
	* Return document as string
	*
	* Serialize the document to a string an return it.
	*
	* @return string
	*/
	public function save()
	{
	$source = $this->document->saveXml( $this->document, LIBXML_NOEMPTYTAG );

	// Append DOCTYPE to document, as this is not possible using the DOM
	// API we do this with a regular expression hack.
	return preg_replace(
	'(^<\\?xml[^>]*>(?:\r\n\|\r\|\n)?)',
	( $this->options->xmlHeader ? "\\0" : '' ),
	$source
	);
	}

	/**
	* Validate the input file
	*
	* Validate the input file against the specification of the current
	* document format.
	*
	* Returns true, if the validation succeded, and an array with
	* ezcDocumentValidationError objects otherwise.
	*
	* @param string $file
	* @return mixed
	*/
	public function validateFile( $file )
	{
	$oldSetting = libxml_use_internal_errors( true );
	libxml_clear_errors();
	$document = new DOMDocument();
	$document->load( $file );
	$document->schemaValidate( dirname( __FILE__ ) . '/xhtml/schema/xhtml1-transitional.xsd' );

	// Get all errors
	$xmlErrors = libxml_get_errors();
	$errors = array();
	foreach ( $xmlErrors as $error )
	{
	$errors[] = new ezcDocumentValidationError( $error );
	}
	libxml_clear_errors();
	libxml_use_internal_errors( $oldSetting );

	return ( count( $errors ) ? $errors : true );
	}

	/**
	* Validate the input string
	*
	* Validate the input string against the specification of the current
	* document format.
	*
	* Returns true, if the validation succeded, and an array with
	* ezcDocumentValidationError objects otherwise.
	*
	* @param string $string
	* @return mixed
	*/
	public function validateString( $string )
	{
	$oldSetting = libxml_use_internal_errors( true );
	libxml_clear_errors();
	$document = new DOMDocument();
	$document->loadXml( $string );
	$document->schemaValidate( dirname( __FILE__ ) . '/xhtml/schema/xhtml1-transitional.xsd' );

	// Get all errors
	$xmlErrors = libxml_get_errors();
	$errors = array();
	foreach ( $xmlErrors as $error )
	{
	$errors[] = ezcDocumentValidationError::createFromLibXmlError( $error );
	}
	libxml_clear_errors();
	libxml_use_internal_errors( $oldSetting );

	return ( count( $errors ) ? $errors : true );
	}
	}

	?>