Document/src/document/wiki/tokenizer/creole.php - zetacomponents - Git at Google

 <?php
 /**
  * File containing the ezcDocumentWikiCreoleTokenizer
  *
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  *
  * @package Document
  * @version //autogen//
  * @license http://www.apache.org/licenses/LICENSE-2.0 Apache License, Version 2.0
  */

 /**
  * Tokenizer for Creole wiki documents.
  *
  * The Creole wiki syntax is an effort to unify wiki markup languages.
  * Its documentation can be found at:
  *
  * http://www.wikicreole.org/
  *
  * For the basic workings of the tokenizer see the class level documentation in
  * the ezcDocumentWikiTokenizer class.
  *
  * @package Document
  * @version //autogen//
  */
 class ezcDocumentWikiCreoleTokenizer extends ezcDocumentWikiTokenizer
 {
     /**
      * Character class matching horizontal whitespace (space and tab). The
      * vertical tab is excluded, because it causes strange problems with PCRE.
      */
     const WHITESPACE_CHARS  = '[\\x20\\t]';

     /**
      * Characters ending a pure text section.
      *
      * Used as the body of a negated character class in the text line token
      * expression, so it must stay valid inside "[^...]".
      */
     const TEXT_END_CHARS    = '/*^,#_~\\\\\\[\\]{}|=\\r\\n\\t\\x20-';

     /**
      * Special characters, which do have some special meaning and thus may
      * not have been matched otherwise.
      *
      * Used as the body of a character class in the special chars token
      * expression.
      */
     const SPECIAL_CHARS     = '/*^,#_~\\\\\\[\\]{}|=-';

     /**
      * Construct tokenizer
      *
      * Create the token array, which maps token classes to the regular
      * expression matching the respective token. The order of the entries is
      * significant: expressions anchored on a leading newline are listed
      * before the generic newline expression, and the catch-all expressions
      * for text and special characters come last.
      *
      * All expressions use "(" and ")" as PCRE delimiters and are anchored at
      * the subject start with \A. Some expressions define a named "match"
      * group next to the "value" group, presumably to tell the base tokenizer
      * which part of the input to consume -- confirm in
      * ezcDocumentWikiTokenizer.
      *
      * @return void
      */
     public function __construct()
     {
         $this->tokens = array(
             // Match tokens which require to be at the start of a line before
             // matching the actual newlines, because they are the indicator for
             // line starts.

             // Title start: newline, one or more "=", then whitespace.
             array(
                 'class' => 'ezcDocumentWikiTitleToken',
                 'match' => '(\\A\\n(?P<value>=+)' . self::WHITESPACE_CHARS . '+)S' ),
             // Optional title end: whitespace, one or more "=", then a newline
             // which is not part of the "match" group.
             array(
                 'class' => 'ezcDocumentWikiTitleToken',
                 'match' => '(\\A(?P<match>' . self::WHITESPACE_CHARS . '+(?P<value>=+))\\n)S' ),
             // Bullet list item: newline, optional whitespace, a run of "*" or
             // "-", then whitespace.
             array(
                 'class' => 'ezcDocumentWikiBulletListItemToken',
                 'match' => '(\\A\\n' . self::WHITESPACE_CHARS . '*(?P<value>[*-]+)' . self::WHITESPACE_CHARS . '+)S' ),
             // Enumerated list item: same as above, but using "#".
             array(
                 'class' => 'ezcDocumentWikiEnumeratedListItemToken',
                 'match' => '(\\A\\n' . self::WHITESPACE_CHARS . '*(?P<value>#+)' . self::WHITESPACE_CHARS . '+)S' ),
             // Page break: a line consisting of exactly four dashes, optionally
             // surrounded by whitespace.
             array(
                 'class' => 'ezcDocumentWikiPageBreakToken',
                 'match' => '(\\A(?P<match>\n' . self::WHITESPACE_CHARS . '*(?P<value>-{4})' . self::WHITESPACE_CHARS . '*)\\n)S' ),
             // Literal block: "{{{" and "}}}" each on a line of their own. The
             // U modifier makes ".+" ungreedy, the s modifier lets "." span
             // multiple lines.
             array(
                 'class' => 'ezcDocumentWikiLiteralBlockToken',
                 'match' => '(\\A(?P<match>\\n\\{\\{\\{\\n(?P<value>.+)\\n\\}\\}\\})\\n)SUs' ),
             // Table row: a "|" directly after a newline; only the newline is
             // part of the "match" group.
             array(
                 'class' => 'ezcDocumentWikiTableRowToken',
                 'match' => '(\\A(?P<match>\\n)(?P<value>\\|))S' ),
             // Paragraph indentation: a run of ">" or ":" directly after a
             // newline, followed by optional whitespace.
             array(
                 'class' => 'ezcDocumentWikiParagraphIndentationToken',
                 'match' => '(\\A\\n(?P<value>(?:>|:)+)' . self::WHITESPACE_CHARS . '*)S' ),

             // Whitespaces
             array(
                 'class' => 'ezcDocumentWikiNewLineToken',
                 'match' => '(\\A' . self::WHITESPACE_CHARS . '*(?P<value>\\r\\n|\\r|\\n))S' ),
             array(
                 'class' => 'ezcDocumentWikiWhitespaceToken',
                 'match' => '(\\A(?P<value>' . self::WHITESPACE_CHARS . '+))S' ),
             // A form feed character (0x0c) is matched as the end of file
             // marker.
             array(
                 'class' => 'ezcDocumentWikiEndOfFileToken',
                 'match' => '(\\A(?P<value>\\x0c))S' ),

             // Escape character
             array(
                 'class' => 'ezcDocumentWikiEscapeCharacterToken',
                 'match' => '(\\A(?P<value>~))S' ),

             // Inline markup
             array(
                 'class' => 'ezcDocumentWikiBoldToken',
                 'match' => '(\\A(?P<value>\\*\\*))S' ),
             array(
                 'class' => 'ezcDocumentWikiItalicToken',
                 'match' => '(\\A(?P<value>//))S' ),
             array(
                 'class' => 'ezcDocumentWikiMonospaceToken',
                 'match' => '(\\A(?P<value>##))S' ),
             array(
                 'class' => 'ezcDocumentWikiSuperscriptToken',
                 'match' => '(\\A(?P<value>\\^\\^))S' ),
             array(
                 'class' => 'ezcDocumentWikiSubscriptToken',
                 'match' => '(\\A(?P<value>,,))S' ),
             array(
                 'class' => 'ezcDocumentWikiUnderlineToken',
                 'match' => '(\\A(?P<value>__))S' ),
             // Inline literal: "{{{ ... }}}". The trailing "\}*" keeps
             // additional closing braces inside the value.
             array(
                 'class' => 'ezcDocumentWikiInlineLiteralToken',
                 'match' => '(\\A\\{\\{\\{(?P<value>.+?\\}*)\\}\\}\\})Ss' ),
             // Forced line break: two consecutive backslashes.
             array(
                 'class' => 'ezcDocumentWikiLineBreakToken',
                 'match' => '(\\A(?P<value>\\\\\\\\))S' ),
             array(
                 'class' => 'ezcDocumentWikiImageStartToken',
                 'match' => '(\\A(?P<value>\\{\\{))S' ),
             array(
                 'class' => 'ezcDocumentWikiImageEndToken',
                 'match' => '(\\A(?P<value>\\}\\}))S' ),
             array(
                 'class' => 'ezcDocumentWikiLinkStartToken',
                 'match' => '(\\A(?P<value>\\[\\[))S' ),
             array(
                 'class' => 'ezcDocumentWikiLinkEndToken',
                 'match' => '(\\A(?P<value>\\]\\]))S' ),
             // Table header cell: "|=". Listed before the plain "|" separator
             // so the longer marker is tried first.
             array(
                 'class' => 'ezcDocumentWikiTableHeaderToken',
                 'match' => '(\\A(?P<value>\\|=))S' ),
             // Separator: either a single "|" or "->" with optional
             // surrounding whitespace.
             array(
                 'class' => 'ezcDocumentWikiSeparatorToken',
                 'match' => '(\\A(?P<value>\\||' . self::WHITESPACE_CHARS . '*->' . self::WHITESPACE_CHARS . '*))S' ),
             // Inter wiki link: "Name:" followed by at least two capitalized
             // word parts.
             array(
                 'class' => 'ezcDocumentWikiInterWikiLinkToken',
                 'match' => '(\\A(?P<value>([A-Za-z]+):(?:[A-Z][a-z0-9_-]+){2,}))S' ),
             // Internal link: at least two capitalized word parts (CamelCase).
             array(
                 'class' => 'ezcDocumentWikiInternalLinkToken',
                 'match' => '(\\A(?P<value>(?:[A-Z][a-z]+){2,}))S' ),
             // External link: "scheme://..." lazily extended until an optional
             // trailing punctuation character followed by whitespace, a
             // newline, "|", "]]" or the end of the input. Punctuation and
             // terminator are not part of the "match" group.
             array(
                 'class' => 'ezcDocumentWikiExternalLinkToken',
                 'match' => '(\\A(?P<match>(?P<value>[a-z]+://\S+?))[,.?!:;"\']?(?:' . self::WHITESPACE_CHARS . '|\\n|\\||]]|\\||$))S' ),

             // Handle plugins: everything between "<<" and ">>", which may
             // span multiple lines.
             array(
                 'class' => 'ezcDocumentWikiPluginToken',
                 'match' => '(\\A<<(?P<value>.*?)>>)Ss' ),

             // Match text up to the next character which may start markup.
             array(
                 'class' => 'ezcDocumentWikiTextLineToken',
                 'match' => '(\\A(?P<value>[^' . self::TEXT_END_CHARS . ']+))S' ),

             // Match all special characters, which are not valid textual chars,
             // but have not been matched by any other expression. A run of the
             // same character is matched as one token.
             array(
                 'class' => 'ezcDocumentWikiSpecialCharsToken',
                 'match' => '(\\A(?P<value>([' . self::SPECIAL_CHARS . '])\\2*))S' ),
         );
     }

     /**
      * Parse plugin contents
      *
      * Plugins are totally different in each wiki component and its contents
      * should not be passed through the normal wiki parser. So we fetch its
      * contents completely and let each tokenizer extract names and
      * parameters from the complete token itself.
      *
      * The leading run of ASCII letters in the plugin content is stored in
      * $plugin->type; the type is left untouched if the content does not
      * start with a letter. Each whitespace separated key="value" or
      * key='value' pair is stored in $plugin->parameters, where a later
      * occurrence of a key overwrites an earlier one.
      *
      * @param ezcDocumentWikiPluginToken $plugin
      * @return void
      */
     protected function parsePluginContents( ezcDocumentWikiPluginToken $plugin )
     {
         // Match name of plugin
         if ( preg_match( '(^[a-z]+)i', $plugin->content, $match ) )
         {
             $plugin->type = $match[0];
         }

         // Match plugin parameters. The closing quote (\2) must not be
         // preceded by a backslash, so that escaped quotes may occur inside of
         // a value. This has to be a negative look-behind: a look-ahead at
         // this position would inspect the quote itself and never fail. The
         // escaping backslash is kept in the extracted value.
         $parameters = array();
         if ( preg_match_all( '(\s+(?P<key>[a-zA-Z_-]+)=([\'"])(?P<value>.*?)(?<!\\\\)\\2)s', $plugin->content, $match ) )
         {
             foreach ( $match['key'] as $nr => $key )
             {
                 $parameters[$key] = $match['value'][$nr];
             }
         }
         $plugin->parameters = $parameters;
     }

     /**
      * Filter tokens
      *
      * Method to filter tokens, after the input string has been tokenized. The
      * filter should extract additional information from tokens, which are not
      * generally available yet, like the depth of a title depending on the
      * title markup.
      *
      * The token objects are modified in place; the returned array contains
      * the same tokens in the same order.
      *
      * @param array $tokens
      * @return array
      */
     protected function filterTokens( array $tokens )
     {
         foreach ( $tokens as $token )
         {
             // switch ( true ) executes the first case whose instanceof check
             // succeeds.
             switch ( true )
             {
                 // Extract the title / indentation level from the length of
                 // the tokens content, ignoring surrounding whitespace.
                 case $token instanceof ezcDocumentWikiTitleToken:
                 case $token instanceof ezcDocumentWikiParagraphIndentationToken:
                     $token->level = strlen( trim( $token->content ) );
                     break;

                 // The list item indentation is the untrimmed length of the
                 // tokens content.
                 case $token instanceof ezcDocumentWikiBulletListItemToken:
                 case $token instanceof ezcDocumentWikiEnumeratedListItemToken:
                     $token->indentation = strlen( $token->content );
                     break;

                 // Extract plugin type and parameters from the raw content.
                 case $token instanceof ezcDocumentWikiPluginToken:
                     $this->parsePluginContents( $token );
                     break;
             }
         }

         return $tokens;
     }
 }

 ?>
	<?php
	/**
	* File containing the ezcDocumentWikiCreoleTokenizer
	*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*
	* @package Document
	* @version //autogen//
	* @license http://www.apache.org/licenses/LICENSE-2.0 Apache License, Version 2.0
	*/

	/**
	* Tokenizer for Creole wiki documents.
	*
	* The Creole wiki syntax is an effort to unify wiki markup languages.
	* Its documentation can be found at:
	*
	* http://www.wikicreole.org/
	*
	* For the basic workings of the tokenizer see the class level documentation in
	* the ezcDocumentWikiTokenizer class.
	*
	* @package Document
	* @version //autogen//
	*/
	class ezcDocumentWikiCreoleTokenizer extends ezcDocumentWikiTokenizer
	{
	    /**
	     * Character class matching horizontal whitespace (space and tab). The
	     * vertical tab is excluded, because it causes strange problems with PCRE.
	     */
	    const WHITESPACE_CHARS  = '[\\x20\\t]';

	    /**
	     * Characters ending a pure text section.
	     *
	     * Used as the body of a negated character class in the text line token
	     * expression, so it must stay valid inside "[^...]".
	     */
	    const TEXT_END_CHARS    = '/*^,#_~\\\\\\[\\]{}|=\\r\\n\\t\\x20-';

	    /**
	     * Special characters, which do have some special meaning and thus may
	     * not have been matched otherwise.
	     *
	     * Used as the body of a character class in the special chars token
	     * expression.
	     */
	    const SPECIAL_CHARS     = '/*^,#_~\\\\\\[\\]{}|=-';

	    /**
	     * Construct tokenizer
	     *
	     * Create the token array, which maps token classes to the regular
	     * expression matching the respective token. The order of the entries is
	     * significant: expressions anchored on a leading newline are listed
	     * before the generic newline expression, and the catch-all expressions
	     * for text and special characters come last.
	     *
	     * All expressions use "(" and ")" as PCRE delimiters and are anchored at
	     * the subject start with \A. Some expressions define a named "match"
	     * group next to the "value" group, presumably to tell the base tokenizer
	     * which part of the input to consume -- confirm in
	     * ezcDocumentWikiTokenizer.
	     *
	     * @return void
	     */
	    public function __construct()
	    {
	        $this->tokens = array(
	            // Match tokens which require to be at the start of a line before
	            // matching the actual newlines, because they are the indicator for
	            // line starts.

	            // Title start: newline, one or more "=", then whitespace.
	            array(
	                'class' => 'ezcDocumentWikiTitleToken',
	                'match' => '(\\A\\n(?P<value>=+)' . self::WHITESPACE_CHARS . '+)S' ),
	            // Optional title end: whitespace, one or more "=", then a newline
	            // which is not part of the "match" group.
	            array(
	                'class' => 'ezcDocumentWikiTitleToken',
	                'match' => '(\\A(?P<match>' . self::WHITESPACE_CHARS . '+(?P<value>=+))\\n)S' ),
	            // Bullet list item: newline, optional whitespace, a run of "*" or
	            // "-", then whitespace.
	            array(
	                'class' => 'ezcDocumentWikiBulletListItemToken',
	                'match' => '(\\A\\n' . self::WHITESPACE_CHARS . '*(?P<value>[*-]+)' . self::WHITESPACE_CHARS . '+)S' ),
	            // Enumerated list item: same as above, but using "#".
	            array(
	                'class' => 'ezcDocumentWikiEnumeratedListItemToken',
	                'match' => '(\\A\\n' . self::WHITESPACE_CHARS . '*(?P<value>#+)' . self::WHITESPACE_CHARS . '+)S' ),
	            // Page break: a line consisting of exactly four dashes, optionally
	            // surrounded by whitespace.
	            array(
	                'class' => 'ezcDocumentWikiPageBreakToken',
	                'match' => '(\\A(?P<match>\n' . self::WHITESPACE_CHARS . '*(?P<value>-{4})' . self::WHITESPACE_CHARS . '*)\\n)S' ),
	            // Literal block: "{{{" and "}}}" each on a line of their own. The
	            // U modifier makes ".+" ungreedy, the s modifier lets "." span
	            // multiple lines.
	            array(
	                'class' => 'ezcDocumentWikiLiteralBlockToken',
	                'match' => '(\\A(?P<match>\\n\\{\\{\\{\\n(?P<value>.+)\\n\\}\\}\\})\\n)SUs' ),
	            // Table row: a "|" directly after a newline; only the newline is
	            // part of the "match" group.
	            array(
	                'class' => 'ezcDocumentWikiTableRowToken',
	                'match' => '(\\A(?P<match>\\n)(?P<value>\\|))S' ),
	            // Paragraph indentation: a run of ">" or ":" directly after a
	            // newline, followed by optional whitespace.
	            array(
	                'class' => 'ezcDocumentWikiParagraphIndentationToken',
	                'match' => '(\\A\\n(?P<value>(?:>|:)+)' . self::WHITESPACE_CHARS . '*)S' ),

	            // Whitespaces
	            array(
	                'class' => 'ezcDocumentWikiNewLineToken',
	                'match' => '(\\A' . self::WHITESPACE_CHARS . '*(?P<value>\\r\\n|\\r|\\n))S' ),
	            array(
	                'class' => 'ezcDocumentWikiWhitespaceToken',
	                'match' => '(\\A(?P<value>' . self::WHITESPACE_CHARS . '+))S' ),
	            // A form feed character (0x0c) is matched as the end of file
	            // marker.
	            array(
	                'class' => 'ezcDocumentWikiEndOfFileToken',
	                'match' => '(\\A(?P<value>\\x0c))S' ),

	            // Escape character
	            array(
	                'class' => 'ezcDocumentWikiEscapeCharacterToken',
	                'match' => '(\\A(?P<value>~))S' ),

	            // Inline markup
	            array(
	                'class' => 'ezcDocumentWikiBoldToken',
	                'match' => '(\\A(?P<value>\\*\\*))S' ),
	            array(
	                'class' => 'ezcDocumentWikiItalicToken',
	                'match' => '(\\A(?P<value>//))S' ),
	            array(
	                'class' => 'ezcDocumentWikiMonospaceToken',
	                'match' => '(\\A(?P<value>##))S' ),
	            array(
	                'class' => 'ezcDocumentWikiSuperscriptToken',
	                'match' => '(\\A(?P<value>\\^\\^))S' ),
	            array(
	                'class' => 'ezcDocumentWikiSubscriptToken',
	                'match' => '(\\A(?P<value>,,))S' ),
	            array(
	                'class' => 'ezcDocumentWikiUnderlineToken',
	                'match' => '(\\A(?P<value>__))S' ),
	            // Inline literal: "{{{ ... }}}". The trailing "\}*" keeps
	            // additional closing braces inside the value.
	            array(
	                'class' => 'ezcDocumentWikiInlineLiteralToken',
	                'match' => '(\\A\\{\\{\\{(?P<value>.+?\\}*)\\}\\}\\})Ss' ),
	            // Forced line break: two consecutive backslashes.
	            array(
	                'class' => 'ezcDocumentWikiLineBreakToken',
	                'match' => '(\\A(?P<value>\\\\\\\\))S' ),
	            array(
	                'class' => 'ezcDocumentWikiImageStartToken',
	                'match' => '(\\A(?P<value>\\{\\{))S' ),
	            array(
	                'class' => 'ezcDocumentWikiImageEndToken',
	                'match' => '(\\A(?P<value>\\}\\}))S' ),
	            array(
	                'class' => 'ezcDocumentWikiLinkStartToken',
	                'match' => '(\\A(?P<value>\\[\\[))S' ),
	            array(
	                'class' => 'ezcDocumentWikiLinkEndToken',
	                'match' => '(\\A(?P<value>\\]\\]))S' ),
	            // Table header cell: "|=". Listed before the plain "|" separator
	            // so the longer marker is tried first.
	            array(
	                'class' => 'ezcDocumentWikiTableHeaderToken',
	                'match' => '(\\A(?P<value>\\|=))S' ),
	            // Separator: either a single "|" or "->" with optional
	            // surrounding whitespace.
	            array(
	                'class' => 'ezcDocumentWikiSeparatorToken',
	                'match' => '(\\A(?P<value>\\||' . self::WHITESPACE_CHARS . '*->' . self::WHITESPACE_CHARS . '*))S' ),
	            // Inter wiki link: "Name:" followed by at least two capitalized
	            // word parts.
	            array(
	                'class' => 'ezcDocumentWikiInterWikiLinkToken',
	                'match' => '(\\A(?P<value>([A-Za-z]+):(?:[A-Z][a-z0-9_-]+){2,}))S' ),
	            // Internal link: at least two capitalized word parts (CamelCase).
	            array(
	                'class' => 'ezcDocumentWikiInternalLinkToken',
	                'match' => '(\\A(?P<value>(?:[A-Z][a-z]+){2,}))S' ),
	            // External link: "scheme://..." lazily extended until an optional
	            // trailing punctuation character followed by whitespace, a
	            // newline, "|", "]]" or the end of the input. Punctuation and
	            // terminator are not part of the "match" group.
	            array(
	                'class' => 'ezcDocumentWikiExternalLinkToken',
	                'match' => '(\\A(?P<match>(?P<value>[a-z]+://\S+?))[,.?!:;"\']?(?:' . self::WHITESPACE_CHARS . '|\\n|\\||]]|\\||$))S' ),

	            // Handle plugins: everything between "<<" and ">>", which may
	            // span multiple lines.
	            array(
	                'class' => 'ezcDocumentWikiPluginToken',
	                'match' => '(\\A<<(?P<value>.*?)>>)Ss' ),

	            // Match text up to the next character which may start markup.
	            array(
	                'class' => 'ezcDocumentWikiTextLineToken',
	                'match' => '(\\A(?P<value>[^' . self::TEXT_END_CHARS . ']+))S' ),

	            // Match all special characters, which are not valid textual chars,
	            // but have not been matched by any other expression. A run of the
	            // same character is matched as one token.
	            array(
	                'class' => 'ezcDocumentWikiSpecialCharsToken',
	                'match' => '(\\A(?P<value>([' . self::SPECIAL_CHARS . '])\\2*))S' ),
	        );
	    }

	    /**
	     * Parse plugin contents
	     *
	     * Plugins are totally different in each wiki component and its contents
	     * should not be passed through the normal wiki parser. So we fetch its
	     * contents completely and let each tokenizer extract names and
	     * parameters from the complete token itself.
	     *
	     * The leading run of ASCII letters in the plugin content is stored in
	     * $plugin->type; the type is left untouched if the content does not
	     * start with a letter. Each whitespace separated key="value" or
	     * key='value' pair is stored in $plugin->parameters, where a later
	     * occurrence of a key overwrites an earlier one.
	     *
	     * @param ezcDocumentWikiPluginToken $plugin
	     * @return void
	     */
	    protected function parsePluginContents( ezcDocumentWikiPluginToken $plugin )
	    {
	        // Match name of plugin
	        if ( preg_match( '(^[a-z]+)i', $plugin->content, $match ) )
	        {
	            $plugin->type = $match[0];
	        }

	        // Match plugin parameters. The closing quote (\2) must not be
	        // preceded by a backslash, so that escaped quotes may occur inside of
	        // a value. This has to be a negative look-behind: a look-ahead at
	        // this position would inspect the quote itself and never fail. The
	        // escaping backslash is kept in the extracted value.
	        $parameters = array();
	        if ( preg_match_all( '(\s+(?P<key>[a-zA-Z_-]+)=([\'"])(?P<value>.*?)(?<!\\\\)\\2)s', $plugin->content, $match ) )
	        {
	            foreach ( $match['key'] as $nr => $key )
	            {
	                $parameters[$key] = $match['value'][$nr];
	            }
	        }
	        $plugin->parameters = $parameters;
	    }

	    /**
	     * Filter tokens
	     *
	     * Method to filter tokens, after the input string has been tokenized. The
	     * filter should extract additional information from tokens, which are not
	     * generally available yet, like the depth of a title depending on the
	     * title markup.
	     *
	     * The token objects are modified in place; the returned array contains
	     * the same tokens in the same order.
	     *
	     * @param array $tokens
	     * @return array
	     */
	    protected function filterTokens( array $tokens )
	    {
	        foreach ( $tokens as $token )
	        {
	            // switch ( true ) executes the first case whose instanceof check
	            // succeeds.
	            switch ( true )
	            {
	                // Extract the title / indentation level from the length of
	                // the tokens content, ignoring surrounding whitespace.
	                case $token instanceof ezcDocumentWikiTitleToken:
	                case $token instanceof ezcDocumentWikiParagraphIndentationToken:
	                    $token->level = strlen( trim( $token->content ) );
	                    break;

	                // The list item indentation is the untrimmed length of the
	                // tokens content.
	                case $token instanceof ezcDocumentWikiBulletListItemToken:
	                case $token instanceof ezcDocumentWikiEnumeratedListItemToken:
	                    $token->indentation = strlen( $token->content );
	                    break;

	                // Extract plugin type and parameters from the raw content.
	                case $token instanceof ezcDocumentWikiPluginToken:
	                    $this->parsePluginContents( $token );
	                    break;
	            }
	        }

	        return $tokens;
	    }
	}

	?>