blob: f4428d5d3b9b9b40c82a89e49aabd22f41dc2e50 [file] [log] [blame]
<?php
/**
* File containing the ezcDocumentWikiDokuwikiTokenizer
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
* @package Document
* @version //autogen//
* @license http://www.apache.org/licenses/LICENSE-2.0 Apache License, Version 2.0
*/
/**
* Tokenizer for Dokuwiki wiki documents.
*
* The Dokuwiki wiki is a very popular wiki, which for example is currently
* used at http://wiki.php.net. The Dokuwiki syntax definition can be found at:
*
* http://www.dokuwiki.org/syntax
*
* For the basic workings of the tokenizer see the class level documentation in
* the ezcDocumentWikiTokenizer class.
*
* @package Document
* @version //autogen//
*/
class ezcDocumentWikiDokuwikiTokenizer extends ezcDocumentWikiTokenizer
{
    /**
     * Common whitespace characters. The vertical tab is excluded, because it
     * causes strange problems with PCRE.
     */
    const WHITESPACE_CHARS = '[\\x20\\t]';

    /**
     * Characters ending a pure text section.
     *
     * Used inside a negated character class by the plain text token
     * expression, so the value is a character class body, not a full
     * expression.
     */
    const TEXT_END_CHARS = '/*^,\'_<>\\\\\\[\\]{}()|=\\r\\n\\t\\x20';

    /**
     * Special characters, which do have some special meaning and thus may
     * not have been matched otherwise.
     *
     * Same set as TEXT_END_CHARS, minus the whitespace and line break
     * characters.
     */
    const SPECIAL_CHARS = '/*^,\'_<>\\\\\\[\\]{}()|=';

    /**
     * Construct tokenizer
     *
     * Create the token array, where each entry consists of the token class
     * and the regular expression matching the respective token. All
     * expressions are anchored with \A. Where an expression defines a
     * "match" group in addition to the "value" group, the two capture
     * different spans of the input; how they are consumed is decided by the
     * parent tokenizer.
     *
     * @return void
     */
    public function __construct()
    {
        $this->tokens = array(
            // Match tokens which require to be at the start of a line before
            // matching the actual newlines, because they are the indicator for
            // line starts.

            // Title markup: two to six equal signs, surrounded by newline or
            // whitespace.
            array(
                'class' => 'ezcDocumentWikiTitleToken',
                'match' => '(\\A(?P<match>(?:\\n|' . self::WHITESPACE_CHARS . '+)(?P<value>={2,6}))(?:\\n|' . self::WHITESPACE_CHARS . '+))S' ),
            // Bullet list item: optional spaces followed by "*".
            array(
                'class' => 'ezcDocumentWikiBulletListItemToken',
                'match' => '(\\A\\n(?P<value>\\x20*\\*)' . self::WHITESPACE_CHARS . '+)S' ),
            // Enumerated list item: optional spaces followed by "-".
            array(
                'class' => 'ezcDocumentWikiEnumeratedListItemToken',
                'match' => '(\\A\\n(?P<value>\\x20*-)' . self::WHITESPACE_CHARS . '+)S' ),
            // Literal block enclosed in <code> or <file> tags, each on its
            // own line.
            array(
                'class' => 'ezcDocumentWikiLiteralBlockToken',
                'match' => '(\\A(?P<match>\\n<(code|file)>\\n(?P<value>.+)\\n</\\2>)\\n)SUsi' ),
            // Literal block made of consecutive lines sharing the same
            // leading whitespace (backreference \3).
            array(
                'class' => 'ezcDocumentWikiLiteralBlockToken',
                'match' => '(\\A(?P<match>\\n(?P<value>(' . self::WHITESPACE_CHARS . '+).*\n(?:\\3.*\n)*)))S' ),
            // Block level <nowiki> section, emitted as a plain text token.
            array(
                'class' => 'ezcDocumentWikiTextLineToken',
                'match' => '(\\A(?P<match>\\n<nowiki>\\n(?P<value>.+)\\n</nowiki>)\\n)SUsi' ),
            // Table row: a line starting with "|" or "^".
            array(
                'class' => 'ezcDocumentWikiTableRowToken',
                'match' => '(\\A(?P<match>\\n)(?P<value>[|^]))S' ),
            // Paragraph indentation: a line starting with one or more ">".
            array(
                'class' => 'ezcDocumentWikiParagraphIndentationToken',
                'match' => '(\\A\\n(?P<value>>+)' . self::WHITESPACE_CHARS . '*)S' ),

            // Whitespaces
            array(
                'class' => 'ezcDocumentWikiNewLineToken',
                'match' => '(\\A' . self::WHITESPACE_CHARS . '*(?P<value>\\r\\n|\\r|\\n))S' ),
            array(
                'class' => 'ezcDocumentWikiWhitespaceToken',
                'match' => '(\\A(?P<value>' . self::WHITESPACE_CHARS . '+))S' ),
            // The form feed character (\x0c) is used as the end of file
            // marker.
            array(
                'class' => 'ezcDocumentWikiEndOfFileToken',
                'match' => '(\\A(?P<value>\\x0c))S' ),

            // Escape character (currently disabled)
            /*
            array(
                'class' => 'ezcDocumentWikiEscapeCharacterToken',
                'match' => '(\\A(?P<value>~))S' ),
            // */

            // Inline markup
            array(
                'class' => 'ezcDocumentWikiBoldToken',
                'match' => '(\\A(?P<value>\\*\\*))S' ),
            array(
                'class' => 'ezcDocumentWikiItalicToken',
                'match' => '(\\A(?P<value>//))S' ),
            array(
                'class' => 'ezcDocumentWikiMonospaceToken',
                'match' => '(\\A(?P<value>\'\'))S' ),
            array(
                'class' => 'ezcDocumentWikiSuperscriptToken',
                'match' => '(\\A(?P<value></?sup>))Si' ),
            array(
                'class' => 'ezcDocumentWikiSubscriptToken',
                'match' => '(\\A(?P<value></?sub>))Si' ),
            array(
                'class' => 'ezcDocumentWikiUnderlineToken',
                'match' => '(\\A(?P<value>__))S' ),
            array(
                'class' => 'ezcDocumentWikiDeletedToken',
                'match' => '(\\A(?P<value></?del>))Si' ),
            // Inline <nowiki>...</nowiki>, matched ungreedy.
            array(
                'class' => 'ezcDocumentWikiInlineLiteralToken',
                'match' => '(\\A<nowiki>(?P<value>.*)</nowiki>)SUi' ),
            // Text enclosed in %%...%% is emitted as a plain text token.
            array(
                'class' => 'ezcDocumentWikiTextLineToken',
                'match' => '(\\A%%(?P<value>.*)%%)SUi' ),
            // Forced line break: two backslashes followed by whitespace or a
            // newline.
            array(
                'class' => 'ezcDocumentWikiLineBreakToken',
                'match' => '(\\A(?P<match>(?P<value>\\\\\\\\))(?:' . self::WHITESPACE_CHARS . '|\\n))S' ),
            array(
                'class' => 'ezcDocumentWikiLinkStartToken',
                'match' => '(\\A(?P<value>\\[\\[))S' ),
            array(
                'class' => 'ezcDocumentWikiLinkEndToken',
                'match' => '(\\A(?P<value>\\]\\]))S' ),
            // Separator: either "|" or "->" with optional surrounding
            // whitespace.
            array(
                'class' => 'ezcDocumentWikiSeparatorToken',
                'match' => '(\\A(?P<value>\\||' . self::WHITESPACE_CHARS . '*->' . self::WHITESPACE_CHARS . '*))S' ),
            // The following expression uses the extended (x) modifier and
            // spans several lines; its continuation lines are part of the
            // string literal and are intentionally left unindented.
            array(
                'class' => 'ezcDocumentWikiExternalLinkToken',
                'match' => '(\\A
(?P<match>
(?P<value>
# Match common URLs
[a-z]+://\S+? |
# Match mail addresses enclosed by <>
<[a-z0-9!#$%&\'*+/=?^_`{|}~-]+(?:\\.[a-z0-9!#$%&\'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?>
)
# Greedy match on text end chars, which should NOT be included in URLs
)[,.?!:;"\']?(?:' . self::WHITESPACE_CHARS . '|\\n|\\||]]|\\}\\}|$)
)Sx' ),
            // Inter wiki link: alphabetic wiki name, ">" and the target up to
            // the next "]" or "|".
            array(
                'class' => 'ezcDocumentWikiInterWikiLinkToken',
                'match' => '(\\A(?P<value>([A-Za-z]+)>[^\\]|]+))S' ),
            array(
                'class' => 'ezcDocumentWikiImageStartToken',
                'match' => '(\\A(?P<value>\\{\\{))S' ),
            array(
                'class' => 'ezcDocumentWikiImageEndToken',
                'match' => '(\\A(?P<value>\\}\\}))S' ),
            array(
                'class' => 'ezcDocumentWikiFootnoteStartToken',
                'match' => '(\\A(?P<value>\\(\\())S' ),
            array(
                'class' => 'ezcDocumentWikiFootnoteEndToken',
                'match' => '(\\A(?P<value>\\)\\)))S' ),
            array(
                'class' => 'ezcDocumentWikiTableHeaderToken',
                'match' => '(\\A(?P<value>\\^))S' ),
            // Plugin: any remaining <name ...>...</name> element, including
            // its complete contents.
            array(
                'class' => 'ezcDocumentWikiPluginToken',
                'match' => '(\\A(?P<value><([a-zA-Z]+).*?</\\2>))Ss' ),

            // Match text except the characters ending a text section
            array(
                'class' => 'ezcDocumentWikiTextLineToken',
                'match' => '(\\A(?P<value>[^' . self::TEXT_END_CHARS . ']+))S' ),

            // Match all special characters, which are not valid textual chars,
            // but have not been matched by any other expression. Repetitions
            // of the same character are combined into one token.
            array(
                'class' => 'ezcDocumentWikiSpecialCharsToken',
                'match' => '(\\A(?P<value>([' . self::SPECIAL_CHARS . '])\\2*))S' ),
        );
    }

    /**
     * Parse plugin contents
     *
     * Plugins are totally different in each wiki component and its contents
     * should not be passed through the normal wiki parser. So we fetch its
     * contents completely and let each tokenizer extract names and parameters
     * from the complete token itself.
     *
     * On a successful match the plugin token receives the lowercased tag
     * name as type, the raw parameter string (if any) as the single element
     * of the parameters array, and the enclosed text. The token is left
     * untouched if its content does not look like <name ...>...</name>.
     *
     * @param ezcDocumentWikiPluginToken $plugin
     * @return void
     */
    protected function parsePluginContents( ezcDocumentWikiPluginToken $plugin )
    {
        // Match name of plugin; \1 refers to the "type" group, so the closing
        // tag has to repeat the opening tag name.
        $matched = preg_match(
            '(^\\s*<(?P<type>[a-zA-Z]+)(?:\\s+(?P<params>[^>]+))?>(?P<content>.*?)\\s*</\\1>\\s*)si',
            $plugin->content,
            $match
        );

        if ( !$matched )
        {
            return;
        }

        $plugin->type = strtolower( $match['type'] );

        // An optional group which did not participate in the match is
        // reported as an empty string. Compare against the empty string
        // explicitly, so that a parameter string like "0" is not discarded.
        $hasParameters    = isset( $match['params'] ) && ( $match['params'] !== '' );
        $plugin->parameters = $hasParameters ? array( $match['params'] ) : array();

        $plugin->text = $match['content'];
    }

    /**
     * Filter tokens
     *
     * Method to filter tokens, after the input string has been tokenized. The
     * filter should extract additional information from tokens, which are not
     * generally available yet, like the depth of a title depending on the
     * title markup.
     *
     * Whitespace tokens which only indicate the alignment of an image are
     * removed from the returned array; the remaining tokens keep their
     * original keys, so the returned array may contain gaps.
     *
     * @param array $tokens
     * @return array
     */
    protected function filterTokens( array $tokens )
    {
        $lastImageStartToken = null;

        // Note: foreach iterates over the array as it was when the loop
        // started, while the body removes entries from $tokens. Neighbouring
        // offsets therefore have to be checked for existence before use.
        foreach ( $tokens as $nr => $token )
        {
            switch ( true )
            {
                // Extract the title / indentation level from the tokens
                // length. Six markup characters result in level 1, two in
                // level 5.
                case $token instanceof ezcDocumentWikiTitleToken:
                    $token->level = 7 - strlen( trim( $token->content ) );
                    break;

                case $token instanceof ezcDocumentWikiParagraphIndentationToken:
                    $token->level = strlen( trim( $token->content ) );
                    break;

                case $token instanceof ezcDocumentWikiImageStartToken:
                    $lastImageStartToken = $token;
                    $next = $nr + 1;

                    // Whitespace directly after the image start marker
                    // indicates right alignment (may be turned into "center"
                    // once the end of the image is reached).
                    if ( isset( $tokens[$next] ) &&
                         ( $tokens[$next] instanceof ezcDocumentWikiWhitespaceToken ) )
                    {
                        $token->alignement = 'right';
                        unset( $tokens[$next] );
                        ++$next;
                    }

                    // Extract a trailing size specification like "?200" or
                    // "?200x100" from the token following the image start.
                    if ( isset( $tokens[$next] ) &&
                         preg_match( '(\\?(?P<width>\d+)(?:x(?P<height>\d+))?$)', $tokens[$next]->content, $match ) )
                    {
                        $tokens[$next]->content = substr( $tokens[$next]->content, 0, -strlen( $match[0] ) );
                        $token->width  = (int) $match['width'];
                        $token->height = isset( $match['height'] ) ? (int) $match['height'] : null;
                    }
                    break;

                case $token instanceof ezcDocumentWikiImageEndToken:
                case $token instanceof ezcDocumentWikiSeparatorToken:
                    // Whitespace directly in front of the first separator or
                    // end marker following an image start indicates left
                    // alignment, or centering if whitespace has also been
                    // found after the start marker.
                    $previous = $nr - 1;
                    if ( ( $lastImageStartToken !== null ) &&
                         isset( $tokens[$previous] ) &&
                         ( $tokens[$previous] instanceof ezcDocumentWikiWhitespaceToken ) )
                    {
                        $lastImageStartToken->alignement = $lastImageStartToken->alignement === 'right' ? 'center' : 'left';
                        unset( $tokens[$previous] );
                    }

                    // Only the first separator / end marker after an image
                    // start is relevant for the alignment.
                    $lastImageStartToken = null;
                    break;

                case $token instanceof ezcDocumentWikiBulletListItemToken:
                case $token instanceof ezcDocumentWikiEnumeratedListItemToken:
                    // The number of spaces in the list marker defines the
                    // indentation of the list item.
                    $token->indentation = substr_count( $token->content, ' ' );
                    break;

                case $token instanceof ezcDocumentWikiPluginToken:
                    $this->parsePluginContents( $token );
                    break;
            }
        }

        return $tokens;
    }
}
?>