blob: caf4ae7100841803c9fb346d82b614253899e137 [file] [log] [blame]
<?php
/**
* File containing the ezcDocumentWikiMediawikiTokenizer
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
* @package Document
* @version //autogen//
* @license http://www.apache.org/licenses/LICENSE-2.0 Apache License, Version 2.0
* @access private
*/
/**
* Tokenizer for Mediawiki wiki documents.
*
* Mediawiki is probably the most popular wiki, and the driving force behind
* Wikipedia. The markup has a lot of extensions, but the basics are defined at:
*
* http://www.mediawiki.org/wiki/Markup_spec
*
* @package Document
* @version //autogen//
* @access private
*/
class ezcDocumentWikiMediawikiTokenizer extends ezcDocumentWikiTokenizer
{
    /**
     * Common whitespace characters. The vertical tab is excluded, because it
     * causes strange problems with PCRE.
     */
    const WHITESPACE_CHARS = '[\\x20\\t]';

    /**
     * Regular sub expression to match newlines.
     */
    const NEW_LINE = '(?:\\r\\n|\\r|\\n)';

    /**
     * Characters ending a pure text section.
     *
     * Used as the content of a negated character class by the plain text
     * token expression.
     */
    const TEXT_END_CHARS = '/*^,\'_<\\\\\\[\\]{}()|=\\r\\n\\t\\x20';

    /**
     * Special characters, which do have some special meaning and therefore
     * may not have been matched otherwise.
     *
     * Used as the content of a character class by the fallback expression at
     * the end of the token list.
     */
    const SPECIAL_CHARS = '/*^,\'_<>\\\\\\[\\]{}()|=';

    /**
     * Construct tokenizer
     *
     * Create the token array, where each entry associates a token class
     * name ('class') with the regular expression matching the respective
     * token ('match'). All expressions are anchored at the start of the
     * remaining input (\A). The order of the entries is significant: more
     * specific expressions are listed before more general ones.
     *
     * @return void
     */
    public function __construct()
    {
        $this->tokens = array(
            /*
            // Match tokens which require to be at the start of a line before
            // matching the actual newlines, because they are the indicator for
            // line starts.
            array(
            'class' => 'ezcDocumentWikiTitleToken',
            'match' => '(\\A\\n(?P<value>=+)' . self::WHITESPACE_CHARS . '+)S' ),
            array(
            'class' => 'ezcDocumentWikiTitleToken',
            'match' => '(\\A(?P<match>' . self::WHITESPACE_CHARS . '+(?P<value>=+))\\n)S' ),
            */
            // List items and literal lines are only recognized directly
            // after a newline, so they are listed before the newline token.
            array(
                'class' => 'ezcDocumentWikiBulletListItemToken',
                'match' => '(\\A\\n(?P<value>[:*#]*\\*)' . self::WHITESPACE_CHARS . '*)S' ),
            array(
                'class' => 'ezcDocumentWikiEnumeratedListItemToken',
                'match' => '(\\A\\n(?P<value>[:*#]*#)' . self::WHITESPACE_CHARS . '*)S' ),
            array(
                'class' => 'ezcDocumentWikiDefinitionListItemToken',
                'match' => '(\\A\\n(?P<value>[:*#]*:)' . self::WHITESPACE_CHARS . '*)S' ),
            array(
                'class' => 'ezcDocumentWikiLiteralLineToken',
                'match' => '(\\A\\n(?P<value>' . self::WHITESPACE_CHARS . '))SUs' ),
            /*
            array(
            'class' => 'ezcDocumentWikiPageBreakToken',
            'match' => '(\\A(?P<match>\n' . self::WHITESPACE_CHARS . '*(?P<value>-{4})' . self::WHITESPACE_CHARS . '*)\\n)S' ),
            array(
            'class' => 'ezcDocumentWikiTableRowToken',
            'match' => '(\\A(?P<match>\\n)(?P<value>\\|))S' ),
            */

            // Whitespaces
            array(
                'class' => 'ezcDocumentWikiNewLineToken',
                'match' => '(\\A' . self::WHITESPACE_CHARS . '*(?P<value>\\r\\n|\\r|\\n))S' ),
            array(
                'class' => 'ezcDocumentWikiWhitespaceToken',
                'match' => '(\\A(?P<value>' . self::WHITESPACE_CHARS . '+))S' ),
            array(
                'class' => 'ezcDocumentWikiEndOfFileToken',
                'match' => '(\\A(?P<value>\\x0c))S' ),
            /*
            // Escape character
            array(
            'class' => 'ezcDocumentWikiEscapeCharacterToken',
            'match' => '(\\A(?P<value>~))S' ),
            */

            // Inline markup
            array(
                'class' => 'ezcDocumentWikiMediawikiEmphasisToken',
                'match' => '(\\A(?P<value>\'{2,}))S' ),
            // Everything between <nowiki> tags is handled as plain text.
            array(
                'class' => 'ezcDocumentWikiTextLineToken',
                'match' => '(\\A(?P<match><nowiki>(?P<value>.+)</nowiki>))SUsi' ),
            /*
            array(
            'class' => 'ezcDocumentWikiMonospaceToken',
            'match' => '(\\A(?P<value>##))S' ),
            array(
            'class' => 'ezcDocumentWikiSuperscriptToken',
            'match' => '(\\A(?P<value>\\^\\^))S' ),
            array(
            'class' => 'ezcDocumentWikiSubscriptToken',
            'match' => '(\\A(?P<value>,,))S' ),
            array(
            'class' => 'ezcDocumentWikiUnderlineToken',
            'match' => '(\\A(?P<value>__))S' ),
            array(
            'class' => 'ezcDocumentWikiInlineLiteralToken',
            'match' => '(\\A\\{\\{\\{(?P<value>.+?\\}*)\\}\\}\\})Ss' ),
            array(
            'class' => 'ezcDocumentWikiLineBreakToken',
            'match' => '(\\A(?P<value>\\\\\\\\))S' ),
            array(
            'class' => 'ezcDocumentWikiImageStartToken',
            'match' => '(\\A(?P<value>\\{\\{))S' ),
            array(
            'class' => 'ezcDocumentWikiImageEndToken',
            'match' => '(\\A(?P<value>\\}\\}))S' ),
            */
            array(
                'class' => 'ezcDocumentWikiLinkStartToken',
                'match' => '(\\A(?P<value>\\[\\[))S' ),
            array(
                'class' => 'ezcDocumentWikiLinkEndToken',
                'match' => '(\\A(?P<value>\\]\\]))S' ),
            /*
            array(
            'class' => 'ezcDocumentWikiTableHeaderToken',
            'match' => '(\\A(?P<value>\\|=))S' ),
            */
            array(
                'class' => 'ezcDocumentWikiSeparatorToken',
                'match' => '(\\A(?P<value>\\|))S' ),
            /*
            array(
            'class' => 'ezcDocumentWikiInterWikiLinkToken',
            'match' => '(\\A(?P<value>([A-Za-z]+):(?:[A-Z][a-z0-9_-]+){2,}))S' ),
            array(
            'class' => 'ezcDocumentWikiInternalLinkToken',
            'match' => '(\\A(?P<value>(?:[A-Z][a-z]+){2,}))S' ),
            array(
            'class' => 'ezcDocumentWikiExternalLinkToken',
            'match' => '(\\A(?P<match>(?P<value>[a-z]+://\S+?))[,.?!:;"\']?(?:' . self::WHITESPACE_CHARS . '|\\n|\\||]]|\\||$))S' ),
            // Handle plugins
            array(
            'class' => 'ezcDocumentWikiPluginToken',
            'match' => '(\\A<<(?P<value>.*?)>>)Ss' ),
            */

            // Match text except
            array(
                'class' => 'ezcDocumentWikiTextLineToken',
                'match' => '(\\A(?P<value>[^' . self::TEXT_END_CHARS . ']+))S' ),

            // Match all special characters, which are not valid textual
            // chars, but have not been matched by any other expression. A
            // run of the same character is put into one token. Without this
            // fallback no expression matches input like a single "'", "=",
            // "/" or "<", since the text expression above stops at them.
            // NOTE(review): relies on ezcDocumentWikiSpecialCharsToken being
            // provided by the Document package - confirm.
            array(
                'class' => 'ezcDocumentWikiSpecialCharsToken',
                'match' => '(\\A(?P<value>([' . self::SPECIAL_CHARS . '])\\2*))S' ),
        );
    }

    /**
     * Tokenize the given string
     *
     * The method tries to tokenize the passed strings and returns an array of
     * ezcDocumentWikiToken struct on success, or throws a
     * ezcDocumentTokenizerException, if something could not be matched by any
     * token.
     *
     * HTML style comments (<!-- ... -->) are removed from the string, together
     * with one directly preceding newline, before the actual tokenization is
     * delegated to the parent implementation. A comment which is not closed
     * extends to the end of the string.
     *
     * @param string $string
     * @return array
     */
    public function tokenizeString( $string )
    {
        // Remove all comments, since they are ignored anyways and make some
        // checks a lot harder.
        $stripped = preg_replace( '(' . self::NEW_LINE . '?<!--.*?(?:-->|\\Z))Ss', '', $string );

        // preg_replace() returns null if PCRE fails, for example because an
        // internal limit has been reached. Tokenize the unmodified input in
        // this case, instead of passing null to the parent implementation.
        if ( $stripped === null )
        {
            $stripped = $string;
        }

        return parent::tokenizeString( $stripped );
    }

    /**
     * Filter tokens
     *
     * Method to filter tokens, after the input string has been tokenized. The
     * filter should extract additional information from tokens, which are not
     * generally available yet, like the depth of a title depending on the
     * title markup.
     *
     * For title and paragraph indentation tokens the level is set to the
     * length of the trimmed token content. The token objects are modified in
     * place; the returned array contains the same tokens in the same order.
     *
     * @param array $tokens
     * @return array
     */
    protected function filterTokens( array $tokens )
    {
        foreach ( $tokens as $token )
        {
            // Extract the title / indentation level from the tokens
            // length.
            if ( ( $token instanceof ezcDocumentWikiTitleToken ) ||
                 ( $token instanceof ezcDocumentWikiParagraphIndentationToken ) )
            {
                $token->level = strlen( trim( $token->content ) );
            }
        }

        return $tokens;
    }
}
?>