blob: 75de199a5a0cda009e6df984b718fa51a4f9321a [file] [log] [blame]
<?php
/**
* File containing the ezcDocumentRstTokenizer
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
* @package Document
* @version //autogen//
* @copyright Copyright (C) 2005-2010 eZ Systems AS. All rights reserved.
* @license http://www.apache.org/licenses/LICENSE-2.0 Apache License, Version 2.0
*/
/**
 * Tokenizer for RST documents.
 *
 * Splits an RST string into a flat list of ezcDocumentRstToken structs by
 * repeatedly matching an ordered set of regular expressions against the
 * beginning of the remaining input.
 *
 * @package Document
 * @version //autogen//
 */
class ezcDocumentRstTokenizer
{
    /**
     * Common whitespace characters. The vertical tab is excluded, because it
     * causes strange problems with PCRE.
     *
     * The value is embedded into a PCRE character class.
     */
    const WHITESPACE_CHARS = ' \\t';

    /**
     * Allowed character sets for headlines.
     *
     * The value is embedded into a PCRE character class.
     *
     * @see http://docutils.sourceforge.net/docs/ref/rst/restructuredtext.html#sections
     */
    const SPECIAL_CHARS = '!"#$%&\'()*+,./:;<=>?@[\\]^_`{|}~-';

    /**
     * Characters ending a pure text section.
     *
     * The value is embedded into a negated PCRE character class.
     *
     * @see http://docutils.sourceforge.net/docs/ref/rst/restructuredtext.html#enumerated-lists
     */
    const TEXT_END_CHARS = '`*_\\\\[\\]|()"\':.\\r\\n\\t ';

    /**
     * List with tokens and a regular expression matching the given token.
     *
     * Keys are ezcDocumentRstToken type constants, values are the anchored
     * regular expressions. The tokens are matched in the given order.
     *
     * @var array
     */
    protected $tokens = array();

    /**
     * Construct tokenizer
     *
     * Create token array with regular expressions matching the respective
     * token. Every expression is anchored at the start of the subject (\A)
     * and exposes the token content in the named group "value".
     *
     * @return void
     */
    public function __construct()
    {
        $this->tokens = array(
            // Whitespaces. A newline swallows the whitespace directly in
            // front of it; only the line break itself becomes the value.
            ezcDocumentRstToken::NEWLINE =>
                '(\\A[' . self::WHITESPACE_CHARS . ']*(?P<value>\\r\\n|\\r|\\n))S',
            ezcDocumentRstToken::WHITESPACE =>
                '(\\A(?P<value>[' . self::WHITESPACE_CHARS . ']+))S',

            // Sequences of special characters: one special character (or
            // one of the UTF-8 encoded bullets U+2022, U+2023, U+2043)
            // followed by any number of repetitions of that same character.
            ezcDocumentRstToken::SPECIAL_CHARS =>
                '(\\A(?P<value>([' . self::SPECIAL_CHARS . ']|\\xe2\\x80\\xa2|\\xe2\\x80\\xa3|\\xe2\\x81\\x83)\\2*))S',

            // A single literal backslash
            ezcDocumentRstToken::BACKSLASH =>
                '(\\A(?P<value>\\\\))S',

            // Explicit end-of-file marker; tokenizeString() stops reading
            // input when it matches.
            // NOTE(review): the marker reads as a plain space here, which the
            // WHITESPACE expression above would always consume first --
            // confirm the intended character.
            ezcDocumentRstToken::EOF =>
                '(\\A(?P<value> ))S',

            // This should be last match: runs of characters which do not
            // end a text section, where a single space is accepted as long
            // as it is directly followed by such a character.
            ezcDocumentRstToken::TEXT_LINE =>
                '(\\A(?P<value>(?: [^' . self::TEXT_END_CHARS . ']|[^' . self::TEXT_END_CHARS . '])+))S',
        );
    }

    /**
     * Tokenize the given file
     *
     * The method tries to tokenize the passed file and returns an array of
     * ezcDocumentRstToken structs on success, or throws an exception, if
     * something could not be matched by any token.
     *
     * @throws ezcBaseFileNotFoundException
     *         If the file does not exist, is not readable, or could not be
     *         read.
     * @throws ezcDocumentRstTokenizerException
     *         If a part of the file contents is not matched by any token.
     *
     * @param string $file
     * @return array
     */
    public function tokenizeFile( $file )
    {
        if ( !file_exists( $file ) || !is_readable( $file ) )
        {
            throw new ezcBaseFileNotFoundException( $file );
        }

        // The file may still vanish or fail to read between the check above
        // and the actual read. Do not silently tokenize "false" as an empty
        // document in that case.
        $contents = file_get_contents( $file );
        if ( $contents === false )
        {
            throw new ezcBaseFileNotFoundException( $file );
        }

        return $this->tokenizeString( $contents );
    }

    /**
     * Convert tabs to spaces
     *
     * Convert all tabs to spaces, as defined in:
     * http://docutils.sourceforge.net/docs/ref/rst/restructuredtext.html#whitespace
     *
     * Each tab is replaced in place by as many spaces as are required to
     * reach the next tab stop (every 8 columns), taking the position of the
     * token in its line into account.
     *
     * @param ezcDocumentRstToken $token
     * @return void
     */
    protected function convertTabs( ezcDocumentRstToken $token )
    {
        while ( ( $position = strpos( $token->content, "\t" ) ) !== false )
        {
            // Token positions are 1-based, so the zero-based column of the
            // tab is ( offset in token + token position - 1 ). A tab always
            // expands to between 1 and 8 spaces.
            $column = $position + $token->position - 1;

            $token->content =
                substr( $token->content, 0, $position ) .
                str_repeat( ' ', 8 - ( $column % 8 ) ) .
                substr( $token->content, $position + 1 );
        }
    }

    /**
     * Tokenize the given string
     *
     * The method tries to tokenize the passed string and returns an array of
     * ezcDocumentRstToken structs on success, or throws an
     * ezcDocumentRstTokenizerException, if something could not be matched by
     * any token.
     *
     * The returned list always ends with two newline tokens and one end of
     * file token.
     *
     * @throws ezcDocumentRstTokenizerException
     *         If a part of the string is not matched by any token.
     *
     * @param string $string
     * @return array
     */
    public function tokenizeString( $string )
    {
        $line     = 1;
        $position = 1;
        $tokens   = array();

        while ( strlen( $string ) > 0 )
        {
            foreach ( $this->tokens as $token => $expression )
            {
                if ( !preg_match( $expression, $string, $matches ) )
                {
                    continue;
                }

                // If we found an explicit EOF token, just exit the parsing
                // process. The token itself is not added; a final EOF token
                // is appended below.
                if ( $token === ezcDocumentRstToken::EOF )
                {
                    break 2;
                }

                // A token matched, so add the matched token to the token
                // list and update all variables.
                $newToken = new ezcDocumentRstToken(
                    $token,
                    ( isset( $matches['value'] ) ? $matches['value'] : null ),
                    $line,
                    $position
                );

                // Remove matched stuff from input string
                $string = substr( $string, strlen( $matches[0] ) );

                // Convert tabs to spaces for whitespace tokens
                if ( $token === ezcDocumentRstToken::WHITESPACE )
                {
                    $this->convertTabs( $newToken );
                }

                // Add token to extracted token list
                $tokens[] = $newToken;

                if ( $token === ezcDocumentRstToken::NEWLINE )
                {
                    // On a newline token increase the line value and start
                    // again at the first position of the line.
                    ++$line;
                    $position = 1;
                }
                else
                {
                    // Update position, not before converting tabs to
                    // spaces.
                    $position += strlen( $newToken->content );
                }

                // Restart the while loop, because we matched a token and
                // can retry with shortened string.
                continue 2;
            }

            // None of the token definitions matched the input string. We throw
            // an exception with the position of the content in the input
            // string and the contents we could not match.
            //
            // This should never be thrown, but it is hard to prove that
            // there is nothing which is not matched by the regular expressions
            // above.
            throw new ezcDocumentRstTokenizerException(
                $line,
                $position,
                $string
            );
        }

        // Finally append two more newline tokens and an end of file token, to
        // make parsing the end easier.
        $tokens[] = new ezcDocumentRstToken(
            ezcDocumentRstToken::NEWLINE,
            "\n", $line, $position
        );
        $tokens[] = new ezcDocumentRstToken(
            ezcDocumentRstToken::NEWLINE,
            "\n", $line, $position
        );
        $tokens[] = new ezcDocumentRstToken(
            ezcDocumentRstToken::EOF,
            null, $line, $position
        );

        return $tokens;
    }
}
?>