| <?php |
| /** |
| * File containing the ezcDocumentBBCodeTokenizer |
| * |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| * |
| * @package Document |
| * @version //autogen// |
| * @license http://www.apache.org/licenses/LICENSE-2.0 Apache License, Version 2.0 |
| */ |
| |
/**
 * Tokenizer for bbcode documents
 *
 * The tokenizer used for all bbcode documents should prepare a token array,
 * which can be used by the bbcode parser, without any bbcode language specific
 * handling in the parser itself required.
 *
 * Token extraction
 * ----------------
 *
 * For the token extraction the regular expressions in the $tokens property are
 * used. The $tokens array has to be built like this, and can be created in the
 * constructor:
 *
 * <code>
 *  array(
 *      array(
 *          'class' => Class name of token,
 *          'match' => Regular expression to match,
 *      ),
 *      ...
 *  )
 * </code>
 *
 * The array is evaluated in the given order, until one of the regular
 * expressions matches. The regular expression should have at least one named
 * match (?P<value> ... ), with the name "value", which will be assigned to the
 * token, created from the given class name, as its content. The matched
 * contents will be removed from the beginning of the string.
 *
 * Optionally a second named match, called "match", may be used inside the
 * regular expression. If so, only the contents inside this match will be
 * removed from the beginning of the string. This enables you to perform a
 * trivial lookahead inside the tokenizer.
 *
 * An expression which matches without consuming any input is handled as if it
 * did not match at all, since the tokenizer could not advance otherwise.
 *
 * If no expression matches, an exception will be thrown.
 *
 * @package Document
 * @version //autogen//
 */
class ezcDocumentBBCodeTokenizer
{
    /**
     * List with tokens and a regular expression matching the given token.
     *
     * The tokens are matched in the given order.
     *
     * @var array
     */
    protected $tokens = array();

    /**
     * Common whitespace characters. The vertical tab is excluded, because it
     * causes strange problems with PCRE.
     */
    const WHITESPACE_CHARS = '[\\x20\\t]';

    /**
     * Characters ending a pure text section.
     */
    const TEXT_END_CHARS = '\\[\\]\\r\\n';

    /**
     * Special characters, which do have some special meaning and thus might
     * not have been matched otherwise.
     */
    const SPECIAL_CHARS = '\\[\\]';

    /**
     * Construct tokenizer
     *
     * Create token array with regular expressions matching the respective
     * token.
     *
     * @return void
     */
    public function __construct()
    {
        $this->tokens = array(
            // Match tokens which require to be at the start of a line before
            // matching the actual newlines, because they are the indicator for
            // line starts.
            array(
                'class' => 'ezcDocumentBBCodeLiteralBlockToken',
                'match' => '(\\A(?P<match>\\[code(?:=[^\\]]+)?\\](?P<value>.+)\\[/code\\]))SUs' ),
            array(
                'class' => 'ezcDocumentBBCodeListItemToken',
                'match' => '(\\A(?P<match>\\[\\*\\]))SUs' ),
            array(
                'class' => 'ezcDocumentBBCodeTagOpenToken',
                'match' => '(\\A(?P<match>\\[(?P<value>[A-Za-z]+(?:=[^\\]]+)?)\\]))SUs' ),
            array(
                'class' => 'ezcDocumentBBCodeTagCloseToken',
                'match' => '(\\A(?P<match>\\[/(?P<value>[A-Za-z]+)\\]))SUs' ),

            // Whitespaces
            array(
                'class' => 'ezcDocumentBBCodeNewLineToken',
                'match' => '(\\A' . self::WHITESPACE_CHARS . '*(?P<value>\\r\\n|\\r|\\n))S' ),
            array(
                'class' => 'ezcDocumentBBCodeWhitespaceToken',
                'match' => '(\\A(?P<value>' . self::WHITESPACE_CHARS . '+))S' ),
            array(
                'class' => 'ezcDocumentBBCodeEndOfFileToken',
                'match' => '(\\A(?P<value>\\x0c))S' ),

            // Escape character
            array(
                'class' => 'ezcDocumentBBCodeEscapeCharacterToken',
                'match' => '(\\A(?P<value>~))S' ),

            // Match text up to the next character ending a text section
            array(
                'class' => 'ezcDocumentBBCodeTextLineToken',
                'match' => '(\\A(?P<value>[^' . self::TEXT_END_CHARS . ']+))S' ),

            // Match all special characters, which are not valid textual chars,
            // but have not been matched by any other expression.
            array(
                'class' => 'ezcDocumentBBCodeSpecialCharsToken',
                'match' => '(\\A(?P<value>([' . self::SPECIAL_CHARS . '])\\2*))S' ),
        );
    }

    /**
     * Tokenize the given file
     *
     * The method tries to tokenize the passed file and returns an array of
     * ezcDocumentBBCodeToken structs on success, or throws an
     * ezcDocumentBBCodeTokenizerException, if something could not be matched
     * by any token.
     *
     * @throws ezcBaseFileNotFoundException
     *         If the file does not exist, is not readable, or could not be
     *         read.
     *
     * @param string $file
     * @return array
     */
    public function tokenizeFile( $file )
    {
        if ( !file_exists( $file ) || !is_readable( $file ) )
        {
            throw new ezcBaseFileNotFoundException( $file );
        }

        $contents = file_get_contents( $file );
        if ( $contents === false )
        {
            // Reading may still fail after the checks above succeeded. Do not
            // hand the boolean on to the string tokenizer in that case.
            throw new ezcBaseFileNotFoundException( $file );
        }

        return $this->tokenizeString( $contents );
    }

    /**
     * Convert tabs to spaces
     *
     * Convert all tabs to spaces, as defined in:
     * http://docutils.sourceforge.net/docs/ref/rst/restructuredtext.html#whitespace
     *
     * The content of the passed token is modified in place. Each tab is
     * replaced by as many spaces as are required to reach the next tab stop,
     * with tab stops every 8 columns, based on the position of the token.
     *
     * @param ezcDocumentBBCodeToken $token
     * @return void
     */
    protected function convertTabs( ezcDocumentBBCodeToken $token )
    {
        while ( ( $offset = strpos( $token->content, "\t" ) ) !== false )
        {
            // Column of the tab character. The tokenizer counts positions
            // starting with 1.
            $column = $token->position + $offset;

            // Tab stops are located at the columns 1, 9, 17, ..., so that a
            // tab always expands to 1 up to 8 spaces. Adding 7 instead of
            // subtracting 1 keeps the modulo operand non-negative.
            $width = 8 - ( ( $column + 7 ) % 8 );

            $token->content =
                substr( $token->content, 0, $offset ) .
                str_repeat( ' ', $width ) .
                substr( $token->content, $offset + 1 );
        }
    }

    /**
     * Tokenize the given string
     *
     * The method tries to tokenize the passed string and returns an array of
     * ezcDocumentBBCodeToken structs on success, or throws an
     * ezcDocumentBBCodeTokenizerException, if something could not be matched
     * by any token.
     *
     * The returned token list always ends with two newline tokens followed by
     * an end of file token.
     *
     * @throws ezcDocumentBBCodeTokenizerException
     *         If none of the token expressions matches the remaining input.
     *
     * @param string $string
     * @return array
     */
    public function tokenizeString( $string )
    {
        $line     = 1;
        $position = 1;
        $tokens   = array();

        // Normalize newlines, and strip whitespace in front of them.
        $string = preg_replace( '([\x20\\t]*(?:\\r\\n|\\r|\\n))', "\n", $string );

        while ( strlen( $string ) > 0 )
        {
            foreach ( $this->tokens as $definition )
            {
                if ( !preg_match( $definition['match'], $string, $matches ) )
                {
                    continue;
                }

                // Only the contents of the named group "match" are consumed,
                // if it exists, otherwise everything the expression matched.
                $consumed = isset( $matches['match'] ) ? $matches['match'] : $matches[0];

                // A match which does not consume anything would never
                // shorten the input string, so handle it like no match.
                if ( $consumed === '' )
                {
                    continue;
                }

                // If the first part of the match is a newline, add a
                // respective token to the stack.
                if ( ( $matches[0][0] === "\n" ) &&
                     ( $definition['class'] !== 'ezcDocumentBBCodeNewLineToken' ) )
                {
                    $tokens[] = new ezcDocumentBBCodeNewLineToken( $matches[0][0], $line, $position );
                    ++$line;
                    $position = 0;
                }

                // A token matched, so create the respective token and update
                // all variables.
                $class    = $definition['class'];
                $newToken = new $class(
                    ( isset( $matches['value'] ) ? $matches['value'] : null ),
                    $line,
                    $position
                );

                // Remove matched stuff from input string
                $string = substr( $string, strlen( $consumed ) );

                // On a newline token reset the line position and increase the
                // line value
                if ( $newToken instanceof ezcDocumentBBCodeNewLineToken )
                {
                    ++$line;
                    $position = 0;
                }
                elseif ( ( $newLines = substr_count( $consumed, "\n" ) ) > 0 )
                {
                    // Otherwise still update the line value, when there is at
                    // minimum one newline in the match. This may lead to a
                    // false position value.
                    $line    += $newLines;
                    $position = 0;
                }

                // Convert tabs to spaces for whitespace tokens
                if ( $newToken instanceof ezcDocumentBBCodeWhitespaceToken )
                {
                    $this->convertTabs( $newToken );
                }

                // If we found an explicit EOF token, just exit the parsing
                // process.
                if ( $newToken instanceof ezcDocumentBBCodeEndOfFileToken )
                {
                    break 2;
                }

                // Add token to extracted token list
                $tokens[] = $newToken;

                // Update position, not before converting tabs to spaces. The
                // content is null for tokens without a "value" match, so it
                // is casted, because strlen() does not accept null any more.
                $position += ( $newToken instanceof ezcDocumentBBCodeNewLineToken )
                    ? 1
                    : strlen( (string) $newToken->content );

                // Restart the while loop, because we matched a token and can
                // retry with the shortened string.
                continue 2;
            }

            // None of the token definitions matched the input string. We throw
            // an exception with the position of the content in the input
            // string and the contents we could not match.
            //
            // This should never be thrown, but it is hard to prove that
            // there is nothing which is not matched by the regular expressions
            // above.
            throw new ezcDocumentBBCodeTokenizerException(
                $line,
                $position,
                $string
            );
        }

        // Finally append two more newline tokens and an end of file token, to
        // make parsing the end easier.
        $tokens[] = new ezcDocumentBBCodeNewLineToken( "\n", $line, $position );
        $tokens[] = new ezcDocumentBBCodeNewLineToken( "\n", $line, $position );
        $tokens[] = new ezcDocumentBBCodeEndOfFileToken( null, $line, $position );
        return $tokens;
    }
}
| |
| ?> |