blob: 48756ef5de1169fa25ecf221a4f85fb08861c047 [file] [log] [blame]
<?php
/**
* File containing the ezcDocumentBBCodeParser class.
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
* @package Document
* @version //autogen//
* @license http://www.apache.org/licenses/LICENSE-2.0 Apache License, Version 2.0
*/
/**
* Parser for bbcode documents.
*
* @package Document
* @version //autogen//
*/
class ezcDocumentBBCodeParser extends ezcDocumentParser
{
/**
* Array containing simplified shift ruleset.
*
* We cannot express the BBCode syntax as a usual grammar using a BNF. With
* the pumping lemma for context free languages [1] you can easily prove,
* that the language a^n b c^n d e^n is not context free, and this is
* what the title definitions are.
*
* NOTE(review): nothing in this class handles "title definitions"; the
* paragraph above may have been carried over from another parser - confirm.
*
* This structure maps a token class name to the name of the method
* implementing the shift rule for tokens of that class. parse() walks the
* map in declaration order and calls the method of every entry the token
* is an instance of, until one of them returns something other than false.
*
* The callbacks themselves create syntax nodes and return them. parse()
* then runs the reduction callbacks for the returned node and prepends
* whatever node is left to the document stack.
*
* The array should look like:
* <code>
* array(
* 'ezcDocumentBBCodeWhitespaceToken'
* => 'shiftWhitespaceToken',
* ...
* )
* </code>
*
* [1] http://en.wikipedia.org/wiki/Pumping_lemma_for_context-free_languages
*
* @var array
*/
protected $shifts = array(
'ezcDocumentBBCodeTagOpenToken'
=> 'shiftOpeningToken',
'ezcDocumentBBCodeTagCloseToken'
=> 'shiftClosingToken',
'ezcDocumentBBCodeListItemToken'
=> 'shiftListItemToken',
'ezcDocumentBBCodeWhitespaceToken'
=> 'shiftWhitespaceToken',
'ezcDocumentBBCodeTextLineToken'
=> 'shiftTextToken',
'ezcDocumentBBCodeLiteralBlockToken'
=> 'shiftLiteralBlockToken',
'ezcDocumentBBCodeNewLineToken'
=> 'shiftNewLineToken',
'ezcDocumentBBCodeEndOfFileToken'
=> 'shiftEndOfFileToken',
);
/**
* Array containing simplified reduce ruleset.
*
* We cannot express the BBCode syntax as a usual grammar using a BNF. This
* structure implements a pseudo grammar by assigning a number of callbacks
* for internal methods implementing reduction rules for a detected syntax
* element.
*
* Keys are node class names, values are lists of method names. parse()
* compares a freshly shifted node against every key using instanceof and
* calls the listed methods in order. A method returning null ends the
* reduction of that node.
*
* <code>
* array(
* 'ezcDocumentBBCodeDocumentNode' => array(
* 'reduceDocument',
* ),
* ...
* )
* </code>
*
* @var array
*/
protected $reductions = array(
'ezcDocumentBBCodeClosingTagNode' => array(
'reduceTags',
),
'ezcDocumentBBCodeParagraphNode' => array(
'reduceParagraph',
),
'ezcDocumentBBCodeDocumentNode' => array(
'reduceDocument',
),
'ezcDocumentBBCodeListItemNode' => array(
'reduceListItem',
),
'ezcDocumentBBCodeListEndNode' => array(
'reduceList',
),
);
/**
* Contains a list of detected syntax elements.
*
* At the end of a successful parsing process this should only contain one
* document syntax element. During the process it may contain a list of
* elements, which are still awaiting reduction.
*
* New nodes are prepended, so index 0 always holds the most recently
* shifted node and the reduce methods consume the stack from the front.
*
* Each element in the stack is expected to be a node object (presumably
* extending ezcDocumentBBCodeNode - confirm against the node classes),
* which may again contain any amount of such objects. This way an abstract
* syntax tree is constructed.
*
* @var array
*/
protected $documentStack = array();
/**
 * Parse token stream.
 *
 * Parse an array of ezcDocumentBBCodeToken objects into a bbcode abstract
 * syntax tree.
 *
 * Tokens are consumed front to back. Each token is handed to the shift
 * methods registered for its class in $shifts until one of them returns
 * something other than false: null means the token is ignored, any other
 * value is the shifted node. The node is then run through the reductions
 * registered in $reductions, and whatever node remains afterwards is
 * prepended to $documentStack.
 *
 * The document stack is not cleared by this method, so nodes left on it by
 * an earlier call are still present when the next call starts.
 *
 * NOTE(review): triggerError() is inherited and not visible here. Every
 * call in this class passes ( level, message, line, position ); confirm
 * that this matches the inherited parameter order.
 *
 * @param array $tokens
 * @return ezcDocumentBBCodeDocumentNode The document node on success. If
 *         an error is reported and triggerError() returns instead of
 *         throwing, its return value is passed through.
 */
public function parse( array $tokens )
{
    /* DEBUG
    echo "\n\nStart parser\n============\n\n";
    // /DEBUG */

    while ( ( $token = array_shift( $tokens ) ) !== null )
    {
        /* DEBUG
        echo "[T] ({$token->line}:{$token->position}) Token: " . get_class( $token ) . " at {$token->line}:{$token->position}.\n";
        // /DEBUG */

        // First shift given token by the defined shift methods
        $node = false;
        foreach ( $this->shifts as $class => $method )
        {
            if ( $token instanceof $class )
            {
                /* DEBUG
                echo " - Handle token with ->$method\n";
                // /DEBUG */

                // Try to shift the token with current method
                if ( ( $node = $this->$method( $token, $tokens ) ) !== false )
                {
                    break;
                }
            }
        }

        // If the node is still false there was no matching shift rule.
        if ( $node === false )
        {
            return $this->triggerError( E_PARSE,
                "Could not find shift rule for token '" . get_class( $token ) . "'.",
                $token->line, $token->position
            );
        }

        // Token did not result in any node, it should just be ignored.
        if ( $node === null )
        {
            continue;
        }

        /* DEBUG
        echo "[N] Node: " . get_class( $node ) . " at {$node->token->line}:{$node->token->position}.\n";
        // /DEBUG */

        // Apply reductions to shifted node. The do/while only exists to
        // give "break 3" a level to leave all reduction loops at once.
        do {
            foreach ( $this->reductions as $class => $methods )
            {
                if ( $node instanceof $class )
                {
                    foreach ( $methods as $method )
                    {
                        /* DEBUG
                        echo " - Handle node with ->$method\n";
                        // /DEBUG */
                        if ( ( $node = $this->$method( $node ) ) === null )
                        {
                            /* DEBUG
                            echo " - Reduced.\n";
                            // /DEBUG */

                            // The node has been handled, exit loop.
                            break 3;
                        }

                        // Check if the node class has changed; the
                        // remaining methods of this rule no longer apply,
                        // so move on to the next reduction rule.
                        if ( !$node instanceof $class )
                        {
                            /* DEBUG
                            echo " - Try subsequent reductions...\n";
                            // /DEBUG */
                            continue 2;
                        }
                    }
                }
            }
        } while ( false );

        // Check if reductions have been applied, but still returned a
        // node, just add to document stack in this case.
        if ( $node !== null )
        {
            /* DEBUG
            echo " => Prepend " . get_class( $node ) . " to document stack.\n";
            // /DEBUG */
            array_unshift( $this->documentStack, $node );
        }
    }

    // Check if we successfully reduced the document stack. reset() yields
    // false for an empty stack, which fails the instanceof check as well.
    $document = reset( $this->documentStack );
    if ( ( count( $this->documentStack ) === 1 ) &&
         ( $document instanceof ezcDocumentBBCodeDocumentNode ) )
    {
        return $document;
    }

    // Nothing at all ended up on the stack, so there is no node whose
    // class or position could be reported.
    if ( $document === false )
    {
        return $this->triggerError(
            E_PARSE,
            'Expected end of file, but the token stream did not produce a document node.',
            null, null
        );
    }

    // Report the node which was left below the newest one. If the stack
    // holds a single node, that node itself is the unexpected one.
    $unexpected = isset( $this->documentStack[1] ) ? $this->documentStack[1] : $document;
    return $this->triggerError(
        E_PARSE,
        'Expected end of file, got: ' . get_class( $unexpected ) . ".",
        $unexpected->token->line, $unexpected->token->position
    );
}
/**
 * Shift list item token.
 *
 * A list item token starts a new list item. It is turned into a list item
 * node without looking at the document stack or the remaining tokens; the
 * contents are attached to it by a later reduction.
 *
 * @param ezcDocumentBBCodeToken $token
 * @param array $tokens
 * @return mixed
 */
protected function shiftListItemToken( ezcDocumentBBCodeToken $token, array &$tokens )
{
    /* DEBUG
    echo " - Shift list item.\n";
    // /DEBUG */
    $listItem = new ezcDocumentBBCodeListItemNode( $token );

    return $listItem;
}
/**
 * Shift tag opening token.
 *
 * Any opening tag other than "list" becomes a plain tag node, whose
 * contents are aggregated once the matching closing tag is reduced. An
 * opening "list" tag becomes a bullet list node when the token carries no
 * parameters (null) and an enumerated list node otherwise.
 *
 * @param ezcDocumentBBCodeToken $token
 * @param array $tokens
 * @return mixed
 */
protected function shiftOpeningToken( ezcDocumentBBCodeToken $token, array &$tokens )
{
    $opensList = ( $token->content === 'list' );

    if ( $opensList && ( $token->parameters === null ) )
    {
        /* DEBUG
        echo " - Shift bullet list.\n";
        // /DEBUG */
        return new ezcDocumentBBCodeBulletListNode( $token );
    }

    if ( $opensList )
    {
        /* DEBUG
        echo " - Shift enumerated list.\n";
        // /DEBUG */
        return new ezcDocumentBBCodeEnumeratedListNode( $token );
    }

    /* DEBUG
    echo " - Shift opening token {$token->content}.\n";
    // /DEBUG */
    return new ezcDocumentBBCodeTagNode( $token );
}
/**
 * Shift tag closing token.
 *
 * A closing "list" tag is turned into a list end node, every other closing
 * tag into a closing tag node. Both node types have reductions registered
 * in $reductions, which collect the preceding contents from the document
 * stack right after the shift.
 *
 * @param ezcDocumentBBCodeToken $token
 * @param array $tokens
 * @return mixed
 */
protected function shiftClosingToken( ezcDocumentBBCodeToken $token, array &$tokens )
{
    if ( $token->content !== 'list' )
    {
        /* DEBUG
        echo " - Shift closing token {$token->content}.\n";
        // /DEBUG */
        return new ezcDocumentBBCodeClosingTagNode( $token );
    }

    /* DEBUG
    echo " - Shift list end node.\n";
    // /DEBUG */
    return new ezcDocumentBBCodeListEndNode( $token );
}
/**
 * Shift whitespace token.
 *
 * Whitespace is only significant when it follows inline content. If the
 * document stack is empty, or its newest node is a block level node, the
 * token is ignored (null); otherwise it is shifted as a text node.
 *
 * @param ezcDocumentBBCodeToken $token
 * @param array $tokens
 * @return mixed
 */
protected function shiftWhitespaceToken( ezcDocumentBBCodeToken $token, array &$tokens )
{
    $hasPredecessor = isset( $this->documentStack[0] );

    if ( !$hasPredecessor ||
         ( $this->documentStack[0] instanceof ezcDocumentBBCodeBlockLevelNode ) )
    {
        /* DEBUG
        echo " - Ignore whitespace node.\n";
        // /DEBUG */
        return null;
    }

    /* DEBUG
    echo " - Shift whitespace text node.\n";
    // /DEBUG */
    return new ezcDocumentBBCodeTextNode( $token );
}
/**
 * Shift text token.
 *
 * Text tokens are always converted into a text node, independent of the
 * document stack and the remaining tokens.
 *
 * @param ezcDocumentBBCodeToken $token
 * @param array $tokens
 * @return mixed
 */
protected function shiftTextToken( ezcDocumentBBCodeToken $token, array &$tokens )
{
    /* DEBUG
    echo " - Shift text node.\n";
    // /DEBUG */
    $text = new ezcDocumentBBCodeTextNode( $token );

    return $text;
}
/**
 * Shift literal block token.
 *
 * Literal blocks are just a chunk of code or similar, where the token can
 * just be converted into an appropriate node.
 *
 * The token becomes a block level literal node only if the newest node on
 * the document stack is a paragraph node and the next token is a new line
 * token. In that case the following new line and whitespace tokens are
 * removed from $tokens. In every other case an inline literal node is
 * returned and $tokens is left untouched.
 *
 * @param ezcDocumentBBCodeToken $token
 * @param array $tokens Remaining tokens; trailing new line and whitespace
 *                      tokens are consumed for block level literals.
 * @return mixed
 */
protected function shiftLiteralBlockToken( ezcDocumentBBCodeToken $token, array &$tokens )
{
    if ( isset( $this->documentStack[0] ) &&
         ( $this->documentStack[0] instanceof ezcDocumentBBCodeParagraphNode ) &&
         isset( $tokens[0] ) &&
         ( $tokens[0] instanceof ezcDocumentBBCodeNewLineToken ) )
    {
        // Remove following new line tokens. The first one is known to be
        // a new line token, hence the do/while.
        do {
            array_shift( $tokens );
        } while ( isset( $tokens[0] ) &&
                  ( ( $tokens[0] instanceof ezcDocumentBBCodeNewLineToken ) ||
                    ( $tokens[0] instanceof ezcDocumentBBCodeWhitespaceToken ) ) );

        /* DEBUG
        echo " - Shift literal block node.\n";
        // /DEBUG */
        return new ezcDocumentBBCodeLiteralBlockNode( $token );
    }

    /* DEBUG
    echo " - Shift inline literal node.\n";
    // /DEBUG */
    return new ezcDocumentBBCodeInlineLiteralNode( $token );
}
/**
 * Shift new line token.
 *
 * Double new lines are considered as paragraphs. All other new lines are
 * just shifted as single whitespace text nodes.
 *
 * Whitespace tokens directly following the new line are always removed
 * from $tokens. If another new line token follows, it and all further new
 * line and whitespace tokens are removed as well, and a paragraph node is
 * returned. Otherwise the new line is shifted as a text node if the newest
 * node on the document stack is not a block level node, and ignored (null)
 * if the stack is empty or its newest node is block level.
 *
 * @param ezcDocumentBBCodeToken $token
 * @param array $tokens Remaining tokens; following whitespace and new
 *                      line tokens are consumed.
 * @return mixed
 */
protected function shiftNewLineToken( ezcDocumentBBCodeToken $token, array &$tokens )
{
    // Skip trailing whitespace, so that "\n  \n" is detected as a double
    // new line, too.
    while ( isset( $tokens[0] ) &&
            ( $tokens[0] instanceof ezcDocumentBBCodeWhitespaceToken ) )
    {
        array_shift( $tokens );
    }

    if ( isset( $tokens[0] ) &&
         ( $tokens[0] instanceof ezcDocumentBBCodeNewLineToken ) )
    {
        do {
            array_shift( $tokens );
        } while ( isset( $tokens[0] ) &&
                  ( ( $tokens[0] instanceof ezcDocumentBBCodeNewLineToken ) ||
                    ( $tokens[0] instanceof ezcDocumentBBCodeWhitespaceToken ) ) );

        /* DEBUG
        echo " - Shift paragraph node.\n";
        // /DEBUG */
        return new ezcDocumentBBCodeParagraphNode( $token );
    }

    if ( isset( $this->documentStack[0] ) &&
         ( !$this->documentStack[0] instanceof ezcDocumentBBCodeBlockLevelNode ) )
    {
        /* DEBUG
        echo " - Shift newline as whitespace node.\n";
        // /DEBUG */
        return new ezcDocumentBBCodeTextNode( $token );
    }

    /* DEBUG
    echo " - Ignore whitespace node.\n";
    // /DEBUG */
    return null;
}
/**
 * Shift EOF token.
 *
 * The end of the token stream has been reached, so a document node is
 * created for the token. Its reduction collects the block level nodes
 * from the document stack.
 *
 * @param ezcDocumentBBCodeToken $token
 * @param array $tokens
 * @return mixed
 */
protected function shiftEndOfFileToken( ezcDocumentBBCodeToken $token, array &$tokens )
{
    /* DEBUG
    echo " - Shift document node.\n";
    // /DEBUG */
    $document = new ezcDocumentBBCodeDocumentNode( $token );

    return $document;
}
/**
 * Reduce tags.
 *
 * Locates the matching opening tag for a closing tag and reduces the
 * contents found on the way back.
 *
 * Nodes are taken from the front of the document stack until a tag node
 * with the same token content as the closing tag is found. That tag node
 * is removed from the stack, receives the collected nodes in document
 * order as its children, and is returned.
 *
 * An error is reported if, on the way back, a tag node without children
 * or another closing tag node is encountered, or if the stack runs empty
 * before a matching opening tag is found.
 *
 * @param ezcDocumentBBCodeClosingTagNode $node
 * @return mixed The reduced opening tag node, or the return value of
 *         triggerError() if an error was reported.
 */
protected function reduceTags( ezcDocumentBBCodeClosingTagNode $node )
{
    $nodes = array();
    while ( isset( $this->documentStack[0] ) &&
            ( ( !$this->documentStack[0] instanceof ezcDocumentBBCodeTagNode ) ||
              ( $this->documentStack[0]->token->content !== $node->token->content ) ) )
    {
        $nodes[] = $child = array_shift( $this->documentStack );

        // A tag node without children has not been closed yet.
        if ( ( $child instanceof ezcDocumentBBCodeTagNode ) &&
             ( !count( $child->nodes ) ) )
        {
            return $this->triggerError( E_PARSE,
                "Opening tag, without matching closing tag found: '" . $child->token->content . "'.",
                $child->token->line, $child->token->position
            );
        }

        if ( $child instanceof ezcDocumentBBCodeClosingTagNode )
        {
            return $this->triggerError( E_PARSE,
                "Closing tag, without matching opening tag found: '" . $child->token->content . "'.",
                $child->token->line, $child->token->position
            );
        }
    }

    // The loop only ends without returning when the stack is empty or its
    // newest node is the matching opening tag, so an empty stack is the
    // one remaining failure case. Checking it explicitly avoids reading a
    // non-existent stack element.
    if ( !isset( $this->documentStack[0] ) )
    {
        return $this->triggerError( E_PARSE,
            "Closing tag, without matching opening tag found: '" . $node->token->content . "'.",
            $node->token->line, $node->token->position
        );
    }

    // Nodes were collected newest first, restore document order.
    $node = array_shift( $this->documentStack );
    $node->nodes = array_reverse( $nodes );
    return $node;
}
/**
 * Reduce list items.
 *
 * Collects the contents of the previous list item, once a new list item
 * (or the end of the list, see reduceList()) has been found.
 *
 * Nodes are taken from the front of the document stack until a list item
 * node or a list node without children is found. Runs of nodes which are
 * not block level nodes are wrapped into paragraph nodes. If the node the
 * collection stopped at is a list item node, it receives the result as its
 * children. If it is a list node, the collected nodes are discarded.
 *
 * An error is reported if a tag node without children or a closing tag
 * node is encountered, or if the stack runs empty.
 *
 * @param ezcDocumentBBCodeNode $node
 * @return mixed The passed node, or the return value of triggerError() if
 *         an error was reported.
 */
protected function reduceListItem( ezcDocumentBBCodeNode $node )
{
    $nodes = array();

    // Node used for the error position if the stack runs empty: the last
    // collected node, or the reduced node itself if nothing was collected.
    $last = $node;

    while ( isset( $this->documentStack[0] ) &&
            ( !$this->documentStack[0] instanceof ezcDocumentBBCodeListItemNode ) &&
            ( ( !$this->documentStack[0] instanceof ezcDocumentBBCodeListNode ) ||
              ( count( $this->documentStack[0]->nodes ) ) ) )
    {
        $child = array_shift( $this->documentStack );
        $nodes[] = $child;
        $last = $child;

        if ( ( $child instanceof ezcDocumentBBCodeTagNode ) &&
             ( !count( $child->nodes ) ) )
        {
            return $this->triggerError( E_PARSE,
                "Opening tag, without matching closing tag found: '" . $child->token->content . "'.",
                $child->token->line, $child->token->position
            );
        }

        if ( $child instanceof ezcDocumentBBCodeClosingTagNode )
        {
            return $this->triggerError( E_PARSE,
                "Closing tag, without matching opening tag found: '" . $child->token->content . "'.",
                $child->token->line, $child->token->position
            );
        }
    }

    if ( !isset( $this->documentStack[0] ) )
    {
        return $this->triggerError( E_PARSE,
            "Missing list item node.",
            $last->token->line, $last->token->position
        );
    }

    // Wrap non-block-level nodes into paragraphs. $nodes is ordered newest
    // first, so each run is reversed when it is put into its paragraph and
    // the whole result is reversed once more at the end.
    $wrapped = array();
    $temp = array();
    foreach ( $nodes as $child )
    {
        if ( !$child instanceof ezcDocumentBBCodeBlockLevelNode )
        {
            $temp[] = $child;
            continue;
        }

        // A block level node ends the current run of inline nodes.
        if ( count( $temp ) )
        {
            $wrapped[] = $para = new ezcDocumentBBCodeParagraphNode( $temp[0]->token );
            $para->nodes = array_reverse( $temp );
            $temp = array();
        }

        $wrapped[] = $child;
    }

    if ( count( $temp ) )
    {
        $wrapped[] = $para = new ezcDocumentBBCodeParagraphNode( $temp[0]->token );
        $para->nodes = array_reverse( $temp );
    }

    if ( $this->documentStack[0] instanceof ezcDocumentBBCodeListItemNode )
    {
        $this->documentStack[0]->nodes = array_reverse( $wrapped );
    }

    return $node;
}
/**
 * Reduce list.
 *
 * Aggregates list items and puts them into a found list.
 *
 * First the contents of the last list item are collected by calling
 * reduceListItem(). Then all list item nodes at the front of the document
 * stack are removed and assigned, in document order, as children to the
 * list node found below them. The list end node itself is dropped.
 *
 * An error is reported if no list node is found below the list items.
 *
 * @param ezcDocumentBBCodeNode $node
 * @return mixed Null once the list has been reduced, or the return value
 *         of triggerError() if an error was reported.
 */
protected function reduceList( ezcDocumentBBCodeNode $node )
{
    // The list end also terminates the last list item.
    $this->reduceListItem( $node );

    $nodes = array();
    while ( isset( $this->documentStack[0] ) &&
            ( $this->documentStack[0] instanceof ezcDocumentBBCodeListItemNode ) )
    {
        $nodes[] = array_shift( $this->documentStack );
    }

    if ( !isset( $this->documentStack[0] ) ||
         ( !$this->documentStack[0] instanceof ezcDocumentBBCodeListNode ) )
    {
        // Report the position of the list end node which could not be
        // matched with a list start.
        return $this->triggerError( E_PARSE,
            "Missing list start node.",
            $node->token->line, $node->token->position
        );
    }

    // Items were collected newest first, restore document order.
    $this->documentStack[0]->nodes = array_reverse( $nodes );
    return null;
}
/**
 * Reduce paragraph.
 *
 * A paragraph aggregates all nodes at the front of the document stack, up
 * to the next paragraph, list or literal block node. The collected nodes
 * become the children of the paragraph in document order. If nothing was
 * collected, the paragraph is omitted by returning null.
 *
 * An error is reported if a tag node without children or a closing tag
 * node is encountered while collecting.
 *
 * @param ezcDocumentBBCodeParagraphNode $node
 * @return mixed
 */
protected function reduceParagraph( ezcDocumentBBCodeParagraphNode $node )
{
    $collected = array();

    while ( isset( $this->documentStack[0] ) )
    {
        // Block level nodes delimit the paragraph and stay on the stack.
        if ( ( $this->documentStack[0] instanceof ezcDocumentBBCodeParagraphNode ) ||
             ( $this->documentStack[0] instanceof ezcDocumentBBCodeListNode ) ||
             ( $this->documentStack[0] instanceof ezcDocumentBBCodeLiteralBlockNode ) )
        {
            break;
        }

        $current = array_shift( $this->documentStack );
        $collected[] = $current;

        if ( $current instanceof ezcDocumentBBCodeClosingTagNode )
        {
            return $this->triggerError( E_PARSE,
                "Closing tag, without matching opening tag found: '" . $current->token->content . "'.",
                $current->token->line, $current->token->position
            );
        }

        if ( !$current instanceof ezcDocumentBBCodeTagNode )
        {
            continue;
        }

        // A tag node without children has never been closed.
        if ( !count( $current->nodes ) )
        {
            return $this->triggerError( E_PARSE,
                "Opening tag, without matching closing tag found: '" . $current->token->content . "'.",
                $current->token->line, $current->token->position
            );
        }
    }

    // Omit empty paragraphs
    if ( count( $collected ) )
    {
        $node->nodes = array_reverse( $collected );
        return $node;
    }

    return null;
}
/**
 * Reduce document.
 *
 * Takes all paragraph, list and literal block nodes from the front of the
 * document stack, stopping at the first node of any other type, and
 * assigns them in document order as children to the document node.
 *
 * @param ezcDocumentBBCodeDocumentNode $node
 * @return ezcDocumentBBCodeDocumentNode
 */
protected function reduceDocument( ezcDocumentBBCodeDocumentNode $node )
{
    $children = array();

    while ( isset( $this->documentStack[0] ) )
    {
        $isBlock =
            ( $this->documentStack[0] instanceof ezcDocumentBBCodeParagraphNode ) ||
            ( $this->documentStack[0] instanceof ezcDocumentBBCodeListNode ) ||
            ( $this->documentStack[0] instanceof ezcDocumentBBCodeLiteralBlockNode );

        if ( !$isBlock )
        {
            break;
        }

        // The stack is ordered newest first, so prepending each removed
        // node yields the children in document order.
        array_unshift( $children, array_shift( $this->documentStack ) );
    }

    $node->nodes = $children;
    return $node;
}
}
?>