// blob: 70dac8d41eb4d6e089e773eb019bb69e888a15e0 [file] [log] [blame]
package org.apache.maven.doxia.parser;
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
import java.io.Reader;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.Stack;
import java.util.TreeSet;
import javax.swing.text.html.HTML.Attribute;
import org.apache.maven.doxia.macro.MacroExecutionException;
import org.apache.maven.doxia.markup.HtmlMarkup;
import org.apache.maven.doxia.sink.Sink;
import org.apache.maven.doxia.sink.SinkEventAttributes;
import org.apache.maven.doxia.sink.impl.SinkEventAttributeSet;
import org.apache.maven.doxia.util.DoxiaUtils;
import org.codehaus.plexus.util.StringUtils;
import org.codehaus.plexus.util.xml.pull.XmlPullParser;
import org.codehaus.plexus.util.xml.pull.XmlPullParserException;
/**
* Common base parser for xhtml5 events.
*/
public class Xhtml5BaseParser
extends AbstractXmlParser
implements HtmlMarkup
{
/**
 * True while inside a {@code <script>} or {@code <style>} element. While set, plain text is
 * dropped by {@code handleText} and CDATA sections are forwarded through {@code sink.unknown}
 * instead of {@code sink.text}.
 */
private boolean scriptBlock;
/**
 * True after an {@code <a>} start tag carrying an {@code href} has emitted {@code sink.link};
 * tells the matching end tag to emit {@code sink.link_}.
 */
private boolean isLink;
/**
 * True after an {@code <a>} start tag without {@code href} but with a {@code name} or {@code id}
 * has emitted {@code sink.anchor}; tells the matching end tag to emit {@code sink.anchor_}.
 */
private boolean isAnchor;
/**
 * Number of currently open {@code <ol>} elements. Any non-zero value makes {@code <li>}
 * produce numbered list items instead of plain list items.
 */
private int orderedListDepth = 0;
/**
 * Nesting depth of explicit {@code <section>} elements: incremented on each start tag and
 * decremented on each end tag.
 */
private int sectionLevel;
/**
 * Level of the section most recently implied by a heading tag. Reset to 0 by {@code init()}
 * and whenever a {@code <section>} end tag is handled.
 */
private int headingLevel;
/** Verbatim flag, true whenever we are inside a {@code <pre>} element. */
private boolean inVerbatim;
/**
 * Holds the {@code class} attribute value (possibly {@code null}) of every open {@code <div>},
 * pushed on the start tag and popped on the end tag so the end tag knows which closing events
 * to emit.
 */
private Stack<String> divStack = new Stack<String>();
/**
 * True while a definition list item opened by a {@code <dt>} is still open. Used to wrap the
 * defined term with its definition, even when one of the two is omitted.
 */
boolean hasDefinitionListItem = false;
/**
 * Warnings collected during a parse, keyed by a String describing the error type. Each value is
 * a Set so that identical messages are reported only once. Created lazily by {@code logMessage}
 * and emitted by {@code logWarnings}.
 */
private Map<String, Set<String>> warnMessages;
/**
 * {@inheritDoc}
 *
 * <p>
 * The parser state is reset before delegating to the superclass. Whether or not the superclass
 * call completes normally, the collected warnings are logged, the second-parsing flag is cleared
 * and the state is reset once more.
 * </p>
 */
@Override
public void parse( Reader source, Sink sink )
    throws ParseException
{
    // start from a clean state
    init();

    try
    {
        super.parse( source, sink );
    }
    finally
    {
        // always flush the de-duplicated warnings, then clean up for the next run
        logWarnings();

        setSecondParsing( false );

        init();
    }
}
/**
 * {@inheritDoc}
 *
 * <p>
 * This implementation only delegates to the superclass; no additional configuration of the
 * given parser is performed here.
 * </p>
 */
@Override
protected void initXmlParser( XmlPullParser parser )
    throws XmlPullParserException
{
    super.initXmlParser( parser );
}
/**
 * <p>
 * Dispatches a start tag to the matching sink event. Only tags that may appear in the body of an
 * xhtml5 document are considered, so this method is re-usable by different xhtml-based parsers.
 * </p>
 * <p>
 * The handled tags are:
 * </p>
 * <ul>
 * <li>sectioning: {@code <article>, <nav>, <aside>, <section>, <h2>} to {@code <h6>},
 * {@code <header>, <main>, <footer>};</li>
 * <li>inline: {@code <em>, <strong>, <small>, <s>, <cite>, <q>, <dfn>, <abbr>, <i>, <b>, <code>,
 * <var>, <samp>, <kbd>, <sup>, <sub>, <u>, <mark>, <ruby>, <rb>, <rt>, <rtc>, <rp>, <bdi>, <bdo>,
 * <span>, <ins>, <del>};</li>
 * <li>blocks and lists: {@code <p>, <div>, <pre>, <ul>, <ol>, <li>, <dl>, <dt>, <dd>, <figure>,
 * <figcaption>};</li>
 * <li>links and tables: {@code <a>, <table>, <tr>, <th>, <td>, <caption>};</li>
 * <li>empty elements: {@code <br/>, <wbr/>, <hr/>, <img/>};</li>
 * <li>{@code <script>} and {@code <style>}, which are passed on as unknown events and switch the
 * parser into script-block mode.</li>
 * </ul>
 *
 * @param parser A parser.
 * @param sink the sink to receive the events.
 * @return True if the event has been handled by this method, i.e. the tag was recognized, false otherwise.
 */
protected boolean baseStartTag( XmlPullParser parser, Sink sink )
{
    final SinkEventAttributeSet attribs = getAttributesFromParser( parser );
    final String tag = parser.getName();

    boolean recognized = true;

    // ----------------------------------------------------------------------
    // Sectioning content and headings
    // ----------------------------------------------------------------------

    if ( tag.equals( HtmlMarkup.ARTICLE.toString() ) )
    {
        sink.article( attribs );
    }
    else if ( tag.equals( HtmlMarkup.NAV.toString() ) )
    {
        sink.navigation( attribs );
    }
    else if ( tag.equals( HtmlMarkup.ASIDE.toString() ) )
    {
        sink.sidebar( attribs );
    }
    else if ( tag.equals( HtmlMarkup.SECTION.toString() ) )
    {
        handleSectionStart( sink, attribs );
    }
    else if ( tag.equals( HtmlMarkup.H2.toString() ) )
    {
        handleHeadingStart( sink, Sink.SECTION_LEVEL_1, attribs );
    }
    else if ( tag.equals( HtmlMarkup.H3.toString() ) )
    {
        handleHeadingStart( sink, Sink.SECTION_LEVEL_2, attribs );
    }
    else if ( tag.equals( HtmlMarkup.H4.toString() ) )
    {
        handleHeadingStart( sink, Sink.SECTION_LEVEL_3, attribs );
    }
    else if ( tag.equals( HtmlMarkup.H5.toString() ) )
    {
        handleHeadingStart( sink, Sink.SECTION_LEVEL_4, attribs );
    }
    else if ( tag.equals( HtmlMarkup.H6.toString() ) )
    {
        handleHeadingStart( sink, Sink.SECTION_LEVEL_5, attribs );
    }
    else if ( tag.equals( HtmlMarkup.HEADER.toString() ) )
    {
        sink.header( attribs );
    }
    else if ( tag.equals( HtmlMarkup.MAIN.toString() ) )
    {
        sink.content( attribs );
    }
    else if ( tag.equals( HtmlMarkup.FOOTER.toString() ) )
    {
        sink.footer( attribs );
    }

    // ----------------------------------------------------------------------
    // Inline elements
    // ----------------------------------------------------------------------

    else if ( startInline( tag, attribs, sink ) )
    {
        // nothing left to do: the helper has already emitted the inline event
    }

    // ----------------------------------------------------------------------
    // Blocks, lists and figures
    // ----------------------------------------------------------------------

    else if ( tag.equals( HtmlMarkup.P.toString() ) )
    {
        handlePStart( sink, attribs );
    }
    else if ( tag.equals( HtmlMarkup.DIV.toString() ) )
    {
        handleDivStart( parser, attribs, sink );
    }
    else if ( tag.equals( HtmlMarkup.PRE.toString() ) )
    {
        handlePreStart( attribs, sink );
    }
    else if ( tag.equals( HtmlMarkup.UL.toString() ) )
    {
        sink.list( attribs );
    }
    else if ( tag.equals( HtmlMarkup.OL.toString() ) )
    {
        handleOLStart( parser, sink, attribs );
    }
    else if ( tag.equals( HtmlMarkup.LI.toString() ) )
    {
        handleLIStart( sink, attribs );
    }
    else if ( tag.equals( HtmlMarkup.DL.toString() ) )
    {
        sink.definitionList( attribs );
    }
    else if ( tag.equals( HtmlMarkup.DT.toString() ) )
    {
        if ( hasDefinitionListItem )
        {
            // a previous item is still open (its definition was omitted): close it first
            sink.definitionListItem_();
        }

        sink.definitionListItem( attribs );
        hasDefinitionListItem = true;
        sink.definedTerm( attribs );
    }
    else if ( tag.equals( HtmlMarkup.DD.toString() ) )
    {
        if ( !hasDefinitionListItem )
        {
            // definition without a preceding term: open the wrapping item here
            sink.definitionListItem( attribs );
        }

        sink.definition( attribs );
    }
    else if ( tag.equals( HtmlMarkup.FIGURE.toString() ) )
    {
        sink.figure( attribs );
    }
    else if ( tag.equals( HtmlMarkup.FIGCAPTION.toString() ) )
    {
        sink.figureCaption( attribs );
    }

    // ----------------------------------------------------------------------
    // Links and tables
    // ----------------------------------------------------------------------

    else if ( tag.equals( HtmlMarkup.A.toString() ) )
    {
        handleAStart( parser, sink, attribs );
    }
    else if ( tag.equals( HtmlMarkup.TABLE.toString() ) )
    {
        handleTableStart( sink, attribs, parser );
    }
    else if ( tag.equals( HtmlMarkup.TR.toString() ) )
    {
        sink.tableRow( attribs );
    }
    else if ( tag.equals( HtmlMarkup.TH.toString() ) )
    {
        sink.tableHeaderCell( attribs );
    }
    else if ( tag.equals( HtmlMarkup.TD.toString() ) )
    {
        sink.tableCell( attribs );
    }
    else if ( tag.equals( HtmlMarkup.CAPTION.toString() ) )
    {
        sink.tableCaption( attribs );
    }

    // ----------------------------------------------------------------------
    // Empty elements
    // ----------------------------------------------------------------------

    else if ( tag.equals( HtmlMarkup.BR.toString() ) )
    {
        sink.lineBreak( attribs );
    }
    else if ( tag.equals( HtmlMarkup.WBR.toString() ) )
    {
        sink.lineBreakOpportunity( attribs );
    }
    else if ( tag.equals( HtmlMarkup.HR.toString() ) )
    {
        sink.horizontalRule( attribs );
    }
    else if ( tag.equals( HtmlMarkup.IMG.toString() ) )
    {
        handleImgStart( parser, sink, attribs );
    }

    // ----------------------------------------------------------------------
    // Script and style
    // ----------------------------------------------------------------------

    else if ( tag.equals( HtmlMarkup.SCRIPT.toString() ) || tag.equals( HtmlMarkup.STYLE.toString() ) )
    {
        handleUnknown( parser, sink, TAG_TYPE_START );
        scriptBlock = true;
    }
    else
    {
        recognized = false;
    }

    return recognized;
}

/**
 * Emits {@code sink.inline()} for the start tag of an inline element, after adding the semantics
 * attribute that corresponds to the tag to the given attribute set.
 *
 * @param tag the name of the start tag.
 * @param attribs the attributes of the start tag; the semantics attribute is added to this set.
 * @param sink the sink to receive the event.
 * @return true if the tag is one of the inline tags and the event has been emitted, false otherwise
 * (in which case neither {@code attribs} nor {@code sink} has been touched).
 */
private static boolean startInline( String tag, SinkEventAttributeSet attribs, Sink sink )
{
    final javax.swing.text.AttributeSet semantics;

    if ( tag.equals( HtmlMarkup.EM.toString() ) )
    {
        semantics = SinkEventAttributeSet.Semantics.EMPHASIS;
    }
    else if ( tag.equals( HtmlMarkup.STRONG.toString() ) )
    {
        semantics = SinkEventAttributeSet.Semantics.STRONG;
    }
    else if ( tag.equals( HtmlMarkup.SMALL.toString() ) )
    {
        semantics = SinkEventAttributeSet.Semantics.SMALL;
    }
    else if ( tag.equals( HtmlMarkup.S.toString() ) )
    {
        /* deprecated line-through support */
        semantics = SinkEventAttributeSet.Semantics.LINE_THROUGH;
    }
    else if ( tag.equals( HtmlMarkup.CITE.toString() ) )
    {
        semantics = SinkEventAttributeSet.Semantics.CITATION;
    }
    else if ( tag.equals( HtmlMarkup.Q.toString() ) )
    {
        semantics = SinkEventAttributeSet.Semantics.QUOTE;
    }
    else if ( tag.equals( HtmlMarkup.DFN.toString() ) )
    {
        semantics = SinkEventAttributeSet.Semantics.DEFINITION;
    }
    else if ( tag.equals( HtmlMarkup.ABBR.toString() ) )
    {
        semantics = SinkEventAttributeSet.Semantics.ABBREVIATION;
    }
    else if ( tag.equals( HtmlMarkup.I.toString() ) )
    {
        semantics = SinkEventAttributeSet.Semantics.ITALIC;
    }
    else if ( tag.equals( HtmlMarkup.B.toString() ) )
    {
        semantics = SinkEventAttributeSet.Semantics.BOLD;
    }
    else if ( tag.equals( HtmlMarkup.CODE.toString() ) )
    {
        semantics = SinkEventAttributeSet.Semantics.CODE;
    }
    else if ( tag.equals( HtmlMarkup.VAR.toString() ) )
    {
        semantics = SinkEventAttributeSet.Semantics.VARIABLE;
    }
    else if ( tag.equals( HtmlMarkup.SAMP.toString() ) )
    {
        semantics = SinkEventAttributeSet.Semantics.SAMPLE;
    }
    else if ( tag.equals( HtmlMarkup.KBD.toString() ) )
    {
        semantics = SinkEventAttributeSet.Semantics.KEYBOARD;
    }
    else if ( tag.equals( HtmlMarkup.SUP.toString() ) )
    {
        semantics = SinkEventAttributeSet.Semantics.SUPERSCRIPT;
    }
    else if ( tag.equals( HtmlMarkup.SUB.toString() ) )
    {
        semantics = SinkEventAttributeSet.Semantics.SUBSCRIPT;
    }
    else if ( tag.equals( HtmlMarkup.U.toString() ) )
    {
        semantics = SinkEventAttributeSet.Semantics.ANNOTATION;
    }
    else if ( tag.equals( HtmlMarkup.MARK.toString() ) )
    {
        semantics = SinkEventAttributeSet.Semantics.HIGHLIGHT;
    }
    else if ( tag.equals( HtmlMarkup.RUBY.toString() ) )
    {
        semantics = SinkEventAttributeSet.Semantics.RUBY;
    }
    else if ( tag.equals( HtmlMarkup.RB.toString() ) )
    {
        semantics = SinkEventAttributeSet.Semantics.RUBY_BASE;
    }
    else if ( tag.equals( HtmlMarkup.RT.toString() ) )
    {
        semantics = SinkEventAttributeSet.Semantics.RUBY_TEXT;
    }
    else if ( tag.equals( HtmlMarkup.RTC.toString() ) )
    {
        semantics = SinkEventAttributeSet.Semantics.RUBY_TEXT_CONTAINER;
    }
    else if ( tag.equals( HtmlMarkup.RP.toString() ) )
    {
        semantics = SinkEventAttributeSet.Semantics.RUBY_PARANTHESES;
    }
    else if ( tag.equals( HtmlMarkup.BDI.toString() ) )
    {
        semantics = SinkEventAttributeSet.Semantics.BIDIRECTIONAL_ISOLATION;
    }
    else if ( tag.equals( HtmlMarkup.BDO.toString() ) )
    {
        semantics = SinkEventAttributeSet.Semantics.BIDIRECTIONAL_OVERRIDE;
    }
    else if ( tag.equals( HtmlMarkup.SPAN.toString() ) )
    {
        semantics = SinkEventAttributeSet.Semantics.PHRASE;
    }
    else if ( tag.equals( HtmlMarkup.INS.toString() ) )
    {
        semantics = SinkEventAttributeSet.Semantics.INSERT;
    }
    else if ( tag.equals( HtmlMarkup.DEL.toString() ) )
    {
        semantics = SinkEventAttributeSet.Semantics.DELETE;
    }
    else
    {
        return false;
    }

    attribs.addAttributes( semantics );
    sink.inline( attribs );

    return true;
}
/**
 * <p>
 * Dispatches an end tag to the matching closing sink event.
 * This method is meant to be re-usable by different xhtml-based parsers.
 * The tags handled here are the same as for {@link #baseStartTag(XmlPullParser,Sink)},
 * except for the empty elements ({@code <br/>, <wbr/>, <hr/>, <img/>}).
 * </p>
 *
 * @param parser A parser.
 * @param sink the sink to receive the events.
 * @return True if the event has been handled by this method, false otherwise.
 */
protected boolean baseEndTag( XmlPullParser parser, Sink sink )
{
    final String tag = parser.getName();

    boolean recognized = true;

    // ----------------------------------------------------------------------
    // Blocks, lists and figures
    // ----------------------------------------------------------------------

    if ( tag.equals( HtmlMarkup.P.toString() ) )
    {
        sink.paragraph_();
    }
    else if ( tag.equals( HtmlMarkup.DIV.toString() ) )
    {
        handleDivEnd( sink );
    }
    else if ( tag.equals( HtmlMarkup.PRE.toString() ) )
    {
        verbatim_();

        sink.verbatim_();
    }
    else if ( tag.equals( HtmlMarkup.UL.toString() ) )
    {
        sink.list_();
    }
    else if ( tag.equals( HtmlMarkup.OL.toString() ) )
    {
        sink.numberedList_();
        orderedListDepth--;
    }
    else if ( tag.equals( HtmlMarkup.LI.toString() ) )
    {
        handleListItemEnd( sink );
    }
    else if ( tag.equals( HtmlMarkup.DL.toString() ) )
    {
        if ( hasDefinitionListItem )
        {
            // the last term had no definition: its item is still open
            sink.definitionListItem_();
            hasDefinitionListItem = false;
        }

        sink.definitionList_();
    }
    else if ( tag.equals( HtmlMarkup.DT.toString() ) )
    {
        sink.definedTerm_();
    }
    else if ( tag.equals( HtmlMarkup.DD.toString() ) )
    {
        sink.definition_();
        sink.definitionListItem_();
        hasDefinitionListItem = false;
    }
    else if ( tag.equals( HtmlMarkup.FIGURE.toString() ) )
    {
        sink.figure_();
    }
    else if ( tag.equals( HtmlMarkup.FIGCAPTION.toString() ) )
    {
        sink.figureCaption_();
    }
    else if ( tag.equals( HtmlMarkup.A.toString() ) )
    {
        handleAEnd( sink );
    }

    // ----------------------------------------------------------------------
    // Inline elements
    // ----------------------------------------------------------------------

    else if ( isInlineEndTag( tag ) )
    {
        sink.inline_();
    }

    // ----------------------------------------------------------------------
    // Tables
    // ----------------------------------------------------------------------

    else if ( tag.equals( HtmlMarkup.TABLE.toString() ) )
    {
        sink.tableRows_();
        sink.table_();
    }
    else if ( tag.equals( HtmlMarkup.TR.toString() ) )
    {
        sink.tableRow_();
    }
    else if ( tag.equals( HtmlMarkup.TH.toString() ) )
    {
        sink.tableHeaderCell_();
    }
    else if ( tag.equals( HtmlMarkup.TD.toString() ) )
    {
        sink.tableCell_();
    }
    else if ( tag.equals( HtmlMarkup.CAPTION.toString() ) )
    {
        sink.tableCaption_();
    }

    // ----------------------------------------------------------------------
    // Sectioning content and headings
    // ----------------------------------------------------------------------

    else if ( tag.equals( HtmlMarkup.ARTICLE.toString() ) )
    {
        sink.article_();
    }
    else if ( tag.equals( HtmlMarkup.NAV.toString() ) )
    {
        sink.navigation_();
    }
    else if ( tag.equals( HtmlMarkup.ASIDE.toString() ) )
    {
        sink.sidebar_();
    }
    else if ( tag.equals( HtmlMarkup.SECTION.toString() ) )
    {
        handleSectionEnd( sink );
    }
    else if ( tag.equals( HtmlMarkup.H2.toString() ) )
    {
        sink.sectionTitle1_();
    }
    else if ( tag.equals( HtmlMarkup.H3.toString() ) )
    {
        sink.sectionTitle2_();
    }
    else if ( tag.equals( HtmlMarkup.H4.toString() ) )
    {
        sink.sectionTitle3_();
    }
    else if ( tag.equals( HtmlMarkup.H5.toString() ) )
    {
        sink.sectionTitle4_();
    }
    else if ( tag.equals( HtmlMarkup.H6.toString() ) )
    {
        sink.sectionTitle5_();
    }
    else if ( tag.equals( HtmlMarkup.HEADER.toString() ) )
    {
        sink.header_();
    }
    else if ( tag.equals( HtmlMarkup.MAIN.toString() ) )
    {
        sink.content_();
    }
    else if ( tag.equals( HtmlMarkup.FOOTER.toString() ) )
    {
        sink.footer_();
    }

    // ----------------------------------------------------------------------
    // Script and style
    // ----------------------------------------------------------------------

    else if ( tag.equals( HtmlMarkup.SCRIPT.toString() ) || tag.equals( HtmlMarkup.STYLE.toString() ) )
    {
        handleUnknown( parser, sink, TAG_TYPE_END );

        scriptBlock = false;
    }
    else
    {
        recognized = false;
    }

    return recognized;
}

/**
 * Tells whether the given tag name belongs to one of the inline elements whose end tag is
 * reported to the sink as {@code inline_()}.
 *
 * @param tag the name of the end tag.
 * @return true if the tag is an inline tag, false otherwise.
 */
private static boolean isInlineEndTag( String tag )
{
    return tag.equals( HtmlMarkup.EM.toString() )
        || tag.equals( HtmlMarkup.STRONG.toString() )
        || tag.equals( HtmlMarkup.SMALL.toString() )
        || tag.equals( HtmlMarkup.S.toString() )
        || tag.equals( HtmlMarkup.CITE.toString() )
        || tag.equals( HtmlMarkup.Q.toString() )
        || tag.equals( HtmlMarkup.DFN.toString() )
        || tag.equals( HtmlMarkup.ABBR.toString() )
        || tag.equals( HtmlMarkup.I.toString() )
        || tag.equals( HtmlMarkup.B.toString() )
        || tag.equals( HtmlMarkup.CODE.toString() )
        || tag.equals( HtmlMarkup.VAR.toString() )
        || tag.equals( HtmlMarkup.SAMP.toString() )
        || tag.equals( HtmlMarkup.KBD.toString() )
        || tag.equals( HtmlMarkup.SUP.toString() )
        || tag.equals( HtmlMarkup.SUB.toString() )
        || tag.equals( HtmlMarkup.U.toString() )
        || tag.equals( HtmlMarkup.MARK.toString() )
        || tag.equals( HtmlMarkup.RUBY.toString() )
        || tag.equals( HtmlMarkup.RB.toString() )
        || tag.equals( HtmlMarkup.RT.toString() )
        || tag.equals( HtmlMarkup.RTC.toString() )
        || tag.equals( HtmlMarkup.RP.toString() )
        || tag.equals( HtmlMarkup.BDI.toString() )
        || tag.equals( HtmlMarkup.BDO.toString() )
        || tag.equals( HtmlMarkup.SPAN.toString() )
        || tag.equals( HtmlMarkup.INS.toString() )
        || tag.equals( HtmlMarkup.DEL.toString() );
}
/**
 * {@inheritDoc}
 *
 * Just calls {@link #baseStartTag(XmlPullParser,Sink)} and, if the tag was not recognized and
 * warnings are enabled, logs the tag together with its line and column position. This should be
 * overridden by implementing parsers to include additional tags.
 */
protected void handleStartTag( XmlPullParser parser, Sink sink )
    throws XmlPullParserException, MacroExecutionException
{
    if ( baseStartTag( parser, sink ) )
    {
        return;
    }

    if ( !getLog().isWarnEnabled() )
    {
        return;
    }

    final String location = "[" + parser.getLineNumber() + ":" + parser.getColumnNumber() + "]";

    getLog().warn( "Unrecognized xml tag: <" + parser.getName() + "> at " + location );
}
/**
 * {@inheritDoc}
 *
 * Just calls {@link #baseEndTag(XmlPullParser,Sink)}, this should be
 * overridden by implementing parsers to include additional tags.
 */
protected void handleEndTag( XmlPullParser parser, Sink sink )
    throws XmlPullParserException, MacroExecutionException
{
    // The result is ignored on purpose: an unrecognized tag is already logged in StartTag.
    baseEndTag( parser, sink );
}
/**
 * {@inheritDoc}
 *
 * <p>
 * Non-empty text is forwarded unchanged to {@code sink.text()}, unless the parser is currently
 * inside a script block, in which case the text is dropped.
 * </p>
 */
@Override
protected void handleText( XmlPullParser parser, Sink sink )
    throws XmlPullParserException
{
    final String content = getText( parser );

    /*
     * NOTE: Don't do any whitespace trimming here. Whitespace normalization has already been performed by the
     * parser so any whitespace that makes it here is significant.
     */
    if ( !StringUtils.isNotEmpty( content ) )
    {
        return;
    }

    // NOTE: text within script tags is ignored, scripting code should be embedded in CDATA.
    if ( isScriptBlock() )
    {
        return;
    }

    sink.text( content );
}
/**
 * {@inheritDoc}
 *
 * <p>
 * A comment whose trimmed content is exactly {@code PB} is turned into a page break. Any other
 * comment is passed on to the sink only if comment emission is enabled.
 * </p>
 */
@Override
protected void handleComment( XmlPullParser parser, Sink sink )
    throws XmlPullParserException
{
    final String comment = getText( parser );

    if ( "PB".equals( comment.trim() ) )
    {
        sink.pageBreak();
        return;
    }

    if ( isEmitComments() )
    {
        sink.comment( comment );
    }
}
/**
 * {@inheritDoc}
 *
 * <p>
 * Outside of a script block the CDATA content is emitted as ordinary text. Inside a script block
 * it is handed to {@code sink.unknown()} together with the CDATA type marker.
 * </p>
 */
@Override
protected void handleCdsect( XmlPullParser parser, Sink sink )
    throws XmlPullParserException
{
    final String cdata = getText( parser );

    if ( !isScriptBlock() )
    {
        sink.text( cdata );
        return;
    }

    final Object[] requiredParams = new Object[] { Integer.valueOf( CDATA_TYPE ), cdata };

    sink.unknown( CDATA, requiredParams, null );
}
/**
 * Make sure sections are nested consecutively.
 *
 * <p>
 * HTML5 heading tags imply sections where none are present, so any sections that are missing in
 * between have to be opened, and sections of the same or a deeper level have to be closed first.
 * </p>
 *
 * <p>
 * For instance, if the following sequence is parsed:
 * </p>
 * <pre>
 * &lt;h3&gt;&lt;/h3&gt;
 * &lt;h6&gt;&lt;/h6&gt;
 * </pre>
 * <p>
 * section starts have to be inserted before the <code>&lt;h6&gt;</code> is opened.
 * In the following sequence
 * </p>
 * <pre>
 * &lt;h6&gt;&lt;/h6&gt;
 * &lt;h3&gt;&lt;/h3&gt;
 * </pre>
 * <p>
 * sections have to be closed before the <code>&lt;h3&gt;</code> is opened.
 * </p>
 *
 * <p>The current heading level is set to newLevel afterwards.</p>
 *
 * @param newLevel the new section level, all upper levels have to be closed.
 * @param sink the sink to receive the events.
 * @param attribs the attributes of the heading tag; not used by this method.
 */
protected void consecutiveSections( int newLevel, Sink sink, SinkEventAttributeSet attribs )
{
    // first leave everything that is at least as deep as the new level ...
    closeOpenSections( newLevel, sink );

    // ... then descend until the new level is reached
    openMissingSections( newLevel, sink );

    headingLevel = newLevel;
}
/**
 * Close open sections.
 *
 * <p>
 * Starting at the current heading level, one section end event is emitted per level and the
 * heading level is decremented, for as long as the heading level is not below {@code newLevel}
 * and is above the depth of the explicit sections.
 * </p>
 *
 * @param newLevel the new section level, all upper levels have to be closed.
 * @param sink the sink to receive the events.
 */
private void closeOpenSections( int newLevel, Sink sink )
{
    while ( headingLevel >= newLevel && sectionLevel < headingLevel )
    {
        switch ( headingLevel )
        {
            case Sink.SECTION_LEVEL_5:
                sink.section5_();
                break;
            case Sink.SECTION_LEVEL_4:
                sink.section4_();
                break;
            case Sink.SECTION_LEVEL_3:
                sink.section3_();
                break;
            case Sink.SECTION_LEVEL_2:
                sink.section2_();
                break;
            case Sink.SECTION_LEVEL_1:
                sink.section1_();
                break;
            default:
                // no event for any other level, the level is still left below
                break;
        }

        headingLevel--;
    }
}
/**
 * Open missing sections.
 *
 * <p>
 * The heading level is incremented and one section start event is emitted per level, for as long
 * as both the heading level and the depth of the explicit sections are below {@code newLevel}.
 * </p>
 *
 * @param newLevel the new section level, all lower levels have to be opened.
 * @param sink the sink to receive the events.
 */
private void openMissingSections( int newLevel, Sink sink )
{
    while ( headingLevel < newLevel && sectionLevel < newLevel )
    {
        headingLevel++;

        switch ( headingLevel )
        {
            case Sink.SECTION_LEVEL_5:
                sink.section5();
                break;
            case Sink.SECTION_LEVEL_4:
                sink.section4();
                break;
            case Sink.SECTION_LEVEL_3:
                sink.section3();
                break;
            case Sink.SECTION_LEVEL_2:
                sink.section2();
                break;
            case Sink.SECTION_LEVEL_1:
                sink.section1();
                break;
            default:
                // no event for any other level
                break;
        }
    }
}
/**
 * Return the current section level.
 *
 * <p>
 * The value returned is the level tracked for heading-implied sections ({@code headingLevel}),
 * not the nesting depth of explicit {@code <section>} elements.
 * </p>
 *
 * @return the current section level.
 */
protected int getSectionLevel()
{
    return headingLevel;
}
/**
 * Set the current section level.
 *
 * <p>
 * This overwrites the level tracked for heading-implied sections ({@code headingLevel}); no sink
 * events are emitted.
 * </p>
 *
 * @param newLevel the new section level.
 */
protected void setSectionLevel( int newLevel )
{
    headingLevel = newLevel;
}
/**
 * Stop verbatim mode, i.e. clear the verbatim flag.
 */
protected void verbatim_()
{
    inVerbatim = false;
}
/**
 * Start verbatim mode, i.e. raise the verbatim flag.
 */
protected void verbatim()
{
    inVerbatim = true;
}
/**
 * Checks if we are currently inside a &lt;pre&gt; tag.
 *
 * @return true if the verbatim flag is set, i.e. we are currently in verbatim mode.
 */
protected boolean isVerbatim()
{
    return inVerbatim;
}
/**
 * Checks if we are currently inside a &lt;script&gt; or &lt;style&gt; tag.
 *
 * @return true if we are currently inside <code>&lt;script&gt;</code> or
 * <code>&lt;style&gt;</code> tags.
 *
 * @since 1.1.1.
 */
protected boolean isScriptBlock()
{
    return scriptBlock;
}
/**
 * Checks if the given id is a valid Doxia id and if not, returns a transformed one.
 * Every transformation is recorded as a "modifiedLink" message.
 *
 * @param id The id to validate.
 * @return A transformed id or the original id if it was already valid.
 * @see DoxiaUtils#encodeId(String)
 */
protected String validAnchor( String id )
{
    if ( DoxiaUtils.isValidId( id ) )
    {
        return id;
    }

    final String encoded = DoxiaUtils.encodeId( id, true );

    logMessage( "modifiedLink", "Modified invalid link: '" + id + "' to '" + encoded + "'" );

    return encoded;
}
/**
 * {@inheritDoc}
 *
 * <p>
 * In addition to the superclass initialization, this clears the script, link, anchor and
 * verbatim flags, resets the ordered list depth and the heading level to zero and discards
 * any collected warn messages.
 * </p>
 */
@Override
protected void init()
{
    super.init();

    // flags
    scriptBlock = false;
    isLink = false;
    isAnchor = false;

    // counters
    orderedListDepth = 0;
    headingLevel = 0;

    inVerbatim = false;

    // collected messages
    warnMessages = null;
}
/**
 * Closes the link or the anchor that was opened by the matching {@code <a>} start tag.
 * A pending link takes precedence over a pending anchor; if neither is pending, nothing is emitted.
 *
 * @param sink the sink to receive the event.
 */
private void handleAEnd( Sink sink )
{
    if ( isLink )
    {
        sink.link_();
        isLink = false;
        return;
    }

    if ( isAnchor )
    {
        sink.anchor_();
        isAnchor = false;
    }
}
/**
 * Handles an {@code <a>} start tag.
 *
 * <p>
 * With an {@code href} attribute a link is emitted. If the href contains a fragment, is not an
 * external link and the fragment is not a valid id, the fragment is encoded and the change is
 * recorded as a "modifiedLink" message. Without an {@code href}, the {@code name} attribute and,
 * failing that, the {@code id} attribute is emitted as an anchor. If none of the three attributes
 * is present, nothing is emitted.
 * </p>
 *
 * @param parser the parser positioned on the start tag.
 * @param sink the sink to receive the event.
 * @param attribs the attributes of the start tag.
 */
private void handleAStart( XmlPullParser parser, Sink sink, SinkEventAttributeSet attribs )
{
    String href = parser.getAttributeValue( null, Attribute.HREF.toString() );

    if ( href == null )
    {
        // not a link: look for an anchor name, the name attribute wins over the id attribute
        String anchorName = parser.getAttributeValue( null, Attribute.NAME.toString() );

        if ( anchorName == null )
        {
            anchorName = parser.getAttributeValue( null, Attribute.ID.toString() );
        }

        if ( anchorName != null )
        {
            sink.anchor( validAnchor( anchorName ), attribs );
            isAnchor = true;
        }

        return;
    }

    final int hashIndex = href.indexOf( '#' );

    if ( hashIndex != -1 && !DoxiaUtils.isExternalLink( href ) )
    {
        final String fragment = href.substring( hashIndex + 1 );

        if ( !DoxiaUtils.isValidId( fragment ) )
        {
            href = href.substring( 0, hashIndex ) + "#" + DoxiaUtils.encodeId( fragment, true );

            logMessage( "modifiedLink", "Modified invalid link: '" + fragment + "' to '" + href + "'" );
        }
    }

    sink.link( href, attribs );
    isLink = true;
}
/**
 * Handles a {@code <div>} start tag.
 *
 * <p>
 * The value of the {@code class} attribute (possibly null) is remembered on the div stack for the
 * matching end tag. For class "content" a content event is emitted with a copy of the attributes
 * from which the class attribute has been removed. Unless the class is "source", a division event
 * is emitted with the original attributes.
 * </p>
 *
 * @param parser the parser positioned on the start tag.
 * @param attribs the attributes of the start tag.
 * @param sink the sink to receive the events.
 * @return false if the class is "source", true otherwise.
 */
private boolean handleDivStart( XmlPullParser parser, SinkEventAttributeSet attribs, Sink sink )
{
    final String cssClass = parser.getAttributeValue( null, Attribute.CLASS.toString() );

    divStack.push( cssClass );

    if ( "content".equals( cssClass ) )
    {
        final SinkEventAttributeSet contentAttribs = new SinkEventAttributeSet( attribs );
        contentAttribs.removeAttribute( SinkEventAttributes.CLASS );
        sink.content( contentAttribs );
    }

    if ( "source".equals( cssClass ) )
    {
        return false;
    }

    sink.division( attribs );

    return true;
}
/**
 * Handles a {@code <div>} end tag.
 *
 * <p>
 * The class that was remembered by the matching start tag is taken from the div stack. For class
 * "content" the content event is closed. Unless the class is "source", the division event is
 * closed.
 * </p>
 *
 * @param sink the sink to receive the events.
 * @return false if the class is "source", true otherwise.
 */
private boolean handleDivEnd( Sink sink )
{
    final String cssClass = divStack.pop();

    if ( "content".equals( cssClass ) )
    {
        sink.content_();
    }

    if ( "source".equals( cssClass ) )
    {
        return false;
    }

    sink.division_();

    return true;
}
/**
 * Handles an {@code <img/>} tag: emits a figure graphics event for the {@code src} attribute.
 * An image without a {@code src} attribute is silently skipped.
 *
 * @param parser the parser positioned on the start tag.
 * @param sink the sink to receive the event.
 * @param attribs the attributes of the start tag.
 */
private void handleImgStart( XmlPullParser parser, Sink sink, SinkEventAttributeSet attribs )
{
    final String source = parser.getAttributeValue( null, Attribute.SRC.toString() );

    if ( source == null )
    {
        return;
    }

    sink.figureGraphics( source, attribs );
}
/**
 * Handles an {@code <li>} start tag: opens a numbered list item if at least one ordered list is
 * currently open, a plain list item otherwise.
 *
 * @param sink the sink to receive the event.
 * @param attribs the attributes of the start tag.
 */
private void handleLIStart( Sink sink, SinkEventAttributeSet attribs )
{
    final boolean insideOrderedList = orderedListDepth != 0;

    if ( insideOrderedList )
    {
        sink.numberedListItem( attribs );
    }
    else
    {
        sink.listItem( attribs );
    }
}
/**
 * Handles an {@code <li>} end tag: closes a numbered list item if at least one ordered list is
 * currently open, a plain list item otherwise.
 *
 * @param sink the sink to receive the event.
 */
private void handleListItemEnd( Sink sink )
{
    final boolean insideOrderedList = orderedListDepth != 0;

    if ( insideOrderedList )
    {
        sink.numberedListItem_();
    }
    else
    {
        sink.listItem_();
    }
}
/**
 * Handles an {@code <ol>} start tag: opens a numbered list whose numbering is derived from the
 * {@code list-style-type} declaration of the {@code style} attribute, and increments the ordered
 * list depth.
 *
 * @param parser the parser positioned on the start tag.
 * @param sink the sink to receive the event.
 * @param attribs the attributes of the start tag.
 */
private void handleOLStart( XmlPullParser parser, Sink sink, SinkEventAttributeSet attribs )
{
    String style = parser.getAttributeValue( null, Attribute.STYLE.toString() );

    sink.numberedList( numberingFromStyle( style ), attribs );
    orderedListDepth++;
}

/**
 * Derives the Doxia numbering from the content of a {@code style} attribute.
 *
 * <p>
 * The attribute is read as a semicolon-separated list of {@code property: value} declarations.
 * Property names and values are compared case-insensitively and surrounding whitespace is ignored,
 * so that for instance {@code "list-style-type:upper-alpha;"} or
 * {@code "color: red; list-style-type: lower-roman"} are understood as well as the exact form
 * {@code "list-style-type: upper-alpha"}. If several recognized {@code list-style-type}
 * declarations are present, the last one wins. Declarations with a value that has no Doxia
 * counterpart are skipped.
 * </p>
 *
 * @param style the value of the style attribute, may be null.
 * @return the numbering constant to use, {@code Sink.NUMBERING_DECIMAL} if none can be derived.
 */
private static int numberingFromStyle( String style )
{
    int numbering = Sink.NUMBERING_DECIMAL;

    if ( style == null )
    {
        return numbering;
    }

    for ( String declaration : style.split( ";" ) )
    {
        int colon = declaration.indexOf( ':' );

        if ( colon < 0 )
        {
            // not a "property: value" pair
            continue;
        }

        String property = declaration.substring( 0, colon ).trim();

        if ( !"list-style-type".equalsIgnoreCase( property ) )
        {
            continue;
        }

        String value = declaration.substring( colon + 1 ).trim();

        if ( "upper-alpha".equalsIgnoreCase( value ) )
        {
            numbering = Sink.NUMBERING_UPPER_ALPHA;
        }
        else if ( "lower-alpha".equalsIgnoreCase( value ) )
        {
            numbering = Sink.NUMBERING_LOWER_ALPHA;
        }
        else if ( "upper-roman".equalsIgnoreCase( value ) )
        {
            numbering = Sink.NUMBERING_UPPER_ROMAN;
        }
        else if ( "lower-roman".equalsIgnoreCase( value ) )
        {
            numbering = Sink.NUMBERING_LOWER_ROMAN;
        }
        else if ( "decimal".equalsIgnoreCase( value ) )
        {
            numbering = Sink.NUMBERING_DECIMAL;
        }
    }

    return numbering;
}
/**
 * Handles a {@code <p>} start tag by opening a paragraph.
 *
 * @param sink the sink to receive the event.
 * @param attribs the attributes of the start tag.
 */
private void handlePStart( Sink sink, SinkEventAttributeSet attribs )
{
    sink.paragraph( attribs );
}
/*
 * The PRE element tells visual user agents that the enclosed text is
 * "preformatted". When handling preformatted text, visual user agents:
 * - May leave white space intact.
 * - May render text with a fixed-pitch font.
 * - May disable automatic word wrap.
 * - Must not disable bidirectional processing.
 * Non-visual user agents are not required to respect extra white space
 * in the content of a PRE element.
 *
 * Here, the start of a PRE element switches the parser into verbatim mode
 * and opens a verbatim block on the sink.
 */
private void handlePreStart( SinkEventAttributeSet attribs, Sink sink )
{
    // raise the flag first, then notify the sink
    verbatim();

    sink.verbatim( attribs );
}
/**
 * Handles a {@code <section>} start tag: increments the section depth and opens a section of
 * the resulting level.
 *
 * @param sink the sink to receive the event.
 * @param attribs the attributes of the start tag.
 */
private void handleSectionStart( Sink sink, SinkEventAttributeSet attribs )
{
    sectionLevel++;

    sink.section( sectionLevel, attribs );
}
/**
 * Handles the start tag of a heading: makes sure the sections are nested consecutively up to the
 * given level and then opens the section title of that level.
 *
 * @param sink the sink to receive the events.
 * @param level the section level that corresponds to the heading.
 * @param attribs the attributes of the start tag.
 */
private void handleHeadingStart( Sink sink, int level, SinkEventAttributeSet attribs )
{
    // open or close implied sections as needed
    consecutiveSections( level, sink, attribs );

    sink.sectionTitle( level, attribs );
}
/**
 * Handles a {@code <section>} end tag: closes the sections that were implied by headings within
 * this section, resets the heading level, closes the section itself and decrements the section
 * depth.
 *
 * @param sink the sink to receive the events.
 */
private void handleSectionEnd( Sink sink )
{
    closeOpenSections( sectionLevel, sink );
    headingLevel = 0;

    // the depth is decremented before the sink is notified about the level being closed
    final int closedLevel = sectionLevel;
    sectionLevel--;

    sink.section_( closedLevel );
}
/**
 * Handles a {@code <table>} start tag: opens the table and its rows.
 *
 * <p>
 * The grid is switched on if a {@code border} attribute is present and different from "0".
 * The justification is taken from the {@code align} attribute: "center" and "right" are
 * recognized, anything else results in left justification.
 * </p>
 *
 * @param sink the sink to receive the events.
 * @param attribs the attributes of the start tag.
 * @param parser the parser positioned on the start tag.
 */
private void handleTableStart( Sink sink, SinkEventAttributeSet attribs, XmlPullParser parser )
{
    sink.table( attribs );

    final String border = parser.getAttributeValue( null, Attribute.BORDER.toString() );
    final boolean grid = border != null && !"0".equals( border );

    final String align = parser.getAttributeValue( null, Attribute.ALIGN.toString() );
    final int justification;

    if ( "center".equals( align ) )
    {
        justification = Sink.JUSTIFY_CENTER;
    }
    else if ( "right".equals( align ) )
    {
        justification = Sink.JUSTIFY_RIGHT;
    }
    else
    {
        justification = Sink.JUSTIFY_LEFT;
    }

    sink.tableRows( new int[] { justification }, grid );
}
/**
 * If debug mode is enabled, log the prefixed <code>msg</code> immediately at debug level, otherwise
 * remember it in <code>warnMessages</code> under the given key. Messages are kept in a set per key,
 * so an identical message is stored only once.
 *
 * @param key not null
 * @param msg not null
 * @see #parse(Reader, Sink)
 * @since 1.1.1
 */
private void logMessage( String key, String msg )
{
    final String prefixed = "[XHTML Parser] " + msg;

    if ( getLog().isDebugEnabled() )
    {
        getLog().debug( prefixed );

        return;
    }

    if ( warnMessages == null )
    {
        warnMessages = new HashMap<String, Set<String>>();
    }

    Set<String> messages = warnMessages.get( key );

    if ( messages == null )
    {
        // first message for this key
        messages = new TreeSet<String>();
        warnMessages.put( key, messages );
    }

    messages.add( prefixed );
}
/**
 * Logs all collected warn messages at warn level and discards them afterwards. Nothing happens
 * if warnings are disabled, if no message has been collected or during a second parsing.
 *
 * @since 1.1.1
 */
private void logWarnings()
{
    if ( !getLog().isWarnEnabled() || this.warnMessages == null || isSecondParsing() )
    {
        return;
    }

    for ( Set<String> messages : this.warnMessages.values() )
    {
        for ( String message : messages )
        {
            getLog().warn( message );
        }
    }

    this.warnMessages = null;
}
}