| package org.apache.maven.doxia.parser; |
| |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| import java.io.Reader; |
| import java.util.HashMap; |
| import java.util.Map; |
| import java.util.Set; |
| import java.util.Stack; |
| import java.util.TreeSet; |
| |
| import javax.swing.text.html.HTML.Attribute; |
| |
| import org.apache.maven.doxia.macro.MacroExecutionException; |
| import org.apache.maven.doxia.markup.HtmlMarkup; |
| import org.apache.maven.doxia.sink.Sink; |
| import org.apache.maven.doxia.sink.SinkEventAttributes; |
| import org.apache.maven.doxia.sink.impl.SinkEventAttributeSet; |
| import org.apache.maven.doxia.util.DoxiaUtils; |
| import org.codehaus.plexus.util.StringUtils; |
| import org.codehaus.plexus.util.xml.pull.XmlPullParser; |
| import org.codehaus.plexus.util.xml.pull.XmlPullParserException; |
| |
| /** |
| * Common base parser for xhtml5 events. |
| */ |
| public class Xhtml5BaseParser |
| extends AbstractXmlParser |
| implements HtmlMarkup |
| { |
| /** |
| * True while inside a <script> or <style> element (set on the start tag, cleared on the end tag). |
| * While set, plain text is not forwarded and CDATA sections are sent to the sink as unknown CDATA events. |
| */ |
| private boolean scriptBlock; |
| |
| /** True while inside an <a> element that had an href attribute, i.e. a link that is closed with link_(). */ |
| private boolean isLink; |
| |
| /** True while inside an <a> element without href but with a name or id, i.e. an anchor closed with anchor_(). */ |
| private boolean isAnchor; |
| |
| /** Number of currently open <ol> elements; while non-zero, <li> is emitted as a numbered list item. */ |
| private int orderedListDepth = 0; |
| |
| /** Nesting depth of explicit <section> elements: incremented on the start tag, decremented on the end tag. */ |
| private int sectionLevel; |
| |
| /** Level of the innermost section implied by a heading (<h2> to <h6>); this is what getSectionLevel() returns. */ |
| private int headingLevel; |
| |
| /** Verbatim flag, true whenever we are inside a <pre> tag. */ |
| private boolean inVerbatim; |
| |
| /** Class attribute (possibly null) of every open <div>, popped on </div> to choose the matching closing events. */ |
| private Stack<String> divStack = new Stack<String>(); |
| |
| /** True while a definitionListItem opened by a <dt> is still open, so a following <dd> joins it instead of opening a new one. */ |
| boolean hasDefinitionListItem = false; |
| |
| /** Map of warn messages with a String as key to describe the error type and a Set as value. |
| * Used to collect each distinct message only once; the collected messages are logged when parsing ends. */ |
| private Map<String, Set<String>> warnMessages; |
| |
| /** {@inheritDoc} */ |
| @Override |
| public void parse( Reader source, Sink sink ) |
| throws ParseException |
| { |
| init(); |
| |
| try |
| { |
| super.parse( source, sink ); |
| } |
| finally |
| { |
| logWarnings(); |
| |
| setSecondParsing( false ); |
| init(); |
| } |
| } |
| |
| /** |
| * {@inheritDoc} |
| * |
| * This override only delegates to the super implementation and adds nothing of its own. |
| * NOTE(review): this comment used to state that all XHTML (HTML 5.2) entities are added to the parser here; |
| * nothing in this method does that, so presumably it happens in the super class, if at all - confirm. |
| */ |
| @Override |
| protected void initXmlParser( XmlPullParser parser ) |
| throws XmlPullParserException |
| { |
| super.initXmlParser( parser ); |
| } |
| |
| /** |
| * <p> |
| * Goes through a common list of possible html5 start tags. These include only tags that can go into |
| * the body of an xhtml5 document and so should be re-usable by different xhtml-based parsers. |
| * </p> |
| * <p> |
| * The currently handled tags are: |
| * </p> |
| * <p> |
| * <code> |
| * <article>, <nav>, <aside>, <section>, <h2>, <h3>, <h4>, |
| * <h5>, <h6>, <header>, <main>, <footer>, <em>, <strong>, |
| * <small>, <s>, <cite>, <q>, <dfn>, <abbr>, <i>, |
| * <b>, <code>, <samp>, <kbd>, <sub>, <sup>, <u>, |
| * <mark>, <ruby>, <rb>, <rt>, <rtc>, <rp>, <bdi>, |
| * <bdo>, <span>, <ins>, <del>, <p>, <pre>, <ul>, |
| * <ol>, <li>, <dl>, <dt>, <dd>, <a>, <table>, |
| * <tr>, <th>, <td>, <caption>, <br/>, <wbr/>, <hr/>, |
| * <img/>. |
| * </code> |
| * </p> |
| * |
| * @param parser A parser. |
| * @param sink the sink to receive the events. |
| * @return True if the event has been handled by this method, i.e. the tag was recognized, false otherwise. |
| */ |
| protected boolean baseStartTag( XmlPullParser parser, Sink sink ) |
| { |
| boolean visited = true; |
| |
| SinkEventAttributeSet attribs = getAttributesFromParser( parser ); |
| |
| if ( parser.getName().equals( HtmlMarkup.ARTICLE.toString() ) ) |
| { |
| sink.article( attribs ); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.NAV.toString() ) ) |
| { |
| sink.navigation( attribs ); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.ASIDE.toString() ) ) |
| { |
| sink.sidebar( attribs ); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.SECTION.toString() ) ) |
| { |
| handleSectionStart( sink, attribs ); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.H2.toString() ) ) |
| { |
| handleHeadingStart( sink, Sink.SECTION_LEVEL_1, attribs ); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.H3.toString() ) ) |
| { |
| handleHeadingStart( sink, Sink.SECTION_LEVEL_2, attribs ); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.H4.toString() ) ) |
| { |
| handleHeadingStart( sink, Sink.SECTION_LEVEL_3, attribs ); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.H5.toString() ) ) |
| { |
| handleHeadingStart( sink, Sink.SECTION_LEVEL_4, attribs ); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.H6.toString() ) ) |
| { |
| handleHeadingStart( sink, Sink.SECTION_LEVEL_5, attribs ); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.HEADER.toString() ) ) |
| { |
| sink.header( attribs ); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.MAIN.toString() ) ) |
| { |
| sink.content( attribs ); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.FOOTER.toString() ) ) |
| { |
| sink.footer( attribs ); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.EM.toString() ) ) |
| { |
| attribs.addAttributes( SinkEventAttributeSet.Semantics.EMPHASIS ); |
| sink.inline( attribs ); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.STRONG.toString() ) ) |
| { |
| attribs.addAttributes( SinkEventAttributeSet.Semantics.STRONG ); |
| sink.inline( attribs ); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.SMALL.toString() ) ) |
| { |
| attribs.addAttributes( SinkEventAttributeSet.Semantics.SMALL ); |
| sink.inline( attribs ); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.S.toString() ) ) |
| { |
| attribs.addAttributes( SinkEventAttributeSet.Semantics.LINE_THROUGH ); |
| sink.inline( attribs ); |
| /* deprecated line-through support */ |
| } |
| else if ( parser.getName().equals( HtmlMarkup.CITE.toString() ) ) |
| { |
| attribs.addAttributes( SinkEventAttributeSet.Semantics.CITATION ); |
| sink.inline( attribs ); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.Q.toString() ) ) |
| { |
| attribs.addAttributes( SinkEventAttributeSet.Semantics.QUOTE ); |
| sink.inline( attribs ); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.DFN.toString() ) ) |
| { |
| attribs.addAttributes( SinkEventAttributeSet.Semantics.DEFINITION ); |
| sink.inline( attribs ); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.ABBR.toString() ) ) |
| { |
| attribs.addAttributes( SinkEventAttributeSet.Semantics.ABBREVIATION ); |
| sink.inline( attribs ); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.I.toString() ) ) |
| { |
| attribs.addAttributes( SinkEventAttributeSet.Semantics.ITALIC ); |
| sink.inline( attribs ); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.B.toString() ) ) |
| { |
| attribs.addAttributes( SinkEventAttributeSet.Semantics.BOLD ); |
| sink.inline( attribs ); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.CODE.toString() ) ) |
| { |
| attribs.addAttributes( SinkEventAttributeSet.Semantics.CODE ); |
| sink.inline( attribs ); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.VAR.toString() ) ) |
| { |
| attribs.addAttributes( SinkEventAttributeSet.Semantics.VARIABLE ); |
| sink.inline( attribs ); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.SAMP.toString() ) ) |
| { |
| attribs.addAttributes( SinkEventAttributeSet.Semantics.SAMPLE ); |
| sink.inline( attribs ); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.KBD.toString() ) ) |
| { |
| attribs.addAttributes( SinkEventAttributeSet.Semantics.KEYBOARD ); |
| sink.inline( attribs ); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.SUP.toString() ) ) |
| { |
| attribs.addAttributes( SinkEventAttributeSet.Semantics.SUPERSCRIPT ); |
| sink.inline( attribs ); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.SUB.toString() ) ) |
| { |
| attribs.addAttributes( SinkEventAttributeSet.Semantics.SUBSCRIPT ); |
| sink.inline( attribs ); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.U.toString() ) ) |
| { |
| attribs.addAttributes( SinkEventAttributeSet.Semantics.ANNOTATION ); |
| sink.inline( attribs ); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.MARK.toString() ) ) |
| { |
| attribs.addAttributes( SinkEventAttributeSet.Semantics.HIGHLIGHT ); |
| sink.inline( attribs ); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.RUBY.toString() ) ) |
| { |
| attribs.addAttributes( SinkEventAttributeSet.Semantics.RUBY ); |
| sink.inline( attribs ); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.RB.toString() ) ) |
| { |
| attribs.addAttributes( SinkEventAttributeSet.Semantics.RUBY_BASE ); |
| sink.inline( attribs ); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.RT.toString() ) ) |
| { |
| attribs.addAttributes( SinkEventAttributeSet.Semantics.RUBY_TEXT ); |
| sink.inline( attribs ); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.RTC.toString() ) ) |
| { |
| attribs.addAttributes( SinkEventAttributeSet.Semantics.RUBY_TEXT_CONTAINER ); |
| sink.inline( attribs ); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.RP.toString() ) ) |
| { |
| attribs.addAttributes( SinkEventAttributeSet.Semantics.RUBY_PARANTHESES ); |
| sink.inline( attribs ); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.BDI.toString() ) ) |
| { |
| attribs.addAttributes( SinkEventAttributeSet.Semantics.BIDIRECTIONAL_ISOLATION ); |
| sink.inline( attribs ); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.BDO.toString() ) ) |
| { |
| attribs.addAttributes( SinkEventAttributeSet.Semantics.BIDIRECTIONAL_OVERRIDE ); |
| sink.inline( attribs ); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.SPAN.toString() ) ) |
| { |
| attribs.addAttributes( SinkEventAttributeSet.Semantics.PHRASE ); |
| sink.inline( attribs ); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.INS.toString() ) ) |
| { |
| attribs.addAttributes( SinkEventAttributeSet.Semantics.INSERT ); |
| sink.inline( attribs ); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.DEL.toString() ) ) |
| { |
| attribs.addAttributes( SinkEventAttributeSet.Semantics.DELETE ); |
| sink.inline( attribs ); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.P.toString() ) ) |
| { |
| handlePStart( sink, attribs ); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.DIV.toString() ) ) |
| { |
| handleDivStart( parser, attribs, sink ); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.PRE.toString() ) ) |
| { |
| handlePreStart( attribs, sink ); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.UL.toString() ) ) |
| { |
| sink.list( attribs ); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.OL.toString() ) ) |
| { |
| handleOLStart( parser, sink, attribs ); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.LI.toString() ) ) |
| { |
| handleLIStart( sink, attribs ); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.DL.toString() ) ) |
| { |
| sink.definitionList( attribs ); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.DT.toString() ) ) |
| { |
| if ( hasDefinitionListItem ) |
| { |
| // close previous listItem |
| sink.definitionListItem_(); |
| } |
| sink.definitionListItem( attribs ); |
| hasDefinitionListItem = true; |
| sink.definedTerm( attribs ); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.DD.toString() ) ) |
| { |
| if ( !hasDefinitionListItem ) |
| { |
| sink.definitionListItem( attribs ); |
| } |
| sink.definition( attribs ); |
| } |
| else if ( ( parser.getName().equals( HtmlMarkup.FIGURE.toString() ) ) ) |
| { |
| sink.figure( attribs ); |
| } |
| else if ( ( parser.getName().equals( HtmlMarkup.FIGCAPTION.toString() ) ) ) |
| { |
| sink.figureCaption( attribs ); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.A.toString() ) ) |
| { |
| handleAStart( parser, sink, attribs ); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.TABLE.toString() ) ) |
| { |
| handleTableStart( sink, attribs, parser ); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.TR.toString() ) ) |
| { |
| sink.tableRow( attribs ); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.TH.toString() ) ) |
| { |
| sink.tableHeaderCell( attribs ); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.TD.toString() ) ) |
| { |
| sink.tableCell( attribs ); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.CAPTION.toString() ) ) |
| { |
| sink.tableCaption( attribs ); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.BR.toString() ) ) |
| { |
| sink.lineBreak( attribs ); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.WBR.toString() ) ) |
| { |
| sink.lineBreakOpportunity( attribs ); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.HR.toString() ) ) |
| { |
| sink.horizontalRule( attribs ); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.IMG.toString() ) ) |
| { |
| handleImgStart( parser, sink, attribs ); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.SCRIPT.toString() ) |
| || parser.getName().equals( HtmlMarkup.STYLE.toString() ) ) |
| { |
| handleUnknown( parser, sink, TAG_TYPE_START ); |
| scriptBlock = true; |
| } |
| else |
| { |
| visited = false; |
| } |
| |
| return visited; |
| } |
| |
| /** |
| * <p> |
| * Goes through a common list of possible html end tags. |
| * These should be re-usable by different xhtml-based parsers. |
| * The tags handled here are the same as for {@link #baseStartTag(XmlPullParser,Sink)}, |
| * except for the empty elements ({@code<br/>, <hr/>, <img/>}). |
| * </p> |
| * |
| * @param parser A parser. |
| * @param sink the sink to receive the events. |
| * @return True if the event has been handled by this method, false otherwise. |
| */ |
| protected boolean baseEndTag( XmlPullParser parser, Sink sink ) |
| { |
| boolean visited = true; |
| |
| if ( parser.getName().equals( HtmlMarkup.P.toString() ) ) |
| { |
| sink.paragraph_(); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.DIV.toString() ) ) |
| { |
| handleDivEnd( sink ); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.PRE.toString() ) ) |
| { |
| verbatim_(); |
| |
| sink.verbatim_(); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.UL.toString() ) ) |
| { |
| sink.list_(); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.OL.toString() ) ) |
| { |
| sink.numberedList_(); |
| orderedListDepth--; |
| } |
| else if ( parser.getName().equals( HtmlMarkup.LI.toString() ) ) |
| { |
| handleListItemEnd( sink ); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.DL.toString() ) ) |
| { |
| if ( hasDefinitionListItem ) |
| { |
| sink.definitionListItem_(); |
| hasDefinitionListItem = false; |
| } |
| sink.definitionList_(); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.DT.toString() ) ) |
| { |
| sink.definedTerm_(); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.DD.toString() ) ) |
| { |
| sink.definition_(); |
| sink.definitionListItem_(); |
| hasDefinitionListItem = false; |
| } |
| else if ( ( parser.getName().equals( HtmlMarkup.FIGURE.toString() ) ) ) |
| { |
| sink.figure_(); |
| } |
| else if ( ( parser.getName().equals( HtmlMarkup.FIGCAPTION.toString() ) ) ) |
| { |
| sink.figureCaption_(); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.A.toString() ) ) |
| { |
| handleAEnd( sink ); |
| } |
| |
| else if ( parser.getName().equals( HtmlMarkup.EM.toString() ) ) |
| { |
| sink.inline_(); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.STRONG.toString() ) ) |
| { |
| sink.inline_(); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.SMALL.toString() ) ) |
| { |
| sink.inline_(); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.S.toString() ) ) |
| { |
| sink.inline_(); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.CITE.toString() ) ) |
| { |
| sink.inline_(); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.Q.toString() ) ) |
| { |
| sink.inline_(); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.DFN.toString() ) ) |
| { |
| sink.inline_(); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.ABBR.toString() ) ) |
| { |
| sink.inline_(); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.I.toString() ) ) |
| { |
| sink.inline_(); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.B.toString() ) ) |
| { |
| sink.inline_(); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.CODE.toString() ) ) |
| { |
| sink.inline_(); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.VAR.toString() ) ) |
| { |
| sink.inline_(); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.SAMP.toString() ) ) |
| { |
| sink.inline_(); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.KBD.toString() ) ) |
| { |
| sink.inline_(); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.SUP.toString() ) ) |
| { |
| sink.inline_(); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.SUB.toString() ) ) |
| { |
| sink.inline_(); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.U.toString() ) ) |
| { |
| sink.inline_(); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.MARK.toString() ) ) |
| { |
| sink.inline_(); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.RUBY.toString() ) ) |
| { |
| sink.inline_(); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.RB.toString() ) ) |
| { |
| sink.inline_(); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.RT.toString() ) ) |
| { |
| sink.inline_(); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.RTC.toString() ) ) |
| { |
| sink.inline_(); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.RP.toString() ) ) |
| { |
| sink.inline_(); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.BDI.toString() ) ) |
| { |
| sink.inline_(); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.BDO.toString() ) ) |
| { |
| sink.inline_(); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.SPAN.toString() ) ) |
| { |
| sink.inline_(); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.INS.toString() ) ) |
| { |
| sink.inline_(); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.DEL.toString() ) ) |
| { |
| sink.inline_(); |
| } |
| |
| // ---------------------------------------------------------------------- |
| // Tables |
| // ---------------------------------------------------------------------- |
| |
| else if ( parser.getName().equals( HtmlMarkup.TABLE.toString() ) ) |
| { |
| sink.tableRows_(); |
| |
| sink.table_(); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.TR.toString() ) ) |
| { |
| sink.tableRow_(); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.TH.toString() ) ) |
| { |
| sink.tableHeaderCell_(); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.TD.toString() ) ) |
| { |
| sink.tableCell_(); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.CAPTION.toString() ) ) |
| { |
| sink.tableCaption_(); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.ARTICLE.toString() ) ) |
| { |
| sink.article_(); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.NAV.toString() ) ) |
| { |
| sink.navigation_(); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.ASIDE.toString() ) ) |
| { |
| sink.sidebar_(); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.SECTION.toString() ) ) |
| { |
| handleSectionEnd( sink ); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.H2.toString() ) ) |
| { |
| sink.sectionTitle1_(); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.H3.toString() ) ) |
| { |
| sink.sectionTitle2_(); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.H4.toString() ) ) |
| { |
| sink.sectionTitle3_(); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.H5.toString() ) ) |
| { |
| sink.sectionTitle4_(); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.H6.toString() ) ) |
| { |
| sink.sectionTitle5_(); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.HEADER.toString() ) ) |
| { |
| sink.header_(); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.MAIN.toString() ) ) |
| { |
| sink.content_(); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.FOOTER.toString() ) ) |
| { |
| sink.footer_(); |
| } |
| else if ( parser.getName().equals( HtmlMarkup.SCRIPT.toString() ) |
| || parser.getName().equals( HtmlMarkup.STYLE.toString() ) ) |
| { |
| handleUnknown( parser, sink, TAG_TYPE_END ); |
| |
| scriptBlock = false; |
| } |
| else |
| { |
| visited = false; |
| } |
| |
| return visited; |
| } |
| |
| /** |
| * {@inheritDoc} |
| * |
| * Just calls {@link #baseStartTag(XmlPullParser,Sink)}, this should be |
| * overridden by implementing parsers to include additional tags. |
| */ |
| protected void handleStartTag( XmlPullParser parser, Sink sink ) |
| throws XmlPullParserException, MacroExecutionException |
| { |
| if ( !baseStartTag( parser, sink ) ) |
| { |
| if ( getLog().isWarnEnabled() ) |
| { |
| String position = "[" + parser.getLineNumber() + ":" |
| + parser.getColumnNumber() + "]"; |
| String tag = "<" + parser.getName() + ">"; |
| |
| getLog().warn( "Unrecognized xml tag: " + tag + " at " + position ); |
| } |
| } |
| } |
| |
| /** |
| * {@inheritDoc} |
| * |
| * Just calls {@link #baseEndTag(XmlPullParser,Sink)}, this should be |
| * overridden by implementing parsers to include additional tags. |
| */ |
| protected void handleEndTag( XmlPullParser parser, Sink sink ) |
| throws XmlPullParserException, MacroExecutionException |
| { |
| if ( !baseEndTag( parser, sink ) ) |
| { |
| // unrecognized tag is already logged in StartTag |
| } |
| } |
| |
| /** {@inheritDoc} */ |
| @Override |
| protected void handleText( XmlPullParser parser, Sink sink ) |
| throws XmlPullParserException |
| { |
| String text = getText( parser ); |
| |
| /* |
| * NOTE: Don't do any whitespace trimming here. Whitespace normalization has already been performed by the |
| * parser so any whitespace that makes it here is significant. |
| * |
| * NOTE: text within script tags is ignored, scripting code should be embedded in CDATA. |
| */ |
| if ( StringUtils.isNotEmpty( text ) && !isScriptBlock() ) |
| { |
| sink.text( text ); |
| } |
| } |
| |
| /** {@inheritDoc} */ |
| @Override |
| protected void handleComment( XmlPullParser parser, Sink sink ) |
| throws XmlPullParserException |
| { |
| String text = getText( parser ); |
| |
| if ( "PB".equals( text.trim() ) ) |
| { |
| sink.pageBreak(); |
| } |
| else |
| { |
| if ( isEmitComments() ) |
| { |
| sink.comment( text ); |
| } |
| } |
| } |
| |
| /** {@inheritDoc} */ |
| @Override |
| protected void handleCdsect( XmlPullParser parser, Sink sink ) |
| throws XmlPullParserException |
| { |
| String text = getText( parser ); |
| |
| if ( isScriptBlock() ) |
| { |
| sink.unknown( CDATA, new Object[] { Integer.valueOf( CDATA_TYPE ), text}, null ); |
| } |
| else |
| { |
| sink.text( text ); |
| } |
| } |
| |
| /** |
| * Make sure sections are nested consecutively. |
| * |
| * <p> |
| * HTML5 heading tags H1 to H6 imply sections where they are not |
| * present, that means we have to open close any sections that |
| * are missing in between. |
| * </p> |
| * |
| * <p> |
| * For instance, if the following sequence is parsed: |
| * </p> |
| * <pre> |
| * <h3></h3> |
| * <h6></h6> |
| * </pre> |
| * <p> |
| * we have to insert two section starts before we open the <code><h6></code>. |
| * In the following sequence |
| * </p> |
| * <pre> |
| * <h6></h6> |
| * <h3></h3> |
| * </pre> |
| * <p> |
| * we have to close two sections before we open the <code><h3></code>. |
| * </p> |
| * |
| * <p>The current level is set to newLevel afterwards.</p> |
| * |
| * @param newLevel the new section level, all upper levels have to be closed. |
| * @param sink the sink to receive the events. |
| */ |
| protected void consecutiveSections( int newLevel, Sink sink, SinkEventAttributeSet attribs ) |
| { |
| closeOpenSections( newLevel, sink ); |
| openMissingSections( newLevel, sink ); |
| |
| this.headingLevel = newLevel; |
| } |
| |
| /** |
| * Close open sections. |
| * |
| * @param newLevel the new section level, all upper levels have to be closed. |
| * @param sink the sink to receive the events. |
| */ |
| private void closeOpenSections( int newLevel, Sink sink ) |
| { |
| while ( this.headingLevel >= newLevel |
| && this.sectionLevel < headingLevel ) |
| { |
| if ( headingLevel == Sink.SECTION_LEVEL_5 ) |
| { |
| sink.section5_(); |
| } |
| else if ( headingLevel == Sink.SECTION_LEVEL_4 ) |
| { |
| sink.section4_(); |
| } |
| else if ( headingLevel == Sink.SECTION_LEVEL_3 ) |
| { |
| sink.section3_(); |
| } |
| else if ( headingLevel == Sink.SECTION_LEVEL_2 ) |
| { |
| sink.section2_(); |
| } |
| else if ( headingLevel == Sink.SECTION_LEVEL_1 ) |
| { |
| sink.section1_(); |
| } |
| |
| this.headingLevel--; |
| } |
| } |
| |
| /** |
| * Open missing sections. |
| * |
| * @param newLevel the new section level, all lower levels have to be opened. |
| * @param sink the sink to receive the events. |
| */ |
| private void openMissingSections( int newLevel, Sink sink ) |
| { |
| while ( this.headingLevel < newLevel |
| && this.sectionLevel < newLevel ) |
| { |
| this.headingLevel++; |
| |
| if ( headingLevel == Sink.SECTION_LEVEL_5 ) |
| { |
| sink.section5(); |
| } |
| else if ( headingLevel == Sink.SECTION_LEVEL_4 ) |
| { |
| sink.section4(); |
| } |
| else if ( headingLevel == Sink.SECTION_LEVEL_3 ) |
| { |
| sink.section3(); |
| } |
| else if ( headingLevel == Sink.SECTION_LEVEL_2 ) |
| { |
| sink.section2(); |
| } |
| else if ( headingLevel == Sink.SECTION_LEVEL_1 ) |
| { |
| sink.section1(); |
| } |
| } |
| } |
| |
| /** |
| * Return the current section level. |
| * |
| * @return the current section level. |
| */ |
| protected int getSectionLevel() |
| { |
| return this.headingLevel; |
| } |
| |
| /** |
| * Set the current section level. |
| * |
| * @param newLevel the new section level. |
| */ |
| protected void setSectionLevel( int newLevel ) |
| { |
| this.headingLevel = newLevel; |
| } |
| |
| /** |
| * Stop verbatim mode. |
| */ |
| protected void verbatim_() |
| { |
| this.inVerbatim = false; |
| } |
| |
| /** |
| * Start verbatim mode. |
| */ |
| protected void verbatim() |
| { |
| this.inVerbatim = true; |
| } |
| |
| /** |
| * Checks if we are currently inside a <pre> tag. |
| * |
| * @return true if we are currently in verbatim mode. |
| */ |
| protected boolean isVerbatim() |
| { |
| return this.inVerbatim; |
| } |
| |
| /** |
| * Checks if we are currently inside a <script> tag. |
| * |
| * @return true if we are currently inside <code><script></code> tags. |
| * |
| * @since 1.1.1. |
| */ |
| protected boolean isScriptBlock() |
| { |
| return this.scriptBlock; |
| } |
| |
| /** |
| * Checks if the given id is a valid Doxia id and if not, returns a transformed one. |
| * |
| * @param id The id to validate. |
| * @return A transformed id or the original id if it was already valid. |
| * @see DoxiaUtils#encodeId(String) |
| */ |
| protected String validAnchor( String id ) |
| { |
| if ( !DoxiaUtils.isValidId( id ) ) |
| { |
| String linkAnchor = DoxiaUtils.encodeId( id, true ); |
| |
| String msg = "Modified invalid link: '" + id + "' to '" + linkAnchor + "'"; |
| logMessage( "modifiedLink", msg ); |
| |
| return linkAnchor; |
| } |
| |
| return id; |
| } |
| |
| /** {@inheritDoc} */ |
| @Override |
| protected void init() |
| { |
| super.init(); |
| |
| this.scriptBlock = false; |
| this.isLink = false; |
| this.isAnchor = false; |
| this.orderedListDepth = 0; |
| this.headingLevel = 0; |
| this.inVerbatim = false; |
| this.warnMessages = null; |
| } |
| |
| private void handleAEnd( Sink sink ) |
| { |
| if ( isLink ) |
| { |
| sink.link_(); |
| isLink = false; |
| } |
| else if ( isAnchor ) |
| { |
| sink.anchor_(); |
| isAnchor = false; |
| } |
| } |
| |
| private void handleAStart( XmlPullParser parser, Sink sink, SinkEventAttributeSet attribs ) |
| { |
| String href = parser.getAttributeValue( null, Attribute.HREF.toString() ); |
| |
| if ( href != null ) |
| { |
| int hashIndex = href.indexOf( '#' ); |
| if ( hashIndex != -1 && !DoxiaUtils.isExternalLink( href ) ) |
| { |
| String hash = href.substring( hashIndex + 1 ); |
| |
| if ( !DoxiaUtils.isValidId( hash ) ) |
| { |
| href = href.substring( 0, hashIndex ) + "#" + DoxiaUtils.encodeId( hash, true ); |
| |
| String msg = "Modified invalid link: '" + hash + "' to '" + href + "'"; |
| logMessage( "modifiedLink", msg ); |
| } |
| } |
| sink.link( href, attribs ); |
| isLink = true; |
| } |
| else |
| { |
| String name = parser.getAttributeValue( null, Attribute.NAME.toString() ); |
| |
| if ( name != null ) |
| { |
| sink.anchor( validAnchor( name ), attribs ); |
| isAnchor = true; |
| } |
| else |
| { |
| String id = parser.getAttributeValue( null, Attribute.ID.toString() ); |
| if ( id != null ) |
| { |
| sink.anchor( validAnchor( id ), attribs ); |
| isAnchor = true; |
| } |
| } |
| } |
| } |
| |
| private boolean handleDivStart( XmlPullParser parser, SinkEventAttributeSet attribs, Sink sink ) |
| { |
| String divclass = parser.getAttributeValue( null, Attribute.CLASS.toString() ); |
| |
| this.divStack.push( divclass ); |
| |
| if ( "content".equals( divclass ) ) |
| { |
| SinkEventAttributeSet atts = new SinkEventAttributeSet( attribs ); |
| atts.removeAttribute( SinkEventAttributes.CLASS ); |
| sink.content( atts ); |
| } |
| if ( "source".equals( divclass ) ) |
| { |
| return false; |
| } |
| else |
| { |
| sink.division( attribs ); |
| } |
| |
| return true; |
| } |
| |
| private boolean handleDivEnd( Sink sink ) |
| { |
| String divclass = divStack.pop(); |
| |
| if ( "content".equals( divclass ) ) |
| { |
| sink.content_(); |
| } |
| if ( "source".equals( divclass ) ) |
| { |
| return false; |
| } |
| else |
| { |
| sink.division_(); |
| } |
| |
| return true; |
| } |
| |
| private void handleImgStart( XmlPullParser parser, Sink sink, SinkEventAttributeSet attribs ) |
| { |
| String src = parser.getAttributeValue( null, Attribute.SRC.toString() ); |
| |
| if ( src != null ) |
| { |
| sink.figureGraphics( src, attribs ); |
| } |
| } |
| |
| private void handleLIStart( Sink sink, SinkEventAttributeSet attribs ) |
| { |
| if ( orderedListDepth == 0 ) |
| { |
| sink.listItem( attribs ); |
| } |
| else |
| { |
| sink.numberedListItem( attribs ); |
| } |
| } |
| |
| private void handleListItemEnd( Sink sink ) |
| { |
| if ( orderedListDepth == 0 ) |
| { |
| sink.listItem_(); |
| } |
| else |
| { |
| sink.numberedListItem_(); |
| } |
| } |
| |
| private void handleOLStart( XmlPullParser parser, Sink sink, SinkEventAttributeSet attribs ) |
| { |
| int numbering = Sink.NUMBERING_DECIMAL; |
| // this will have to be generalized if we handle styles |
| String style = parser.getAttributeValue( null, Attribute.STYLE.toString() ); |
| |
| if ( style != null ) |
| { |
| if ( "list-style-type: upper-alpha".equals( style ) ) |
| { |
| numbering = Sink.NUMBERING_UPPER_ALPHA; |
| } |
| else if ( "list-style-type: lower-alpha".equals( style ) ) |
| { |
| numbering = Sink.NUMBERING_LOWER_ALPHA; |
| } |
| else if ( "list-style-type: upper-roman".equals( style ) ) |
| { |
| numbering = Sink.NUMBERING_UPPER_ROMAN; |
| } |
| else if ( "list-style-type: lower-roman".equals( style ) ) |
| { |
| numbering = Sink.NUMBERING_LOWER_ROMAN; |
| } |
| else if ( "list-style-type: decimal".equals( style ) ) |
| { |
| numbering = Sink.NUMBERING_DECIMAL; |
| } |
| } |
| |
| sink.numberedList( numbering, attribs ); |
| orderedListDepth++; |
| } |
| |
| private void handlePStart( Sink sink, SinkEventAttributeSet attribs ) |
| { |
| sink.paragraph( attribs ); |
| } |
| |
| /* |
| * The PRE element tells visual user agents that the enclosed text is |
| * "preformatted". When handling preformatted text, visual user agents: |
| * - May leave white space intact. |
| * - May render text with a fixed-pitch font. |
| * - May disable automatic word wrap. |
| * - Must not disable bidirectional processing. |
| * Non-visual user agents are not required to respect extra white space |
| * in the content of a PRE element. |
| */ |
| private void handlePreStart( SinkEventAttributeSet attribs, Sink sink ) |
| { |
| verbatim(); |
| sink.verbatim( attribs ); |
| } |
| |
| private void handleSectionStart( Sink sink, SinkEventAttributeSet attribs ) |
| { |
| sink.section( ++sectionLevel, attribs ); |
| } |
| |
| private void handleHeadingStart( Sink sink, int level, SinkEventAttributeSet attribs ) |
| { |
| consecutiveSections( level, sink, attribs ); |
| sink.sectionTitle( level, attribs ); |
| } |
| |
| private void handleSectionEnd( Sink sink ) |
| { |
| closeOpenSections( sectionLevel, sink ); |
| this.headingLevel = 0; |
| |
| sink.section_( sectionLevel-- ); |
| } |
| |
| private void handleTableStart( Sink sink, SinkEventAttributeSet attribs, XmlPullParser parser ) |
| { |
| sink.table( attribs ); |
| String border = parser.getAttributeValue( null, Attribute.BORDER.toString() ); |
| boolean grid = true; |
| |
| if ( border == null || "0".equals( border ) ) |
| { |
| grid = false; |
| } |
| |
| String align = parser.getAttributeValue( null, Attribute.ALIGN.toString() ); |
| int[] justif = {Sink.JUSTIFY_LEFT}; |
| |
| if ( "center".equals( align ) ) |
| { |
| justif[0] = Sink.JUSTIFY_CENTER; |
| } |
| else if ( "right".equals( align ) ) |
| { |
| justif[0] = Sink.JUSTIFY_RIGHT; |
| } |
| |
| sink.tableRows( justif, grid ); |
| } |
| |
| /** |
| * If debug mode is enabled, log the <code>msg</code> as is, otherwise add unique msg in <code>warnMessages</code>. |
| * |
| * @param key not null |
| * @param msg not null |
| * @see #parse(Reader, Sink) |
| * @since 1.1.1 |
| */ |
| private void logMessage( String key, String msg ) |
| { |
| final String log = "[XHTML Parser] " + msg; |
| if ( getLog().isDebugEnabled() ) |
| { |
| getLog().debug( log ); |
| |
| return; |
| } |
| |
| if ( warnMessages == null ) |
| { |
| warnMessages = new HashMap<String, Set<String>>(); |
| } |
| |
| Set<String> set = warnMessages.get( key ); |
| if ( set == null ) |
| { |
| set = new TreeSet<String>(); |
| } |
| set.add( log ); |
| warnMessages.put( key, set ); |
| } |
| |
| /** |
| * @since 1.1.1 |
| */ |
| private void logWarnings() |
| { |
| if ( getLog().isWarnEnabled() && this.warnMessages != null && !isSecondParsing() ) |
| { |
| for ( Map.Entry<String, Set<String>> entry : this.warnMessages.entrySet() ) |
| { |
| for ( String msg : entry.getValue() ) |
| { |
| getLog().warn( msg ); |
| } |
| } |
| |
| this.warnMessages = null; |
| } |
| } |
| } |