blob: 3b59556c0e6af7146c71ae99da7b9703af8b6c44 [file] [log] [blame]
package org.apache.maven.doxia.module.markdown;
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
import org.apache.commons.lang.StringEscapeUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.maven.doxia.markup.HtmlMarkup;
import org.apache.maven.doxia.module.xhtml.XhtmlParser;
import org.apache.maven.doxia.parser.AbstractParser;
import org.apache.maven.doxia.parser.ParseException;
import org.apache.maven.doxia.parser.Parser;
import org.apache.maven.doxia.sink.Sink;
import org.codehaus.plexus.component.annotations.Component;
import org.codehaus.plexus.component.annotations.Requirement;
import org.codehaus.plexus.util.IOUtil;
import org.codehaus.plexus.util.xml.pull.XmlPullParser;
import org.pegdown.Extensions;
import org.pegdown.PegDownProcessor;
import org.pegdown.ast.HeaderNode;
import org.pegdown.ast.HtmlBlockNode;
import org.pegdown.ast.Node;
import org.pegdown.ast.RootNode;
import org.pegdown.ast.SuperNode;
import org.pegdown.ast.TextNode;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Implementation of {@link org.apache.maven.doxia.parser.Parser} for Markdown documents.
 * <p>
 * Defers effective parsing to the <a href="https://github.com/sirthias/pegdown">PegDown library</a>, which
 * generates HTML content, then delegates parsing of this content to a slightly modified Doxia Xhtml parser.
 *
 * @author Julien Nicoulaud
 * @since 1.3
 * @see MarkdownToDoxiaHtmlSerializer
 */
@Component( role = Parser.class, hint = "markdown" )
public class MarkdownParser
extends AbstractParser
/**
 * The role hint for the {@link MarkdownParser} Plexus component.
 */
public static final String ROLE_HINT = "markdown";
/**
 * The {@link PegDownProcessor} used to convert Pegdown documents to HTML.
 */
protected static final PegDownProcessor PEGDOWN_PROCESSOR =
new PegDownProcessor( Extensions.ALL & ~Extensions.HARDWRAPS, Long.MAX_VALUE );
/**
 * Regex that identifies a multimarkdown-style metadata section at the start of the document.
 */
private static final String MULTI_MARKDOWN_METADATA_SECTION =
/**
 * Regex that captures the key and value of a multimarkdown-style metadata entry.
 */
private static final String MULTI_MARKDOWN_METADATA_ENTRY =
/**
 * In order to ensure that we have minimal risk of false positives when slurping metadata sections, the
 * first key in the metadata section must be one of these standard keys or else the entire metadata section is
 * ignored.
 */
private static final String[] STANDARD_METADATA_KEYS =
{ "title", "author", "date", "address", "affiliation", "copyright", "email", "keywords", "language", "phone",
"subtitle" };
public int getType()
return TXT_TYPE;
/**
 * Parser used by {@code parse} to turn the HTML generated by PegDown into Sink events.
 * It is never assigned in this class: the instance is expected to be injected by Plexus.
 */
@Requirement
private PegDownHtmlParser parser;
public void parse( Reader source, Sink sink )
throws ParseException
// Markdown to HTML (using Pegdown library)
String html = toHtml( source );
// then HTML to Sink API
parser.parse( new StringReader( html ), sink );
catch ( IOException e )
throw new ParseException( "Failed reading Markdown source document", e );
* uses PegDown library to parse content and generate HTML output.
* @param source the Markdown source
* @return HTML content generated by PegDown
* @throws IOException
* @see MarkdownToDoxiaHtmlSerializer
private String toHtml( Reader source )
throws IOException
String text = IOUtil.toString( source );
StringBuilder html = new StringBuilder( text.length() * 2 );
html.append( "<html>" );
html.append( "<head>" );
Pattern metadataPattern = Pattern.compile( MULTI_MARKDOWN_METADATA_SECTION, Pattern.MULTILINE );
Matcher metadataMatcher = metadataPattern.matcher( text );
boolean haveTitle = false;
if ( metadataMatcher.find() )
metadataPattern = Pattern.compile( MULTI_MARKDOWN_METADATA_ENTRY, Pattern.MULTILINE );
Matcher lineMatcher = metadataPattern.matcher( 1 ) );
boolean first = true;
while ( lineMatcher.find() )
String key = StringUtils.trimToEmpty( 1 ) );
if ( first )
boolean found = false;
if ( k.equalsIgnoreCase( key ) )
found = true;
if ( !found )
first = false;
String value = StringUtils.trimToEmpty( 2 ) );
if ( "title".equalsIgnoreCase( key ) )
haveTitle = true;
html.append( "<title>" );
html.append( StringEscapeUtils.escapeXml( value ) );
html.append( "</title>" );
else if ( "author".equalsIgnoreCase( key ) )
html.append( "<meta name=\'author\' content=\'" );
html.append( StringEscapeUtils.escapeXml( value ) );
html.append( "\' />" );
else if ( "date".equalsIgnoreCase( key ) )
html.append( "<meta name=\'date\' content=\'" );
html.append( StringEscapeUtils.escapeXml( value ) );
html.append( "\' />" );
html.append( "<meta name=\'" );
html.append( StringEscapeUtils.escapeXml( key ) );
html.append( "\' content=\'" );
html.append( StringEscapeUtils.escapeXml( value ) );
html.append( "\' />" );
if ( !first )
text = text.substring( metadataMatcher.end() );
RootNode rootNode = PEGDOWN_PROCESSOR.parseMarkdown( text.toCharArray() );
if ( !haveTitle && rootNode.getChildren().size() > 0 )
// use the first (non-comment) node only if it is a heading
int i = 0;
Node firstNode = null;
while ( i < rootNode.getChildren().size() && isHtmlComment(
( firstNode = rootNode.getChildren().get( i ) ) ) )
if ( firstNode instanceof HeaderNode )
html.append( "<title>" );
html.append( StringEscapeUtils.escapeXml( nodeText( firstNode ) ) );
html.append( "</title>" );
html.append( "</head>" );
html.append( "<body>" );
html.append( new MarkdownToDoxiaHtmlSerializer().toHtml( rootNode ) );
html.append( "</body>" );
html.append( "</html>" );
return html.toString();
public static boolean isHtmlComment( Node node )
if ( node instanceof HtmlBlockNode )
HtmlBlockNode blockNode = (HtmlBlockNode) node;
return blockNode.getText().startsWith( "<!--" );
return false;
public static String nodeText( Node node )
StringBuilder builder = new StringBuilder();
if ( node instanceof TextNode )
builder.append( TextNode.class.cast( node ).getText() );
for ( Node n : node.getChildren() )
if ( n instanceof TextNode )
builder.append( TextNode.class.cast( n ).getText() );
else if ( n instanceof SuperNode )
builder.append( nodeText( n ) );
return builder.toString();
* Internal parser for HTML generated by PegDown library.
@Component( role = PegDownHtmlParser.class )
public static class PegDownHtmlParser
extends XhtmlParser
public PegDownHtmlParser()
protected boolean baseEndTag( XmlPullParser parser, Sink sink )
boolean visited = super.baseEndTag( parser, sink );
if ( !visited )
if ( parser.getName().equals( HtmlMarkup.DIV.toString() ) )
handleUnknown( parser, sink, TAG_TYPE_END );
visited = true;
return visited;
protected boolean baseStartTag( XmlPullParser parser, Sink sink )
boolean visited = super.baseStartTag( parser, sink );
if ( !visited )
if ( parser.getName().equals( HtmlMarkup.DIV.toString() ) )
handleUnknown( parser, sink, TAG_TYPE_START );
visited = true;
return visited;