// doxia-modules/doxia-module-markdown/src/main/java/org/apache/maven/doxia/module/markdown/MarkdownParser.java - maven-doxia - Git at Google

 package org.apache.maven.doxia.module.markdown;

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */

 import org.apache.commons.lang.StringEscapeUtils;
 import org.apache.commons.lang.StringUtils;
 import org.apache.maven.doxia.markup.HtmlMarkup;
 import org.apache.maven.doxia.module.xhtml.XhtmlParser;
 import org.apache.maven.doxia.parser.AbstractParser;
 import org.apache.maven.doxia.parser.ParseException;
 import org.apache.maven.doxia.parser.Parser;
 import org.apache.maven.doxia.sink.Sink;
 import org.codehaus.plexus.component.annotations.Component;
 import org.codehaus.plexus.component.annotations.Requirement;
 import org.codehaus.plexus.util.IOUtil;
 import org.codehaus.plexus.util.xml.pull.XmlPullParser;
 import org.pegdown.Extensions;
 import org.pegdown.PegDownProcessor;
 import org.pegdown.ast.HeaderNode;
 import org.pegdown.ast.HtmlBlockNode;
 import org.pegdown.ast.Node;
 import org.pegdown.ast.RootNode;
 import org.pegdown.ast.SuperNode;
 import org.pegdown.ast.TextNode;

 import java.io.IOException;
 import java.io.Reader;
 import java.io.StringReader;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;

 /**
  * Implementation of {@link org.apache.maven.doxia.parser.Parser} for Markdown documents.
  * <p/>
  * Defers effective parsing to the <a href="http://pegdown.org">PegDown library</a>, which generates HTML content
  * then delegates parsing of this content to a slightly modified Doxia Xhtml parser.
  * <p/>
  * An optional MultiMarkdown-style metadata section at the start of the document is turned into
  * <code>&lt;title&gt;</code> and <code>&lt;meta&gt;</code> elements of the generated HTML head and removed
  * from the Markdown body.
  *
  * @author Julien Nicoulaud <julien.nicoulaud@gmail.com>
  * @since 1.3
  * @see MarkdownToDoxiaHtmlSerializer
  */
 @Component( role = Parser.class, hint = "markdown" )
 public class MarkdownParser
     extends AbstractParser
 {

     /**
      * The role hint for the {@link MarkdownParser} Plexus component.
      */
     public static final String ROLE_HINT = "markdown";

     /**
      * The {@link PegDownProcessor} used to convert Pegdown documents to HTML.
      * <p/>
      * The instance is shared by all parsers. PegDown documents its processors as not thread-safe, so this
      * class synchronizes on the instance while parsing.
      */
     protected static final PegDownProcessor PEGDOWN_PROCESSOR =
         new PegDownProcessor( Extensions.ALL & ~Extensions.HARDWRAPS, Long.MAX_VALUE );

     /**
      * Regex that identifies a multimarkdown-style metadata section at the start of the document
      */
     private static final String MULTI_MARKDOWN_METADATA_SECTION =
         "^(((?:[^\\s:][^:]*):(?:.*(?:\r?\n\\p{Blank}+[^\\s].*)*\r?\n))+)(?:\\s*\r?\n)";

     /**
      * Regex that captures the key and value of a multimarkdown-style metadata entry.
      */
     private static final String MULTI_MARKDOWN_METADATA_ENTRY =
         "([^\\s:][^:]*):(.*(?:\r?\n\\p{Blank}+[^\\s].*)*)\r?\n";

     /**
      * Compiled form of {@link #MULTI_MARKDOWN_METADATA_SECTION}, built once instead of on every parse.
      */
     private static final Pattern METADATA_SECTION_PATTERN =
         Pattern.compile( MULTI_MARKDOWN_METADATA_SECTION, Pattern.MULTILINE );

     /**
      * Compiled form of {@link #MULTI_MARKDOWN_METADATA_ENTRY}, built once instead of on every parse.
      */
     private static final Pattern METADATA_ENTRY_PATTERN =
         Pattern.compile( MULTI_MARKDOWN_METADATA_ENTRY, Pattern.MULTILINE );

     /**
      * In order to ensure that we have minimal risk of false positives when slurping metadata sections, the
      * first key in the metadata section must be one of these standard keys or else the entire metadata section is
      * ignored.
      */
     private static final String[] STANDARD_METADATA_KEYS =
         { "title", "author", "date", "address", "affiliation", "copyright", "email", "keywords", "language", "phone",
             "subtitle" };

     /**
      * Returns the parser type of this parser.
      *
      * @return always <code>TXT_TYPE</code>
      */
     public int getType()
     {
         return TXT_TYPE;
     }

     /**
      * Parser that turns the HTML generated from the Markdown source into Sink events.
      */
     @Requirement
     private PegDownHtmlParser parser;

     /**
      * Converts the Markdown document to HTML, then feeds that HTML to the internal HTML parser so that it
      * emits the corresponding events on the given sink.
      *
      * @param source the Markdown source
      * @param sink the sink receiving the parsed document
      * @throws ParseException if reading the source fails or the delegate parser rejects the generated HTML
      */
     public void parse( Reader source, Sink sink )
         throws ParseException
     {
         try
         {
             // Markdown to HTML (using Pegdown library)
             String html = toHtml( source );
             // then HTML to Sink API
             parser.parse( new StringReader( html ), sink );
         }
         catch ( IOException e )
         {
             throw new ParseException( "Failed reading Markdown source document", e );
         }
     }

     /**
      * uses PegDown library to parse content and generate HTML output.
      * <p/>
      * A leading metadata section whose first key is one of {@link #STANDARD_METADATA_KEYS} is written to the
      * HTML head and stripped from the Markdown text. When the metadata provides no title, the first
      * non-comment node of the document is used as title if it is a heading.
      *
      * @param source the Markdown source
      * @return HTML content generated by PegDown
      * @throws IOException if the source cannot be read
      * @see MarkdownToDoxiaHtmlSerializer
      */
     private String toHtml( Reader source )
         throws IOException
     {
         String text = IOUtil.toString( source );
         StringBuilder html = new StringBuilder( text.length() * 2 );
         html.append( "<html>" );
         html.append( "<head>" );

         boolean haveTitle = false;
         Matcher metadataMatcher = METADATA_SECTION_PATTERN.matcher( text );
         // With MULTILINE, '^' also matches after every line terminator, so find() can hit a metadata-looking
         // paragraph in the middle of the document. Only honour a section preceded by nothing but whitespace:
         // stripping a later match below would silently discard all the content in front of it.
         if ( metadataMatcher.find() && StringUtils.isBlank( text.substring( 0, metadataMatcher.start() ) ) )
         {
             Matcher entryMatcher = METADATA_ENTRY_PATTERN.matcher( metadataMatcher.group( 1 ) );
             boolean accepted = false;
             while ( entryMatcher.find() )
             {
                 String key = StringUtils.trimToEmpty( entryMatcher.group( 1 ) );
                 if ( !accepted )
                 {
                     // the very first key decides whether the whole section is treated as metadata
                     if ( !isStandardMetadataKey( key ) )
                     {
                         break;
                     }
                     accepted = true;
                 }
                 String value = StringUtils.trimToEmpty( entryMatcher.group( 2 ) );
                 if ( "title".equalsIgnoreCase( key ) )
                 {
                     haveTitle = true;
                     appendTitle( html, value );
                 }
                 else if ( "author".equalsIgnoreCase( key ) )
                 {
                     // well-known keys are emitted with a normalized lower-case name
                     appendMeta( html, "author", value );
                 }
                 else if ( "date".equalsIgnoreCase( key ) )
                 {
                     appendMeta( html, "date", value );
                 }
                 else
                 {
                     appendMeta( html, key, value );
                 }
             }
             if ( accepted )
             {
                 // the metadata has been moved to the head: keep it out of the rendered body
                 text = text.substring( metadataMatcher.end() );
             }
         }

         RootNode rootNode;
         // the processor is shared and not thread-safe: never let two threads parse with it concurrently
         synchronized ( PEGDOWN_PROCESSOR )
         {
             rootNode = PEGDOWN_PROCESSOR.parseMarkdown( text.toCharArray() );
         }

         if ( !haveTitle )
         {
             // use the first (non-comment) node only if it is a heading
             for ( Node child : rootNode.getChildren() )
             {
                 if ( isHtmlComment( child ) )
                 {
                     continue;
                 }
                 if ( child instanceof HeaderNode )
                 {
                     appendTitle( html, nodeText( child ) );
                 }
                 break;
             }
         }
         html.append( "</head>" );
         html.append( "<body>" );
         html.append( new MarkdownToDoxiaHtmlSerializer().toHtml( rootNode ) );
         html.append( "</body>" );
         html.append( "</html>" );

         return html.toString();
     }

     /**
      * Tells whether the given key is one of the {@link #STANDARD_METADATA_KEYS}, ignoring case.
      *
      * @param key the trimmed metadata key
      * @return <code>true</code> if the key is a standard metadata key
      */
     private static boolean isStandardMetadataKey( String key )
     {
         for ( String standardKey : STANDARD_METADATA_KEYS )
         {
             if ( standardKey.equalsIgnoreCase( key ) )
             {
                 return true;
             }
         }
         return false;
     }

     /**
      * Appends a <code>&lt;title&gt;</code> element with the XML-escaped title to the buffer.
      *
      * @param html the buffer receiving the element
      * @param title the raw title text
      */
     private static void appendTitle( StringBuilder html, String title )
     {
         html.append( "<title>" );
         html.append( StringEscapeUtils.escapeXml( title ) );
         html.append( "</title>" );
     }

     /**
      * Appends a <code>&lt;meta&gt;</code> element with XML-escaped name and content to the buffer.
      *
      * @param html the buffer receiving the element
      * @param name the raw value of the <code>name</code> attribute
      * @param content the raw value of the <code>content</code> attribute
      */
     private static void appendMeta( StringBuilder html, String name, String content )
     {
         html.append( "<meta name='" );
         html.append( StringEscapeUtils.escapeXml( name ) );
         html.append( "' content='" );
         html.append( StringEscapeUtils.escapeXml( content ) );
         html.append( "' />" );
     }

     /**
      * Tells whether the given node is an HTML block whose text starts with <code>&lt;!--</code>.
      *
      * @param node the node to check
      * @return <code>true</code> if the node is an {@link HtmlBlockNode} starting with a comment opener
      */
     public static boolean isHtmlComment( Node node )
     {
         if ( node instanceof HtmlBlockNode )
         {
             HtmlBlockNode blockNode = (HtmlBlockNode) node;
             return blockNode.getText().startsWith( "<!--" );
         }
         return false;
     }

     /**
      * Collects the plain text of a node.
      * <p/>
      * For a {@link TextNode} this is its own text. For any other node the text of its {@link TextNode}
      * children is concatenated, recursing into {@link SuperNode} children; children of other types
      * contribute nothing.
      *
      * @param node the node to extract the text from
      * @return the concatenated text, possibly empty
      */
     public static String nodeText( Node node )
     {
         StringBuilder builder = new StringBuilder();
         if ( node instanceof TextNode )
         {
             builder.append( TextNode.class.cast( node ).getText() );
         }
         else
         {
             for ( Node n : node.getChildren() )
             {
                 if ( n instanceof TextNode )
                 {
                     builder.append( TextNode.class.cast( n ).getText() );
                 }
                 else if ( n instanceof SuperNode )
                 {
                     builder.append( nodeText( n ) );
                 }
             }
         }
         return builder.toString();
     }

     /**
      * Internal parser for HTML generated by PegDown library.
      * <p/>
      * Behaves like {@link XhtmlParser}, except that <code>div</code> start and end tags the base parser did
      * not handle are passed to <code>handleUnknown</code> and reported as visited.
      */
     @Component( role = PegDownHtmlParser.class )
     public static class PegDownHtmlParser
         extends XhtmlParser
     {
         public PegDownHtmlParser()
         {
             super();
         }

         @Override
         protected boolean baseEndTag( XmlPullParser parser, Sink sink )
         {
             boolean visited = super.baseEndTag( parser, sink );
             if ( !visited && isDiv( parser ) )
             {
                 handleUnknown( parser, sink, TAG_TYPE_END );
                 visited = true;
             }
             return visited;
         }

         @Override
         protected boolean baseStartTag( XmlPullParser parser, Sink sink )
         {
             boolean visited = super.baseStartTag( parser, sink );
             if ( !visited && isDiv( parser ) )
             {
                 handleUnknown( parser, sink, TAG_TYPE_START );
                 visited = true;
             }
             return visited;
         }

         /**
          * Tells whether the tag the parser is currently positioned on is a <code>div</code>.
          *
          * @param parser the pull parser positioned on a start or end tag
          * @return <code>true</code> if the current tag name equals the <code>div</code> markup name
          */
         private static boolean isDiv( XmlPullParser parser )
         {
             return parser.getName().equals( HtmlMarkup.DIV.toString() );
         }
     }
 }
	package org.apache.maven.doxia.module.markdown;

	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/

	import org.apache.commons.lang.StringEscapeUtils;
	import org.apache.commons.lang.StringUtils;
	import org.apache.maven.doxia.markup.HtmlMarkup;
	import org.apache.maven.doxia.module.xhtml.XhtmlParser;
	import org.apache.maven.doxia.parser.AbstractParser;
	import org.apache.maven.doxia.parser.ParseException;
	import org.apache.maven.doxia.parser.Parser;
	import org.apache.maven.doxia.sink.Sink;
	import org.codehaus.plexus.component.annotations.Component;
	import org.codehaus.plexus.component.annotations.Requirement;
	import org.codehaus.plexus.util.IOUtil;
	import org.codehaus.plexus.util.xml.pull.XmlPullParser;
	import org.pegdown.Extensions;
	import org.pegdown.PegDownProcessor;
	import org.pegdown.ast.HeaderNode;
	import org.pegdown.ast.HtmlBlockNode;
	import org.pegdown.ast.Node;
	import org.pegdown.ast.RootNode;
	import org.pegdown.ast.SuperNode;
	import org.pegdown.ast.TextNode;

	import java.io.IOException;
	import java.io.Reader;
	import java.io.StringReader;
	import java.util.regex.Matcher;
	import java.util.regex.Pattern;

	/**
	 * Implementation of {@link org.apache.maven.doxia.parser.Parser} for Markdown documents.
	 * <p/>
	 * Defers effective parsing to the <a href="http://pegdown.org">PegDown library</a>, which generates HTML content
	 * then delegates parsing of this content to a slightly modified Doxia Xhtml parser.
	 * <p/>
	 * An optional MultiMarkdown-style metadata section at the start of the document is turned into
	 * <code>&lt;title&gt;</code> and <code>&lt;meta&gt;</code> elements of the generated HTML head and removed
	 * from the Markdown body.
	 *
	 * @author Julien Nicoulaud <julien.nicoulaud@gmail.com>
	 * @since 1.3
	 * @see MarkdownToDoxiaHtmlSerializer
	 */
	@Component( role = Parser.class, hint = "markdown" )
	public class MarkdownParser
	    extends AbstractParser
	{

	    /**
	     * The role hint for the {@link MarkdownParser} Plexus component.
	     */
	    public static final String ROLE_HINT = "markdown";

	    /**
	     * The {@link PegDownProcessor} used to convert Pegdown documents to HTML.
	     * <p/>
	     * The instance is shared by all parsers. PegDown documents its processors as not thread-safe, so this
	     * class synchronizes on the instance while parsing.
	     */
	    protected static final PegDownProcessor PEGDOWN_PROCESSOR =
	        new PegDownProcessor( Extensions.ALL & ~Extensions.HARDWRAPS, Long.MAX_VALUE );

	    /**
	     * Regex that identifies a multimarkdown-style metadata section at the start of the document.
	     * <p/>
	     * A section is one or more <code>key: value</code> entries, each optionally continued on following
	     * lines that start with blanks, terminated by an empty line.
	     */
	    private static final String MULTI_MARKDOWN_METADATA_SECTION =
	        "^(((?:[^\\s:][^:]*):(?:.*(?:\r?\n\\p{Blank}+[^\\s].*)*\r?\n))+)(?:\\s*\r?\n)";

	    /**
	     * Regex that captures the key and value of a multimarkdown-style metadata entry.
	     */
	    private static final String MULTI_MARKDOWN_METADATA_ENTRY =
	        "([^\\s:][^:]*):(.*(?:\r?\n\\p{Blank}+[^\\s].*)*)\r?\n";

	    /**
	     * Compiled form of {@link #MULTI_MARKDOWN_METADATA_SECTION}, built once instead of on every parse.
	     */
	    private static final Pattern METADATA_SECTION_PATTERN =
	        Pattern.compile( MULTI_MARKDOWN_METADATA_SECTION, Pattern.MULTILINE );

	    /**
	     * Compiled form of {@link #MULTI_MARKDOWN_METADATA_ENTRY}, built once instead of on every parse.
	     */
	    private static final Pattern METADATA_ENTRY_PATTERN =
	        Pattern.compile( MULTI_MARKDOWN_METADATA_ENTRY, Pattern.MULTILINE );

	    /**
	     * In order to ensure that we have minimal risk of false positives when slurping metadata sections, the
	     * first key in the metadata section must be one of these standard keys or else the entire metadata section is
	     * ignored.
	     */
	    private static final String[] STANDARD_METADATA_KEYS =
	        { "title", "author", "date", "address", "affiliation", "copyright", "email", "keywords", "language", "phone",
	            "subtitle" };

	    /**
	     * Returns the parser type of this parser.
	     *
	     * @return always <code>TXT_TYPE</code>
	     */
	    public int getType()
	    {
	        return TXT_TYPE;
	    }

	    /**
	     * Parser that turns the HTML generated from the Markdown source into Sink events.
	     */
	    @Requirement
	    private PegDownHtmlParser parser;

	    /**
	     * Converts the Markdown document to HTML, then feeds that HTML to the internal HTML parser so that it
	     * emits the corresponding events on the given sink.
	     *
	     * @param source the Markdown source
	     * @param sink the sink receiving the parsed document
	     * @throws ParseException if reading the source fails or the delegate parser rejects the generated HTML
	     */
	    public void parse( Reader source, Sink sink )
	        throws ParseException
	    {
	        try
	        {
	            // Markdown to HTML (using Pegdown library)
	            String html = toHtml( source );
	            // then HTML to Sink API
	            parser.parse( new StringReader( html ), sink );
	        }
	        catch ( IOException e )
	        {
	            throw new ParseException( "Failed reading Markdown source document", e );
	        }
	    }

	    /**
	     * uses PegDown library to parse content and generate HTML output.
	     * <p/>
	     * A leading metadata section whose first key is one of {@link #STANDARD_METADATA_KEYS} is written to the
	     * HTML head and stripped from the Markdown text. When the metadata provides no title, the first
	     * non-comment node of the document is used as title if it is a heading.
	     *
	     * @param source the Markdown source
	     * @return HTML content generated by PegDown
	     * @throws IOException if the source cannot be read
	     * @see MarkdownToDoxiaHtmlSerializer
	     */
	    private String toHtml( Reader source )
	        throws IOException
	    {
	        String text = IOUtil.toString( source );
	        StringBuilder html = new StringBuilder( text.length() * 2 );
	        html.append( "<html>" );
	        html.append( "<head>" );

	        boolean haveTitle = false;
	        Matcher metadataMatcher = METADATA_SECTION_PATTERN.matcher( text );
	        // With MULTILINE, '^' also matches after every line terminator, so find() can hit a metadata-looking
	        // paragraph in the middle of the document. Only honour a section preceded by nothing but whitespace:
	        // stripping a later match below would silently discard all the content in front of it.
	        if ( metadataMatcher.find() && StringUtils.isBlank( text.substring( 0, metadataMatcher.start() ) ) )
	        {
	            Matcher entryMatcher = METADATA_ENTRY_PATTERN.matcher( metadataMatcher.group( 1 ) );
	            boolean accepted = false;
	            while ( entryMatcher.find() )
	            {
	                String key = StringUtils.trimToEmpty( entryMatcher.group( 1 ) );
	                if ( !accepted )
	                {
	                    // the very first key decides whether the whole section is treated as metadata
	                    if ( !isStandardMetadataKey( key ) )
	                    {
	                        break;
	                    }
	                    accepted = true;
	                }
	                String value = StringUtils.trimToEmpty( entryMatcher.group( 2 ) );
	                if ( "title".equalsIgnoreCase( key ) )
	                {
	                    haveTitle = true;
	                    appendTitle( html, value );
	                }
	                else if ( "author".equalsIgnoreCase( key ) )
	                {
	                    // well-known keys are emitted with a normalized lower-case name
	                    appendMeta( html, "author", value );
	                }
	                else if ( "date".equalsIgnoreCase( key ) )
	                {
	                    appendMeta( html, "date", value );
	                }
	                else
	                {
	                    appendMeta( html, key, value );
	                }
	            }
	            if ( accepted )
	            {
	                // the metadata has been moved to the head: keep it out of the rendered body
	                text = text.substring( metadataMatcher.end() );
	            }
	        }

	        RootNode rootNode;
	        // the processor is shared and not thread-safe: never let two threads parse with it concurrently
	        synchronized ( PEGDOWN_PROCESSOR )
	        {
	            rootNode = PEGDOWN_PROCESSOR.parseMarkdown( text.toCharArray() );
	        }

	        if ( !haveTitle )
	        {
	            // use the first (non-comment) node only if it is a heading
	            for ( Node child : rootNode.getChildren() )
	            {
	                if ( isHtmlComment( child ) )
	                {
	                    continue;
	                }
	                if ( child instanceof HeaderNode )
	                {
	                    appendTitle( html, nodeText( child ) );
	                }
	                break;
	            }
	        }
	        html.append( "</head>" );
	        html.append( "<body>" );
	        html.append( new MarkdownToDoxiaHtmlSerializer().toHtml( rootNode ) );
	        html.append( "</body>" );
	        html.append( "</html>" );

	        return html.toString();
	    }

	    /**
	     * Tells whether the given key is one of the {@link #STANDARD_METADATA_KEYS}, ignoring case.
	     *
	     * @param key the trimmed metadata key
	     * @return <code>true</code> if the key is a standard metadata key
	     */
	    private static boolean isStandardMetadataKey( String key )
	    {
	        for ( String standardKey : STANDARD_METADATA_KEYS )
	        {
	            if ( standardKey.equalsIgnoreCase( key ) )
	            {
	                return true;
	            }
	        }
	        return false;
	    }

	    /**
	     * Appends a <code>&lt;title&gt;</code> element with the XML-escaped title to the buffer.
	     *
	     * @param html the buffer receiving the element
	     * @param title the raw title text
	     */
	    private static void appendTitle( StringBuilder html, String title )
	    {
	        html.append( "<title>" );
	        html.append( StringEscapeUtils.escapeXml( title ) );
	        html.append( "</title>" );
	    }

	    /**
	     * Appends a <code>&lt;meta&gt;</code> element with XML-escaped name and content to the buffer.
	     *
	     * @param html the buffer receiving the element
	     * @param name the raw value of the <code>name</code> attribute
	     * @param content the raw value of the <code>content</code> attribute
	     */
	    private static void appendMeta( StringBuilder html, String name, String content )
	    {
	        html.append( "<meta name='" );
	        html.append( StringEscapeUtils.escapeXml( name ) );
	        html.append( "' content='" );
	        html.append( StringEscapeUtils.escapeXml( content ) );
	        html.append( "' />" );
	    }

	    /**
	     * Tells whether the given node is an HTML block whose text starts with <code>&lt;!--</code>.
	     *
	     * @param node the node to check
	     * @return <code>true</code> if the node is an {@link HtmlBlockNode} starting with a comment opener
	     */
	    public static boolean isHtmlComment( Node node )
	    {
	        if ( node instanceof HtmlBlockNode )
	        {
	            HtmlBlockNode blockNode = (HtmlBlockNode) node;
	            return blockNode.getText().startsWith( "<!--" );
	        }
	        return false;
	    }

	    /**
	     * Collects the plain text of a node.
	     * <p/>
	     * For a {@link TextNode} this is its own text. For any other node the text of its {@link TextNode}
	     * children is concatenated, recursing into {@link SuperNode} children; children of other types
	     * contribute nothing.
	     *
	     * @param node the node to extract the text from
	     * @return the concatenated text, possibly empty
	     */
	    public static String nodeText( Node node )
	    {
	        StringBuilder builder = new StringBuilder();
	        if ( node instanceof TextNode )
	        {
	            builder.append( TextNode.class.cast( node ).getText() );
	        }
	        else
	        {
	            for ( Node n : node.getChildren() )
	            {
	                if ( n instanceof TextNode )
	                {
	                    builder.append( TextNode.class.cast( n ).getText() );
	                }
	                else if ( n instanceof SuperNode )
	                {
	                    builder.append( nodeText( n ) );
	                }
	            }
	        }
	        return builder.toString();
	    }

	    /**
	     * Internal parser for HTML generated by PegDown library.
	     * <p/>
	     * Behaves like {@link XhtmlParser}, except that <code>div</code> start and end tags the base parser did
	     * not handle are passed to <code>handleUnknown</code> and reported as visited.
	     */
	    @Component( role = PegDownHtmlParser.class )
	    public static class PegDownHtmlParser
	        extends XhtmlParser
	    {
	        public PegDownHtmlParser()
	        {
	            super();
	        }

	        @Override
	        protected boolean baseEndTag( XmlPullParser parser, Sink sink )
	        {
	            boolean visited = super.baseEndTag( parser, sink );
	            if ( !visited && isDiv( parser ) )
	            {
	                handleUnknown( parser, sink, TAG_TYPE_END );
	                visited = true;
	            }
	            return visited;
	        }

	        @Override
	        protected boolean baseStartTag( XmlPullParser parser, Sink sink )
	        {
	            boolean visited = super.baseStartTag( parser, sink );
	            if ( !visited && isDiv( parser ) )
	            {
	                handleUnknown( parser, sink, TAG_TYPE_START );
	                visited = true;
	            }
	            return visited;
	        }

	        /**
	         * Tells whether the tag the parser is currently positioned on is a <code>div</code>.
	         *
	         * @param parser the pull parser positioned on a start or end tag
	         * @return <code>true</code> if the current tag name equals the <code>div</code> markup name
	         */
	        private static boolean isDiv( XmlPullParser parser )
	        {
	            return parser.getName().equals( HtmlMarkup.DIV.toString() );
	        }
	    }
	}