doxia-modules/doxia-module-markdown/src/main/java/org/apache/maven/doxia/module/markdown/MarkdownParser.java - maven-doxia - Git at Google

 package org.apache.maven.doxia.module.markdown;

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */

 import com.vladsch.flexmark.Extension;
 import com.vladsch.flexmark.ast.Heading;
 import com.vladsch.flexmark.ast.HtmlCommentBlock;
 import com.vladsch.flexmark.ast.Node;
 import com.vladsch.flexmark.ast.util.TextCollectingVisitor;
 import com.vladsch.flexmark.html.HtmlRenderer;
 import com.vladsch.flexmark.profiles.pegdown.Extensions;
 import com.vladsch.flexmark.profiles.pegdown.PegdownOptionsAdapter;
 import com.vladsch.flexmark.util.options.MutableDataHolder;
 import org.apache.commons.lang.StringEscapeUtils;
 import org.apache.commons.lang.StringUtils;
 import org.apache.maven.doxia.markup.HtmlMarkup;
 import org.apache.maven.doxia.module.xhtml.XhtmlParser;
 import org.apache.maven.doxia.parser.AbstractParser;
 import org.apache.maven.doxia.parser.ParseException;
 import org.apache.maven.doxia.parser.Parser;
 import org.apache.maven.doxia.sink.Sink;
 import org.codehaus.plexus.component.annotations.Component;
 import org.codehaus.plexus.component.annotations.Requirement;
 import org.codehaus.plexus.util.IOUtil;
 import org.codehaus.plexus.util.xml.pull.XmlPullParser;

 import java.io.IOException;
 import java.io.Reader;
 import java.io.StringReader;
 import java.util.ArrayList;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;

 /**
  * Implementation of {@link org.apache.maven.doxia.parser.Parser} for Markdown documents.
  * <p/>
  * Defers effective parsing to the <a href="https://github.com/vsch/flexmark-java">flexmark-java library</a>,
  * which generates HTML content then delegates parsing of this content to a slightly modified Doxia Xhtml parser.
  * (before 1.8, the <a href="http://pegdown.org">PegDown library</a> was used)
  *
  * @author Vladimir Schneider <vladimir@vladsch.com>
  * @author Julien Nicoulaud <julien.nicoulaud@gmail.com>
  * @since 1.3
  */
 @Component( role = Parser.class, hint = "markdown" )
 public class MarkdownParser
     extends AbstractParser
 {

     /**
      * The role hint for the {@link MarkdownParser} Plexus component.
      */
     public static final String ROLE_HINT = "markdown";

     /**
      * Regex that identifies a multimarkdown-style metadata section at the start of the document
      */
     private static final String MULTI_MARKDOWN_METADATA_SECTION =
         "^(((?:[^\\s:][^:]*):(?:.*(?:\r?\n\\p{Blank}+[^\\s].*)*\r?\n))+)(?:\\s*\r?\n)";

     /**
      * Regex that captures the key and value of a multimarkdown-style metadata entry.
      */
     private static final String MULTI_MARKDOWN_METADATA_ENTRY =
         "([^\\s:][^:]*):(.*(?:\r?\n\\p{Blank}+[^\\s].*)*)\r?\n";

     /**
      * In order to ensure that we have minimal risk of false positives when slurping metadata sections, the
      * first key in the metadata section must be one of these standard keys or else the entire metadata section is
      * ignored.
      */
     private static final String[] STANDARD_METADATA_KEYS =
         { "title", "author", "date", "address", "affiliation", "copyright", "email", "keywords", "language", "phone",
             "subtitle" };

     public int getType()
     {
         return TXT_TYPE;
     }

     @Requirement
     private MarkdownHtmlParser parser;

     public void parse( Reader source, Sink sink )
         throws ParseException
     {
         try
         {
             // Markdown to HTML (using flexmark-java library)
             String html = toHtml( source );
             // then HTML to Sink API
             parser.parse( new StringReader( html ), sink );
         }
         catch ( IOException e )
         {
             throw new ParseException( "Failed reading Markdown source document", e );
         }
     }

     /**
      * uses flexmark-java library to parse content and generate HTML output.
      *
      * @param source the Markdown source
      * @return HTML content generated by flexmark-java
      * @throws IOException passed through
      */
     private String toHtml( Reader source )
         throws IOException
     {
         String text = IOUtil.toString( source );
         MutableDataHolder flexmarkOptions = PegdownOptionsAdapter.flexmarkOptions(
                 Extensions.ALL & ~( Extensions.HARDWRAPS | Extensions.ANCHORLINKS ) ).toMutable();
         ArrayList<Extension> extensions = new ArrayList<Extension>();
         for ( Extension extension : flexmarkOptions.get( com.vladsch.flexmark.parser.Parser.EXTENSIONS ) )
         {
             extensions.add( extension );
         }

         extensions.add( FlexmarkDoxiaExtension.create() );
         flexmarkOptions.set( com.vladsch.flexmark.parser.Parser.EXTENSIONS, extensions );
         flexmarkOptions.set( HtmlRenderer.HTML_BLOCK_OPEN_TAG_EOL, false );
         flexmarkOptions.set( HtmlRenderer.HTML_BLOCK_CLOSE_TAG_EOL, false );
         flexmarkOptions.set( HtmlRenderer.MAX_TRAILING_BLANK_LINES, -1 );

         com.vladsch.flexmark.parser.Parser parser = com.vladsch.flexmark.parser.Parser.builder( flexmarkOptions )
                 .build();
         HtmlRenderer renderer = HtmlRenderer.builder( flexmarkOptions ).build();

         StringBuilder html = new StringBuilder( 1000 );
         html.append( "<html>" );
         html.append( "<head>" );
         Pattern metadataPattern = Pattern.compile( MULTI_MARKDOWN_METADATA_SECTION, Pattern.MULTILINE );
         Matcher metadataMatcher = metadataPattern.matcher( text );
         boolean haveTitle = false;
         if ( metadataMatcher.find() )
         {
             metadataPattern = Pattern.compile( MULTI_MARKDOWN_METADATA_ENTRY, Pattern.MULTILINE );
             Matcher lineMatcher = metadataPattern.matcher( metadataMatcher.group( 1 ) );
             boolean first = true;
             while ( lineMatcher.find() )
             {
                 String key = StringUtils.trimToEmpty( lineMatcher.group( 1 ) );
                 if ( first )
                 {
                     boolean found = false;
                     for ( String k : STANDARD_METADATA_KEYS )
                     {
                         if ( k.equalsIgnoreCase( key ) )
                         {
                             found = true;
                             break;
                         }
                     }
                     if ( !found )
                     {
                         break;
                     }
                     first = false;
                 }
                 String value = StringUtils.trimToEmpty( lineMatcher.group( 2 ) );
                 if ( "title".equalsIgnoreCase( key ) )
                 {
                     haveTitle = true;
                     html.append( "<title>" );
                     html.append( StringEscapeUtils.escapeXml( value ) );
                     html.append( "</title>" );
                 }
                 else if ( "author".equalsIgnoreCase( key ) )
                 {
                     html.append( "<meta name=\'author\' content=\'" );
                     html.append( StringEscapeUtils.escapeXml( value ) );
                     html.append( "\' />" );
                 }
                 else if ( "date".equalsIgnoreCase( key ) )
                 {
                     html.append( "<meta name=\'date\' content=\'" );
                     html.append( StringEscapeUtils.escapeXml( value ) );
                     html.append( "\' />" );
                 }
                 else
                 {
                     html.append( "<meta name=\'" );
                     html.append( StringEscapeUtils.escapeXml( key ) );
                     html.append( "\' content=\'" );
                     html.append( StringEscapeUtils.escapeXml( value ) );
                     html.append( "\' />" );
                 }
             }
             if ( !first )
             {
                 text = text.substring( metadataMatcher.end() );
             }
         }

         Node rootNode = parser.parse( text );
         String markdownHtml = renderer.render( rootNode );

         if ( !haveTitle && rootNode.hasChildren() )
         {
             // use the first (non-comment) node only if it is a heading
             Node firstNode = rootNode.getFirstChild();
             while ( firstNode != null && !( firstNode instanceof Heading ) )
             {
                 if ( !( firstNode instanceof HtmlCommentBlock ) )
                 {
                     break;
                 }
                 firstNode = firstNode.getNext();
             }

             if ( firstNode instanceof Heading )
             {
                 html.append( "<title>" );
                 TextCollectingVisitor collectingVisitor = new TextCollectingVisitor();
                 String headingText = collectingVisitor.collectAndGetText( firstNode );
                 html.append( StringEscapeUtils.escapeXml( headingText ) );
                 html.append( "</title>" );
             }
         }
         html.append( "</head>" );
         html.append( "<body>" );
         html.append( markdownHtml );
         html.append( "</body>" );
         html.append( "</html>" );

         return html.toString();
     }

     /**
      * Internal parser for HTML generated by the Markdown library.
      */
     @Component( role = MarkdownHtmlParser.class )
     public static class MarkdownHtmlParser
         extends XhtmlParser
     {
         public MarkdownHtmlParser()
         {
             super();
         }

         @Override
         protected boolean baseEndTag( XmlPullParser parser, Sink sink )
         {
             boolean visited = super.baseEndTag( parser, sink );
             if ( !visited )
             {
                 if ( parser.getName().equals( HtmlMarkup.DIV.toString() ) )
                 {
                     handleUnknown( parser, sink, TAG_TYPE_END );
                     visited = true;
                 }
             }
             return visited;
         }

         @Override
         protected boolean baseStartTag( XmlPullParser parser, Sink sink )
         {
             boolean visited = super.baseStartTag( parser, sink );
             if ( !visited )
             {
                 if ( parser.getName().equals( HtmlMarkup.DIV.toString() ) )
                 {
                     handleUnknown( parser, sink, TAG_TYPE_START );
                     visited = true;
                 }
             }
             return visited;
         }
     }
 }
	package org.apache.maven.doxia.module.markdown;

	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/

	import com.vladsch.flexmark.Extension;
	import com.vladsch.flexmark.ast.Heading;
	import com.vladsch.flexmark.ast.HtmlCommentBlock;
	import com.vladsch.flexmark.ast.Node;
	import com.vladsch.flexmark.ast.util.TextCollectingVisitor;
	import com.vladsch.flexmark.html.HtmlRenderer;
	import com.vladsch.flexmark.profiles.pegdown.Extensions;
	import com.vladsch.flexmark.profiles.pegdown.PegdownOptionsAdapter;
	import com.vladsch.flexmark.util.options.MutableDataHolder;
	import org.apache.commons.lang.StringEscapeUtils;
	import org.apache.commons.lang.StringUtils;
	import org.apache.maven.doxia.markup.HtmlMarkup;
	import org.apache.maven.doxia.module.xhtml.XhtmlParser;
	import org.apache.maven.doxia.parser.AbstractParser;
	import org.apache.maven.doxia.parser.ParseException;
	import org.apache.maven.doxia.parser.Parser;
	import org.apache.maven.doxia.sink.Sink;
	import org.codehaus.plexus.component.annotations.Component;
	import org.codehaus.plexus.component.annotations.Requirement;
	import org.codehaus.plexus.util.IOUtil;
	import org.codehaus.plexus.util.xml.pull.XmlPullParser;

	import java.io.IOException;
	import java.io.Reader;
	import java.io.StringReader;
	import java.util.ArrayList;
	import java.util.regex.Matcher;
	import java.util.regex.Pattern;

	/**
	* Implementation of {@link org.apache.maven.doxia.parser.Parser} for Markdown documents.
	* <p/>
	* Defers effective parsing to the <a href="https://github.com/vsch/flexmark-java">flexmark-java library</a>,
	* which generates HTML content then delegates parsing of this content to a slightly modified Doxia Xhtml parser.
	* (before 1.8, the <a href="http://pegdown.org">PegDown library</a> was used)
	*
	* @author Vladimir Schneider <vladimir@vladsch.com>
	* @author Julien Nicoulaud <julien.nicoulaud@gmail.com>
	* @since 1.3
	*/
	@Component( role = Parser.class, hint = "markdown" )
	public class MarkdownParser
	extends AbstractParser
	{

	/**
	* The role hint for the {@link MarkdownParser} Plexus component.
	*/
	public static final String ROLE_HINT = "markdown";

	/**
	* Regex that identifies a multimarkdown-style metadata section at the start of the document
	*/
	private static final String MULTI_MARKDOWN_METADATA_SECTION =
	"^(((?:[^\\s:][^:]):(?:.(?:\r?\n\\p{Blank}+[^\\s].)\r?\n))+)(?:\\s*\r?\n)";

	/**
	* Regex that captures the key and value of a multimarkdown-style metadata entry.
	*/
	private static final String MULTI_MARKDOWN_METADATA_ENTRY =
	"([^\\s:][^:]):(.(?:\r?\n\\p{Blank}+[^\\s].))\r?\n";

	/**
	* In order to ensure that we have minimal risk of false positives when slurping metadata sections, the
	* first key in the metadata section must be one of these standard keys or else the entire metadata section is
	* ignored.
	*/
	private static final String[] STANDARD_METADATA_KEYS =
	{ "title", "author", "date", "address", "affiliation", "copyright", "email", "keywords", "language", "phone",
	"subtitle" };

	public int getType()
	{
	return TXT_TYPE;
	}

	@Requirement
	private MarkdownHtmlParser parser;

	public void parse( Reader source, Sink sink )
	throws ParseException
	{
	try
	{
	// Markdown to HTML (using flexmark-java library)
	String html = toHtml( source );
	// then HTML to Sink API
	parser.parse( new StringReader( html ), sink );
	}
	catch ( IOException e )
	{
	throw new ParseException( "Failed reading Markdown source document", e );
	}
	}

	/**
	* uses flexmark-java library to parse content and generate HTML output.
	*
	* @param source the Markdown source
	* @return HTML content generated by flexmark-java
	* @throws IOException passed through
	*/
	private String toHtml( Reader source )
	throws IOException
	{
	String text = IOUtil.toString( source );
	MutableDataHolder flexmarkOptions = PegdownOptionsAdapter.flexmarkOptions(
	Extensions.ALL & ~( Extensions.HARDWRAPS \| Extensions.ANCHORLINKS ) ).toMutable();
	ArrayList<Extension> extensions = new ArrayList<Extension>();
	for ( Extension extension : flexmarkOptions.get( com.vladsch.flexmark.parser.Parser.EXTENSIONS ) )
	{
	extensions.add( extension );
	}

	extensions.add( FlexmarkDoxiaExtension.create() );
	flexmarkOptions.set( com.vladsch.flexmark.parser.Parser.EXTENSIONS, extensions );
	flexmarkOptions.set( HtmlRenderer.HTML_BLOCK_OPEN_TAG_EOL, false );
	flexmarkOptions.set( HtmlRenderer.HTML_BLOCK_CLOSE_TAG_EOL, false );
	flexmarkOptions.set( HtmlRenderer.MAX_TRAILING_BLANK_LINES, -1 );

	com.vladsch.flexmark.parser.Parser parser = com.vladsch.flexmark.parser.Parser.builder( flexmarkOptions )
	.build();
	HtmlRenderer renderer = HtmlRenderer.builder( flexmarkOptions ).build();

	StringBuilder html = new StringBuilder( 1000 );
	html.append( "<html>" );
	html.append( "<head>" );
	Pattern metadataPattern = Pattern.compile( MULTI_MARKDOWN_METADATA_SECTION, Pattern.MULTILINE );
	Matcher metadataMatcher = metadataPattern.matcher( text );
	boolean haveTitle = false;
	if ( metadataMatcher.find() )
	{
	metadataPattern = Pattern.compile( MULTI_MARKDOWN_METADATA_ENTRY, Pattern.MULTILINE );
	Matcher lineMatcher = metadataPattern.matcher( metadataMatcher.group( 1 ) );
	boolean first = true;
	while ( lineMatcher.find() )
	{
	String key = StringUtils.trimToEmpty( lineMatcher.group( 1 ) );
	if ( first )
	{
	boolean found = false;
	for ( String k : STANDARD_METADATA_KEYS )
	{
	if ( k.equalsIgnoreCase( key ) )
	{
	found = true;
	break;
	}
	}
	if ( !found )
	{
	break;
	}
	first = false;
	}
	String value = StringUtils.trimToEmpty( lineMatcher.group( 2 ) );
	if ( "title".equalsIgnoreCase( key ) )
	{
	haveTitle = true;
	html.append( "<title>" );
	html.append( StringEscapeUtils.escapeXml( value ) );
	html.append( "</title>" );
	}
	else if ( "author".equalsIgnoreCase( key ) )
	{
	html.append( "<meta name=\'author\' content=\'" );
	html.append( StringEscapeUtils.escapeXml( value ) );
	html.append( "\' />" );
	}
	else if ( "date".equalsIgnoreCase( key ) )
	{
	html.append( "<meta name=\'date\' content=\'" );
	html.append( StringEscapeUtils.escapeXml( value ) );
	html.append( "\' />" );
	}
	else
	{
	html.append( "<meta name=\'" );
	html.append( StringEscapeUtils.escapeXml( key ) );
	html.append( "\' content=\'" );
	html.append( StringEscapeUtils.escapeXml( value ) );
	html.append( "\' />" );
	}
	}
	if ( !first )
	{
	text = text.substring( metadataMatcher.end() );
	}
	}

	Node rootNode = parser.parse( text );
	String markdownHtml = renderer.render( rootNode );

	if ( !haveTitle && rootNode.hasChildren() )
	{
	// use the first (non-comment) node only if it is a heading
	Node firstNode = rootNode.getFirstChild();
	while ( firstNode != null && !( firstNode instanceof Heading ) )
	{
	if ( !( firstNode instanceof HtmlCommentBlock ) )
	{
	break;
	}
	firstNode = firstNode.getNext();
	}

	if ( firstNode instanceof Heading )
	{
	html.append( "<title>" );
	TextCollectingVisitor collectingVisitor = new TextCollectingVisitor();
	String headingText = collectingVisitor.collectAndGetText( firstNode );
	html.append( StringEscapeUtils.escapeXml( headingText ) );
	html.append( "</title>" );
	}
	}
	html.append( "</head>" );
	html.append( "<body>" );
	html.append( markdownHtml );
	html.append( "</body>" );
	html.append( "</html>" );

	return html.toString();
	}

	/**
	* Internal parser for HTML generated by the Markdown library.
	*/
	@Component( role = MarkdownHtmlParser.class )
	public static class MarkdownHtmlParser
	extends XhtmlParser
	{
	public MarkdownHtmlParser()
	{
	super();
	}

	@Override
	protected boolean baseEndTag( XmlPullParser parser, Sink sink )
	{
	boolean visited = super.baseEndTag( parser, sink );
	if ( !visited )
	{
	if ( parser.getName().equals( HtmlMarkup.DIV.toString() ) )
	{
	handleUnknown( parser, sink, TAG_TYPE_END );
	visited = true;
	}
	}
	return visited;
	}

	@Override
	protected boolean baseStartTag( XmlPullParser parser, Sink sink )
	{
	boolean visited = super.baseStartTag( parser, sink );
	if ( !visited )
	{
	if ( parser.getName().equals( HtmlMarkup.DIV.toString() ) )
	{
	handleUnknown( parser, sink, TAG_TYPE_START );
	visited = true;
	}
	}
	return visited;
	}
	}
	}