| package org.apache.maven.doxia.module.markdown; |
| |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| import org.apache.commons.lang.StringEscapeUtils; |
| import org.apache.commons.lang.StringUtils; |
| import org.apache.maven.doxia.markup.HtmlMarkup; |
| import org.apache.maven.doxia.module.xhtml.XhtmlParser; |
| import org.apache.maven.doxia.parser.AbstractParser; |
| import org.apache.maven.doxia.parser.ParseException; |
| import org.apache.maven.doxia.parser.Parser; |
| import org.apache.maven.doxia.sink.Sink; |
| import org.codehaus.plexus.component.annotations.Component; |
| import org.codehaus.plexus.component.annotations.Requirement; |
| import org.codehaus.plexus.util.IOUtil; |
| import org.codehaus.plexus.util.xml.pull.XmlPullParser; |
| import org.pegdown.Extensions; |
| import org.pegdown.PegDownProcessor; |
| import org.pegdown.ast.HeaderNode; |
| import org.pegdown.ast.HtmlBlockNode; |
| import org.pegdown.ast.Node; |
| import org.pegdown.ast.RootNode; |
| import org.pegdown.ast.SuperNode; |
| import org.pegdown.ast.TextNode; |
| |
| import java.io.IOException; |
| import java.io.Reader; |
| import java.io.StringReader; |
| import java.util.regex.Matcher; |
| import java.util.regex.Pattern; |
| |
| /** |
| * Implementation of {@link org.apache.maven.doxia.parser.Parser} for Markdown documents. |
| * <p/> |
| * Defers effective parsing to the <a href="http://pegdown.org">PegDown library</a>, which generates HTML content |
| * then delegates parsing of this content to a slightly modified Doxia Xhtml parser. |
| * |
| * @author Julien Nicoulaud <julien.nicoulaud@gmail.com> |
| * @since 1.3 |
| * @see MarkdownToDoxiaHtmlSerializer |
| */ |
| @Component( role = Parser.class, hint = "markdown" ) |
| public class MarkdownParser |
| extends AbstractParser |
| { |
| |
| /** |
| * The role hint for the {@link MarkdownParser} Plexus component. |
| */ |
| public static final String ROLE_HINT = "markdown"; |
| |
| /** |
| * The {@link PegDownProcessor} used to convert Pegdown documents to HTML. |
| */ |
| protected static final PegDownProcessor PEGDOWN_PROCESSOR = |
| new PegDownProcessor( Extensions.ALL & ~Extensions.HARDWRAPS, Long.MAX_VALUE ); |
| |
| /** |
| * Regex that identifies a multimarkdown-style metadata section at the start of the document |
| */ |
| private static final String MULTI_MARKDOWN_METADATA_SECTION = |
| "^(((?:[^\\s:][^:]*):(?:.*(?:\r?\n\\p{Blank}+[^\\s].*)*\r?\n))+)(?:\\s*\r?\n)"; |
| |
| /** |
| * Regex that captures the key and value of a multimarkdown-style metadata entry. |
| */ |
| private static final String MULTI_MARKDOWN_METADATA_ENTRY = |
| "([^\\s:][^:]*):(.*(?:\r?\n\\p{Blank}+[^\\s].*)*)\r?\n"; |
| |
| /** |
| * In order to ensure that we have minimal risk of false positives when slurping metadata sections, the |
| * first key in the metadata section must be one of these standard keys or else the entire metadata section is |
| * ignored. |
| */ |
| private static final String[] STANDARD_METADATA_KEYS = |
| { "title", "author", "date", "address", "affiliation", "copyright", "email", "keywords", "language", "phone", |
| "subtitle" }; |
| |
| public int getType() |
| { |
| return TXT_TYPE; |
| } |
| |
| @Requirement |
| private PegDownHtmlParser parser; |
| |
| public void parse( Reader source, Sink sink ) |
| throws ParseException |
| { |
| try |
| { |
| // Markdown to HTML (using Pegdown library) |
| String html = toHtml( source ); |
| // then HTML to Sink API |
| parser.parse( new StringReader( html ), sink ); |
| } |
| catch ( IOException e ) |
| { |
| throw new ParseException( "Failed reading Markdown source document", e ); |
| } |
| } |
| |
| /** |
| * uses PegDown library to parse content and generate HTML output. |
| * |
| * @param source the Markdown source |
| * @return HTML content generated by PegDown |
| * @throws IOException |
| * @see MarkdownToDoxiaHtmlSerializer |
| */ |
| private String toHtml( Reader source ) |
| throws IOException |
| { |
| String text = IOUtil.toString( source ); |
| StringBuilder html = new StringBuilder( text.length() * 2 ); |
| html.append( "<html>" ); |
| html.append( "<head>" ); |
| Pattern metadataPattern = Pattern.compile( MULTI_MARKDOWN_METADATA_SECTION, Pattern.MULTILINE ); |
| Matcher metadataMatcher = metadataPattern.matcher( text ); |
| boolean haveTitle = false; |
| if ( metadataMatcher.find() ) |
| { |
| metadataPattern = Pattern.compile( MULTI_MARKDOWN_METADATA_ENTRY, Pattern.MULTILINE ); |
| Matcher lineMatcher = metadataPattern.matcher( metadataMatcher.group( 1 ) ); |
| boolean first = true; |
| while ( lineMatcher.find() ) |
| { |
| String key = StringUtils.trimToEmpty( lineMatcher.group( 1 ) ); |
| if ( first ) |
| { |
| boolean found = false; |
| for ( String k : STANDARD_METADATA_KEYS ) |
| { |
| if ( k.equalsIgnoreCase( key ) ) |
| { |
| found = true; |
| break; |
| } |
| } |
| if ( !found ) |
| { |
| break; |
| } |
| first = false; |
| } |
| String value = StringUtils.trimToEmpty( lineMatcher.group( 2 ) ); |
| if ( "title".equalsIgnoreCase( key ) ) |
| { |
| haveTitle = true; |
| html.append( "<title>" ); |
| html.append( StringEscapeUtils.escapeXml( value ) ); |
| html.append( "</title>" ); |
| } |
| else if ( "author".equalsIgnoreCase( key ) ) |
| { |
| html.append( "<meta name=\'author\' content=\'" ); |
| html.append( StringEscapeUtils.escapeXml( value ) ); |
| html.append( "\' />" ); |
| } |
| else if ( "date".equalsIgnoreCase( key ) ) |
| { |
| html.append( "<meta name=\'date\' content=\'" ); |
| html.append( StringEscapeUtils.escapeXml( value ) ); |
| html.append( "\' />" ); |
| } |
| else |
| { |
| html.append( "<meta name=\'" ); |
| html.append( StringEscapeUtils.escapeXml( key ) ); |
| html.append( "\' content=\'" ); |
| html.append( StringEscapeUtils.escapeXml( value ) ); |
| html.append( "\' />" ); |
| } |
| } |
| if ( !first ) |
| { |
| text = text.substring( metadataMatcher.end() ); |
| } |
| } |
| RootNode rootNode = PEGDOWN_PROCESSOR.parseMarkdown( text.toCharArray() ); |
| if ( !haveTitle && rootNode.getChildren().size() > 0 ) |
| { |
| // use the first (non-comment) node only if it is a heading |
| int i = 0; |
| Node firstNode = null; |
| while ( i < rootNode.getChildren().size() && isHtmlComment( |
| ( firstNode = rootNode.getChildren().get( i ) ) ) ) |
| { |
| i++; |
| } |
| if ( firstNode instanceof HeaderNode ) |
| { |
| html.append( "<title>" ); |
| html.append( StringEscapeUtils.escapeXml( nodeText( firstNode ) ) ); |
| html.append( "</title>" ); |
| } |
| } |
| html.append( "</head>" ); |
| html.append( "<body>" ); |
| html.append( new MarkdownToDoxiaHtmlSerializer().toHtml( rootNode ) ); |
| html.append( "</body>" ); |
| html.append( "</html>" ); |
| |
| return html.toString(); |
| } |
| |
| public static boolean isHtmlComment( Node node ) |
| { |
| if ( node instanceof HtmlBlockNode ) |
| { |
| HtmlBlockNode blockNode = (HtmlBlockNode) node; |
| return blockNode.getText().startsWith( "<!--" ); |
| } |
| return false; |
| } |
| |
| public static String nodeText( Node node ) |
| { |
| StringBuilder builder = new StringBuilder(); |
| if ( node instanceof TextNode ) |
| { |
| builder.append( TextNode.class.cast( node ).getText() ); |
| } |
| else |
| { |
| for ( Node n : node.getChildren() ) |
| { |
| if ( n instanceof TextNode ) |
| { |
| builder.append( TextNode.class.cast( n ).getText() ); |
| } |
| else if ( n instanceof SuperNode ) |
| { |
| builder.append( nodeText( n ) ); |
| } |
| } |
| } |
| return builder.toString(); |
| } |
| |
| /** |
| * Internal parser for HTML generated by PegDown library. |
| */ |
| @Component( role = PegDownHtmlParser.class ) |
| public static class PegDownHtmlParser |
| extends XhtmlParser |
| { |
| public PegDownHtmlParser() |
| { |
| super(); |
| } |
| |
| @Override |
| protected boolean baseEndTag( XmlPullParser parser, Sink sink ) |
| { |
| boolean visited = super.baseEndTag( parser, sink ); |
| if ( !visited ) |
| { |
| if ( parser.getName().equals( HtmlMarkup.DIV.toString() ) ) |
| { |
| handleUnknown( parser, sink, TAG_TYPE_END ); |
| visited = true; |
| } |
| } |
| return visited; |
| } |
| |
| @Override |
| protected boolean baseStartTag( XmlPullParser parser, Sink sink ) |
| { |
| boolean visited = super.baseStartTag( parser, sink ); |
| if ( !visited ) |
| { |
| if ( parser.getName().equals( HtmlMarkup.DIV.toString() ) ) |
| { |
| handleUnknown( parser, sink, TAG_TYPE_START ); |
| visited = true; |
| } |
| } |
| return visited; |
| } |
| } |
| } |