Make metadata identification less greedy, and use the first node as heading if it is a header and there is no other header available

git-svn-id: https://svn.apache.org/repos/asf/maven/doxia/doxia/trunk@1541436 13f79535-47bb-0310-9956-ffa450edef68

commit: 4c433bdbc75c1af78735bf2d2a804cc732a4abb8 [log] [tgz]
author: Stephen Connolly <stephenc@apache.org> Wed Nov 13 09:29:47 2013 +0000
committer: Stephen Connolly <stephenc@apache.org> Wed Nov 13 09:29:47 2013 +0000
tree: f4017967964471a31cf4f80b85e5fbd4e5fa425a
parent: 35e482beb9d803066c2b1f288f6d814558514539 [diff]
diff --git a/doxia-modules/doxia-module-markdown/src/main/java/org/apache/maven/doxia/module/markdown/MarkdownParser.java b/doxia-modules/doxia-module-markdown/src/main/java/org/apache/maven/doxia/module/markdown/MarkdownParser.java
index d46fab6..22f94fe 100644
--- a/doxia-modules/doxia-module-markdown/src/main/java/org/apache/maven/doxia/module/markdown/MarkdownParser.java
+++ b/doxia-modules/doxia-module-markdown/src/main/java/org/apache/maven/doxia/module/markdown/MarkdownParser.java

@@ -29,7 +29,11 @@
 import org.codehaus.plexus.util.IOUtil;
 import org.pegdown.Extensions;
 import org.pegdown.PegDownProcessor;
+import org.pegdown.ast.HeaderNode;
+import org.pegdown.ast.Node;
 import org.pegdown.ast.RootNode;
+import org.pegdown.ast.SuperNode;
+import org.pegdown.ast.TextNode;
 
 import java.io.IOException;
 import java.io.Reader;
@@ -65,13 +69,22 @@
      * Regex that identifies a multimarkdown-style metadata section at the start of the document
      */
     private static final String MULTI_MARKDOWN_METADATA_SECTION =
-        "^((?:[^\\s:][^:]*):(?:.*(?:\r?\n\\s[^\\s].*)*\r?\n))+(?:\\s*\r?\n)";
+        "^(((?:[^\\s:][^:]*):(?:.*(?:\r?\n\\s[^\\s].*)*\r?\n))+)(?:\\s*\r?\n)";
 
     /**
      * Regex that captures the key and value of a multimarkdown-style metadata entry.
      */
     private static final String MULTI_MARKDOWN_METADATA_ENTRY = "([^\\s:][^:]*):(.*(?:\r?\n\\s[^\\s].*)*)\r?\n";
 
+    /**
+     * To minimize the risk of false positives when detecting a metadata section, the first key in the section
+     * must match one of these standard keys (compared case-insensitively); otherwise the entire section is
+     * ignored and left in the document text.
+     */
+    private static final String[] STANDARD_METADATA_KEYS =
+        { "title", "author", "date", "address", "affiliation", "copyright", "email", "keywords", "language", "phone",
+            "subtitle" };
+
 
     /**
      * {@inheritDoc}
@@ -88,38 +101,80 @@
             html.append( "<head>" );
             Pattern metadataPattern = Pattern.compile( MULTI_MARKDOWN_METADATA_SECTION, Pattern.MULTILINE );
             Matcher metadataMatcher = metadataPattern.matcher( text );
+            boolean haveTitle = false;
             if ( metadataMatcher.find() )
             {
                 metadataPattern = Pattern.compile( MULTI_MARKDOWN_METADATA_ENTRY, Pattern.MULTILINE );
-                for ( int i = 1; i <= metadataMatcher.groupCount(); i++ )
+                Matcher lineMatcher = metadataPattern.matcher( metadataMatcher.group( 1 ) );
+                boolean first = true;
+                while ( lineMatcher.find() )
                 {
-                    String line = metadataMatcher.group( i );
-                    Matcher lineMatcher = metadataPattern.matcher( line );
-                    if ( lineMatcher.matches() )
+                    String key = StringUtils.trimToEmpty( lineMatcher.group( 1 ) );
+                    if ( first )
                     {
-                        String key = StringUtils.trimToEmpty( lineMatcher.group( 1 ) );
-                        String value = StringUtils.trimToEmpty( lineMatcher.group( 2 ) );
-                        if ( "title".equalsIgnoreCase( key ) )
+                        boolean found = false;
+                        for ( String k : STANDARD_METADATA_KEYS )
                         {
-                            html.append( "<title>" );
-                            html.append( StringEscapeUtils.escapeXml( value ) );
-                            html.append( "</title>" );
+                            if ( k.equalsIgnoreCase( key ) )
+                            {
+                                found = true;
+                                break;
+                            }
                         }
-                        else
+                        if ( !found )
                         {
-                            html.append( "<meta name=\'" );
-                            html.append( StringEscapeUtils.escapeXml( key ) );
-                            html.append( "\' content=\'" );
-                            html.append( StringEscapeUtils.escapeXml( value ) );
-                            html.append( "\' />" );
+                            break;
                         }
+                        first = false;
+                    }
+                    String value = StringUtils.trimToEmpty( lineMatcher.group( 2 ) );
+                    if ( "title".equalsIgnoreCase( key ) )
+                    {
+                        haveTitle = true;
+                        html.append( "<title>" );
+                        html.append( StringEscapeUtils.escapeXml( value ) );
+                        html.append( "</title>" );
+                    }
+                    else if ( "author".equalsIgnoreCase( key ) )
+                    {
+                        html.append( "<meta name=\'author\' content=\'" );
+                        html.append( StringEscapeUtils.escapeXml( value ) );
+                        html.append( "\' />" );
+                    }
+                    else if ( "date".equalsIgnoreCase( key ) )
+                    {
+                        html.append( "<meta name=\'date\' content=\'" );
+                        html.append( StringEscapeUtils.escapeXml( value ) );
+                        html.append( "\' />" );
+                    }
+                    else
+                    {
+                        html.append( "<meta name=\'" );
+                        html.append( StringEscapeUtils.escapeXml( key ) );
+                        html.append( "\' content=\'" );
+                        html.append( StringEscapeUtils.escapeXml( value ) );
+                        html.append( "\' />" );
                     }
                 }
-                text = text.substring( metadataMatcher.end() );
+                if ( !first )
+                {
+                    text = text.substring( metadataMatcher.end() );
+                }
+            }
+            RootNode rootNode = PEGDOWN_PROCESSOR.parseMarkdown( text.toCharArray() );
+            if ( !haveTitle && rootNode.getChildren().size() > 0 )
+            {
+                // no title in the metadata: use the first node as the title, but only if it is a heading
+                final Node firstNode = rootNode.getChildren().get( 0 );
+                if ( firstNode instanceof HeaderNode )
+                {
+                    html.append( "<title>" );
+                    html.append( StringEscapeUtils.escapeXml( nodeText( firstNode ) ) );
+                    html.append( "</title>" );
+                }
             }
             html.append( "</head>" );
             html.append( "<body>" );
-            RootNode rootNode = PEGDOWN_PROCESSOR.parseMarkdown( text.toCharArray() );
             html.append( new MarkdownToDoxiaHtmlSerializer().toHtml( rootNode ) );
             html.append( "</body>" );
             html.append( "</html>" );
@@ -130,4 +185,29 @@
             throw new ParseException( "Failed reading Markdown source document", e );
         }
     }
+
+    public static String nodeText( Node node )
+    {
+        StringBuilder builder = new StringBuilder();
+        if ( node instanceof TextNode )
+        {
+            builder.append( TextNode.class.cast( node ).getText() );
+        }
+        else
+        {
+            for ( Node n : node.getChildren() )
+            {
+                if ( n instanceof TextNode )
+                {
+                    builder.append( TextNode.class.cast( n ).getText() );
+                }
+                else if ( n instanceof SuperNode )
+                {
+                    builder.append( nodeText( n ) );
+                }
+            }
+        }
+        return builder.toString();
+    }
+
 }

diff --git a/doxia-modules/doxia-module-markdown/src/test/java/org/apache/maven/doxia/module/markdown/MarkdownParserTest.java b/doxia-modules/doxia-module-markdown/src/test/java/org/apache/maven/doxia/module/markdown/MarkdownParserTest.java
index 718274b..cc39888 100644
--- a/doxia-modules/doxia-module-markdown/src/test/java/org/apache/maven/doxia/module/markdown/MarkdownParserTest.java
+++ b/doxia-modules/doxia-module-markdown/src/test/java/org/apache/maven/doxia/module/markdown/MarkdownParserTest.java

@@ -19,18 +19,16 @@
  * under the License.
  */
 
-import java.io.Reader;
-
-import java.util.Iterator;
-
 import org.apache.maven.doxia.parser.AbstractParserTest;
 import org.apache.maven.doxia.parser.ParseException;
 import org.apache.maven.doxia.parser.Parser;
 import org.apache.maven.doxia.sink.SinkEventElement;
 import org.apache.maven.doxia.sink.SinkEventTestingSink;
-
 import org.codehaus.plexus.util.IOUtil;
 
+import java.io.Reader;
+import java.util.Iterator;
+
 /**
  * Tests for {@link MarkdownParser}.
  *
@@ -85,7 +83,7 @@
     {
         Iterator<SinkEventElement> it = parseFileToEventTestingSink( "paragraph" ).getEventList().iterator();
 
-        assertEquals( it, "body", "paragraph", "text", "paragraph_", "body_" );
+        assertEquals( it, "head", "head_", "body", "paragraph", "text", "paragraph_", "body_" );
 
         assertFalse( it.hasNext() );
     }
@@ -100,7 +98,7 @@
     {
         Iterator<SinkEventElement> it = parseFileToEventTestingSink( "bold" ).getEventList().iterator();
 
-        assertEquals( it, "body", "paragraph", "bold", "text", "bold_", "paragraph_", "body_" );
+        assertEquals( it, "head", "head_", "body", "paragraph", "bold", "text", "bold_", "paragraph_", "body_" );
 
         assertFalse( it.hasNext() );
     }
@@ -115,7 +113,7 @@
     {
         Iterator<SinkEventElement> it = parseFileToEventTestingSink( "italic" ).getEventList().iterator();
 
-        assertEquals( it, "body", "paragraph", "italic", "text", "italic_", "paragraph_", "body_" );
+        assertEquals( it, "head", "head_", "body", "paragraph", "italic", "text", "italic_", "paragraph_", "body_" );
 
         assertFalse( it.hasNext() );
     }
@@ -130,7 +128,7 @@
     {
         Iterator<SinkEventElement> it = parseFileToEventTestingSink( "code" ).getEventList().iterator();
 
-        assertEquals( it, "body", "paragraph", "text", "paragraph_", "text", "verbatim", "text", "verbatim_", "body_" );
+        assertEquals( it, "head", "head_", "body", "paragraph", "text", "paragraph_", "text", "verbatim", "text", "verbatim_", "body_" );
 
         assertFalse( it.hasNext() );
     }
@@ -145,7 +143,7 @@
     {
         Iterator<SinkEventElement> it = parseFileToEventTestingSink( "image" ).getEventList().iterator();
 
-        assertEquals( it, "body", "paragraph", "text", "figureGraphics", "text", "paragraph_", "body_" );
+        assertEquals( it, "head", "head_", "body", "paragraph", "text", "figureGraphics", "text", "paragraph_", "body_" );
 
         assertFalse( it.hasNext() );
     }
@@ -160,7 +158,7 @@
     {
         Iterator<SinkEventElement> it = parseFileToEventTestingSink( "link" ).getEventList().iterator();
 
-        assertEquals( it, "body", "paragraph", "text", "link", "text", "link_", "text", "paragraph_", "body_" );
+        assertEquals( it, "head", "head_", "body", "paragraph", "text", "link", "text", "link_", "text", "paragraph_", "body_" );
 
         assertFalse( it.hasNext() );
     }
@@ -175,7 +173,7 @@
     {
         Iterator<SinkEventElement> it = parseFileToEventTestingSink( "list" ).getEventList().iterator();
 
-        assertEquals( it, "body", "list", "text", "listItem", "text", "listItem_", "text", "listItem", "text",
+        assertEquals( it, "head", "head_", "body", "list", "text", "listItem", "text", "listItem_", "text", "listItem", "text",
                       "listItem_", "text", "list_", "body_" );
 
         assertFalse( it.hasNext() );
@@ -191,13 +189,45 @@
     {
         Iterator<SinkEventElement> it = parseFileToEventTestingSink( "numbered-list" ).getEventList().iterator();
 
-        assertEquals( it, "body", "numberedList", "text", "numberedListItem", "text", "numberedListItem_", "text",
+        assertEquals( it, "head", "head_", "body", "numberedList", "text", "numberedListItem", "text", "numberedListItem_", "text",
                       "numberedListItem", "text", "numberedListItem_", "text", "numberedList_", "body_" );
 
         assertFalse( it.hasNext() );
     }
 
     /**
+     * Assert the metadata is passed through when parsing "metadata.md".
+     *
+     * @throws Exception if the event list is not correct when parsing the document.
+     */
+    public void testMetadataSinkEvent()
+        throws Exception
+    {
+        Iterator<SinkEventElement> it = parseFileToEventTestingSink( "metadata" ).getEventList().iterator();
+
+        assertEquals( it, "head", "title", "text", "title_", "author", "text", "author_", "date", "text", "date_",
+                      "head_", "body", "paragraph", "text", "paragraph_", "body_" );
+
+        assertFalse( it.hasNext() );
+    }
+
+    /**
+     * Assert the first header is passed as a title event when parsing "first-heading.md".
+     *
+     * @throws Exception if the event list is not correct when parsing the document.
+     */
+    public void testFirstHeadingSinkEvent()
+        throws Exception
+    {
+        Iterator<SinkEventElement> it = parseFileToEventTestingSink( "first-heading" ).getEventList().iterator();
+
+        assertEquals( it, "head", "title", "text", "title_", "head_", "body", "section1", "sectionTitle1", "text",
+                      "sectionTitle1_", "paragraph", "text", "paragraph_", "section1_", "body_" );
+
+        assertFalse( it.hasNext() );
+    }
+
+    /**
      * Parse the file and return a {@link SinkEventTestingSink}.
      *
      * @param file the file to parse with {@link #parser}.

diff --git a/doxia-modules/doxia-module-markdown/src/test/resources/first-heading.md b/doxia-modules/doxia-module-markdown/src/test/resources/first-heading.md
new file mode 100644
index 0000000..622fffe
--- /dev/null
+++ b/doxia-modules/doxia-module-markdown/src/test/resources/first-heading.md

@@ -0,0 +1,10 @@
+
+
+
+
+
+
+First heading
+-------------
+
+Text
\ No newline at end of file

diff --git a/doxia-modules/doxia-module-markdown/src/test/resources/metadata.md b/doxia-modules/doxia-module-markdown/src/test/resources/metadata.md
new file mode 100644
index 0000000..b89dfaa
--- /dev/null
+++ b/doxia-modules/doxia-module-markdown/src/test/resources/metadata.md

@@ -0,0 +1,7 @@
+title: A title
+author: Somebody
+date: 2013
+
+# The document
+
+Some text
\ No newline at end of file
commit	4c433bdbc75c1af78735bf2d2a804cc732a4abb8	[log] [tgz]
author	Stephen Connolly <stephenc@apache.org>	Wed Nov 13 09:29:47 2013 +0000
committer	Stephen Connolly <stephenc@apache.org>	Wed Nov 13 09:29:47 2013 +0000
tree	f4017967964471a31cf4f80b85e5fbd4e5fa425a
parent	35e482beb9d803066c2b1f288f6d814558514539 [diff]