Make metadata identification less greedy, and use the first node as the document title if it is a header and no title metadata is available
git-svn-id: https://svn.apache.org/repos/asf/maven/doxia/doxia/trunk@1541436 13f79535-47bb-0310-9956-ffa450edef68
diff --git a/doxia-modules/doxia-module-markdown/src/main/java/org/apache/maven/doxia/module/markdown/MarkdownParser.java b/doxia-modules/doxia-module-markdown/src/main/java/org/apache/maven/doxia/module/markdown/MarkdownParser.java
index d46fab6..22f94fe 100644
--- a/doxia-modules/doxia-module-markdown/src/main/java/org/apache/maven/doxia/module/markdown/MarkdownParser.java
+++ b/doxia-modules/doxia-module-markdown/src/main/java/org/apache/maven/doxia/module/markdown/MarkdownParser.java
@@ -29,7 +29,11 @@
import org.codehaus.plexus.util.IOUtil;
import org.pegdown.Extensions;
import org.pegdown.PegDownProcessor;
+import org.pegdown.ast.HeaderNode;
+import org.pegdown.ast.Node;
import org.pegdown.ast.RootNode;
+import org.pegdown.ast.SuperNode;
+import org.pegdown.ast.TextNode;
import java.io.IOException;
import java.io.Reader;
@@ -65,13 +69,22 @@
* Regex that identifies a multimarkdown-style metadata section at the start of the document
*/
private static final String MULTI_MARKDOWN_METADATA_SECTION =
- "^((?:[^\\s:][^:]*):(?:.*(?:\r?\n\\s[^\\s].*)*\r?\n))+(?:\\s*\r?\n)";
+ "^(((?:[^\\s:][^:]*):(?:.*(?:\r?\n\\s[^\\s].*)*\r?\n))+)(?:\\s*\r?\n)";
/**
* Regex that captures the key and value of a multimarkdown-style metadata entry.
*/
private static final String MULTI_MARKDOWN_METADATA_ENTRY = "([^\\s:][^:]*):(.*(?:\r?\n\\s[^\\s].*)*)\r?\n";
+ /**
+ * In order to ensure that we have minimal risk of false positives when slurping metadata sections, the
+ * first key in the metadata section must be one of these standard keys or else the entire metadata section is
+ * ignored.
+ */
+ private static final String[] STANDARD_METADATA_KEYS =
+ { "title", "author", "date", "address", "affiliation", "copyright", "email", "keywords", "language", "phone",
+ "subtitle" };
+
/**
* {@inheritDoc}
@@ -88,38 +101,80 @@
html.append( "<head>" );
Pattern metadataPattern = Pattern.compile( MULTI_MARKDOWN_METADATA_SECTION, Pattern.MULTILINE );
Matcher metadataMatcher = metadataPattern.matcher( text );
+ boolean haveTitle = false;
if ( metadataMatcher.find() )
{
metadataPattern = Pattern.compile( MULTI_MARKDOWN_METADATA_ENTRY, Pattern.MULTILINE );
- for ( int i = 1; i <= metadataMatcher.groupCount(); i++ )
+ Matcher lineMatcher = metadataPattern.matcher( metadataMatcher.group( 1 ) );
+ boolean first = true;
+ while ( lineMatcher.find() )
{
- String line = metadataMatcher.group( i );
- Matcher lineMatcher = metadataPattern.matcher( line );
- if ( lineMatcher.matches() )
+ String key = StringUtils.trimToEmpty( lineMatcher.group( 1 ) );
+ if ( first )
{
- String key = StringUtils.trimToEmpty( lineMatcher.group( 1 ) );
- String value = StringUtils.trimToEmpty( lineMatcher.group( 2 ) );
- if ( "title".equalsIgnoreCase( key ) )
+ boolean found = false;
+ for ( String k : STANDARD_METADATA_KEYS )
{
- html.append( "<title>" );
- html.append( StringEscapeUtils.escapeXml( value ) );
- html.append( "</title>" );
+ if ( k.equalsIgnoreCase( key ) )
+ {
+ found = true;
+ break;
+ }
}
- else
+ if ( !found )
{
- html.append( "<meta name=\'" );
- html.append( StringEscapeUtils.escapeXml( key ) );
- html.append( "\' content=\'" );
- html.append( StringEscapeUtils.escapeXml( value ) );
- html.append( "\' />" );
+ break;
}
+ first = false;
+ }
+ String value = StringUtils.trimToEmpty( lineMatcher.group( 2 ) );
+ if ( "title".equalsIgnoreCase( key ) )
+ {
+ haveTitle = true;
+ html.append( "<title>" );
+ html.append( StringEscapeUtils.escapeXml( value ) );
+ html.append( "</title>" );
+ }
+ else if ( "author".equalsIgnoreCase( key ) )
+ {
+ html.append( "<meta name=\'author\' content=\'" );
+ html.append( StringEscapeUtils.escapeXml( value ) );
+ html.append( "\' />" );
+ }
+ else if ( "date".equalsIgnoreCase( key ) )
+ {
+ html.append( "<meta name=\'date\' content=\'" );
+ html.append( StringEscapeUtils.escapeXml( value ) );
+ html.append( "\' />" );
+ }
+ else
+ {
+ html.append( "<meta name=\'" );
+ html.append( StringEscapeUtils.escapeXml( key ) );
+ html.append( "\' content=\'" );
+ html.append( StringEscapeUtils.escapeXml( value ) );
+ html.append( "\' />" );
}
}
- text = text.substring( metadataMatcher.end() );
+ if ( !first )
+ {
+ text = text.substring( metadataMatcher.end() );
+ }
+ }
+ RootNode rootNode = PEGDOWN_PROCESSOR.parseMarkdown( text.toCharArray() );
+ if ( !haveTitle && rootNode.getChildren().size() > 0 )
+ {
+ // use the first node only if it is a heading
+ final Node firstNode = rootNode.getChildren().get( 0 );
+ if ( firstNode instanceof HeaderNode )
+ {
+ html.append( "<title>" );
+ html.append( StringEscapeUtils.escapeXml( nodeText( firstNode ) ) );
+ html.append( "</title>" );
+ }
}
html.append( "</head>" );
html.append( "<body>" );
- RootNode rootNode = PEGDOWN_PROCESSOR.parseMarkdown( text.toCharArray() );
html.append( new MarkdownToDoxiaHtmlSerializer().toHtml( rootNode ) );
html.append( "</body>" );
html.append( "</html>" );
@@ -130,4 +185,29 @@
throw new ParseException( "Failed reading Markdown source document", e );
}
}
+
+ public static String nodeText( Node node )
+ {
+ StringBuilder builder = new StringBuilder();
+ if ( node instanceof TextNode )
+ {
+ builder.append( TextNode.class.cast( node ).getText() );
+ }
+ else
+ {
+ for ( Node n : node.getChildren() )
+ {
+ if ( n instanceof TextNode )
+ {
+ builder.append( TextNode.class.cast( n ).getText() );
+ }
+ else if ( n instanceof SuperNode )
+ {
+ builder.append( nodeText( n ) );
+ }
+ }
+ }
+ return builder.toString();
+ }
+
}
diff --git a/doxia-modules/doxia-module-markdown/src/test/java/org/apache/maven/doxia/module/markdown/MarkdownParserTest.java b/doxia-modules/doxia-module-markdown/src/test/java/org/apache/maven/doxia/module/markdown/MarkdownParserTest.java
index 718274b..cc39888 100644
--- a/doxia-modules/doxia-module-markdown/src/test/java/org/apache/maven/doxia/module/markdown/MarkdownParserTest.java
+++ b/doxia-modules/doxia-module-markdown/src/test/java/org/apache/maven/doxia/module/markdown/MarkdownParserTest.java
@@ -19,18 +19,16 @@
* under the License.
*/
-import java.io.Reader;
-
-import java.util.Iterator;
-
import org.apache.maven.doxia.parser.AbstractParserTest;
import org.apache.maven.doxia.parser.ParseException;
import org.apache.maven.doxia.parser.Parser;
import org.apache.maven.doxia.sink.SinkEventElement;
import org.apache.maven.doxia.sink.SinkEventTestingSink;
-
import org.codehaus.plexus.util.IOUtil;
+import java.io.Reader;
+import java.util.Iterator;
+
/**
* Tests for {@link MarkdownParser}.
*
@@ -85,7 +83,7 @@
{
Iterator<SinkEventElement> it = parseFileToEventTestingSink( "paragraph" ).getEventList().iterator();
- assertEquals( it, "body", "paragraph", "text", "paragraph_", "body_" );
+ assertEquals( it, "head", "head_", "body", "paragraph", "text", "paragraph_", "body_" );
assertFalse( it.hasNext() );
}
@@ -100,7 +98,7 @@
{
Iterator<SinkEventElement> it = parseFileToEventTestingSink( "bold" ).getEventList().iterator();
- assertEquals( it, "body", "paragraph", "bold", "text", "bold_", "paragraph_", "body_" );
+ assertEquals( it, "head", "head_", "body", "paragraph", "bold", "text", "bold_", "paragraph_", "body_" );
assertFalse( it.hasNext() );
}
@@ -115,7 +113,7 @@
{
Iterator<SinkEventElement> it = parseFileToEventTestingSink( "italic" ).getEventList().iterator();
- assertEquals( it, "body", "paragraph", "italic", "text", "italic_", "paragraph_", "body_" );
+ assertEquals( it, "head", "head_", "body", "paragraph", "italic", "text", "italic_", "paragraph_", "body_" );
assertFalse( it.hasNext() );
}
@@ -130,7 +128,7 @@
{
Iterator<SinkEventElement> it = parseFileToEventTestingSink( "code" ).getEventList().iterator();
- assertEquals( it, "body", "paragraph", "text", "paragraph_", "text", "verbatim", "text", "verbatim_", "body_" );
+ assertEquals( it, "head", "head_", "body", "paragraph", "text", "paragraph_", "text", "verbatim", "text", "verbatim_", "body_" );
assertFalse( it.hasNext() );
}
@@ -145,7 +143,7 @@
{
Iterator<SinkEventElement> it = parseFileToEventTestingSink( "image" ).getEventList().iterator();
- assertEquals( it, "body", "paragraph", "text", "figureGraphics", "text", "paragraph_", "body_" );
+ assertEquals( it, "head", "head_", "body", "paragraph", "text", "figureGraphics", "text", "paragraph_", "body_" );
assertFalse( it.hasNext() );
}
@@ -160,7 +158,7 @@
{
Iterator<SinkEventElement> it = parseFileToEventTestingSink( "link" ).getEventList().iterator();
- assertEquals( it, "body", "paragraph", "text", "link", "text", "link_", "text", "paragraph_", "body_" );
+ assertEquals( it, "head", "head_", "body", "paragraph", "text", "link", "text", "link_", "text", "paragraph_", "body_" );
assertFalse( it.hasNext() );
}
@@ -175,7 +173,7 @@
{
Iterator<SinkEventElement> it = parseFileToEventTestingSink( "list" ).getEventList().iterator();
- assertEquals( it, "body", "list", "text", "listItem", "text", "listItem_", "text", "listItem", "text",
+ assertEquals( it, "head", "head_", "body", "list", "text", "listItem", "text", "listItem_", "text", "listItem", "text",
"listItem_", "text", "list_", "body_" );
assertFalse( it.hasNext() );
@@ -191,13 +189,45 @@
{
Iterator<SinkEventElement> it = parseFileToEventTestingSink( "numbered-list" ).getEventList().iterator();
- assertEquals( it, "body", "numberedList", "text", "numberedListItem", "text", "numberedListItem_", "text",
+ assertEquals( it, "head", "head_", "body", "numberedList", "text", "numberedListItem", "text", "numberedListItem_", "text",
"numberedListItem", "text", "numberedListItem_", "text", "numberedList_", "body_" );
assertFalse( it.hasNext() );
}
/**
+ * Assert the metadata is passed through when parsing "metadata.md".
+ *
+ * @throws Exception if the event list is not correct when parsing the document.
+ */
+ public void testMetadataSinkEvent()
+ throws Exception
+ {
+ Iterator<SinkEventElement> it = parseFileToEventTestingSink( "metadata" ).getEventList().iterator();
+
+ assertEquals( it, "head", "title", "text", "title_", "author", "text", "author_", "date", "text", "date_",
+ "head_", "body", "paragraph", "text", "paragraph_", "body_" );
+
+ assertFalse( it.hasNext() );
+ }
+
+ /**
+ * Assert the first header is passed as title event when parsing "first-heading.md".
+ *
+ * @throws Exception if the event list is not correct when parsing the document.
+ */
+ public void testFirstHeadingSinkEvent()
+ throws Exception
+ {
+ Iterator<SinkEventElement> it = parseFileToEventTestingSink( "first-heading" ).getEventList().iterator();
+
+ assertEquals( it, "head", "title", "text", "title_", "head_", "body", "section1", "sectionTitle1", "text",
+ "sectionTitle1_", "paragraph", "text", "paragraph_", "section1_", "body_" );
+
+ assertFalse( it.hasNext() );
+ }
+
+ /**
* Parse the file and return a {@link SinkEventTestingSink}.
*
* @param file the file to parse with {@link #parser}.
diff --git a/doxia-modules/doxia-module-markdown/src/test/resources/first-heading.md b/doxia-modules/doxia-module-markdown/src/test/resources/first-heading.md
new file mode 100644
index 0000000..622fffe
--- /dev/null
+++ b/doxia-modules/doxia-module-markdown/src/test/resources/first-heading.md
@@ -0,0 +1,10 @@
+
+
+
+
+
+
+First heading
+-------------
+
+Text
\ No newline at end of file
diff --git a/doxia-modules/doxia-module-markdown/src/test/resources/metadata.md b/doxia-modules/doxia-module-markdown/src/test/resources/metadata.md
new file mode 100644
index 0000000..b89dfaa
--- /dev/null
+++ b/doxia-modules/doxia-module-markdown/src/test/resources/metadata.md
@@ -0,0 +1,7 @@
+title: A title
+author: Somebody
+date: 2013
+
+# The document
+
+Some text
\ No newline at end of file