blob: 14b8b67014935551c35577ee302b8f82b186dfa6 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.maven.doxia.module.markdown;
import javax.inject.Inject;
import javax.inject.Named;
import javax.inject.Singleton;
import java.io.IOException;
import java.io.Reader;
import java.util.Arrays;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import com.vladsch.flexmark.ast.Heading;
import com.vladsch.flexmark.ast.HtmlCommentBlock;
import com.vladsch.flexmark.ext.abbreviation.AbbreviationExtension;
import com.vladsch.flexmark.ext.autolink.AutolinkExtension;
import com.vladsch.flexmark.ext.definition.DefinitionExtension;
import com.vladsch.flexmark.ext.escaped.character.EscapedCharacterExtension;
import com.vladsch.flexmark.ext.gfm.strikethrough.StrikethroughExtension;
import com.vladsch.flexmark.ext.tables.TablesExtension;
import com.vladsch.flexmark.ext.typographic.TypographicExtension;
import com.vladsch.flexmark.ext.wikilink.WikiLinkExtension;
import com.vladsch.flexmark.ext.yaml.front.matter.YamlFrontMatterExtension;
import com.vladsch.flexmark.html.HtmlRenderer;
import com.vladsch.flexmark.util.ast.Node;
import com.vladsch.flexmark.util.ast.TextCollectingVisitor;
import com.vladsch.flexmark.util.data.MutableDataSet;
import org.apache.commons.io.IOUtils;
import org.apache.maven.doxia.markup.HtmlMarkup;
import org.apache.maven.doxia.markup.TextMarkup;
import org.apache.maven.doxia.module.xhtml5.Xhtml5Parser;
import org.apache.maven.doxia.parser.AbstractTextParser;
import org.apache.maven.doxia.parser.ParseException;
import org.apache.maven.doxia.sink.Sink;
import org.apache.maven.doxia.util.HtmlTools;
import org.codehaus.plexus.util.xml.pull.XmlPullParser;
/**
* <p>
* Implementation of {@link org.apache.maven.doxia.parser.Parser} for Markdown documents.
* </p>
* <p>
* Defers the actual Markdown parsing to the <a href="https://github.com/vsch/flexmark-java">flexmark-java library</a>,
* which generates HTML content; this HTML is then parsed by a slightly modified Doxia Xhtml5 parser.
* (before 1.8, the <a href="http://pegdown.org">PegDown library</a> was used)
* </p>
*
* @author Vladimir Schneider
* @author Julien Nicoulaud
* @since 1.3
*/
@Singleton
@Named("markdown")
public class MarkdownParser extends AbstractTextParser implements TextMarkup {
/**
* Regex that identifies a multimarkdown-style metadata section at the start of the document.
* <p>
* The match is anchored at the very beginning of the input and extends (lazily) up to the first
* line that is empty or contains only spaces and tabs. The leading key is matched case-insensitively.
* </p>
* <p>
* In order to ensure that we have minimal risk of false positives when slurping metadata sections, the
* first key in the metadata section must be one of the standard keys listed in the pattern, or else the
* entire metadata section is ignored.
* </p>
*
* @see <a href="https://fletcher.github.io/MultiMarkdown-5/metadata.html">Multimarkdown Metadata</a>
*/
private static final Pattern METADATA_SECTION_PATTERN = Pattern.compile(
"\\A^"
+ "(?:title|author|date|address|affiliation|copyright|email|keywords|language|phone|subtitle)"
+ "[ \\t]*:[\\S\\s]+?^[ \\t]*$",
Pattern.MULTILINE | Pattern.CASE_INSENSITIVE);
/**
* Regex that captures the key and value of a single multimarkdown-style metadata entry.
* <p>
* Group 1 captures the key (the text at the start of a line, before the first colon), group 2 captures
* the value. The value runs up to, but does not include, the next line that starts another
* {@code key:} entry or the next line that is empty or contains only spaces and tabs, so a value may
* span several lines. Multiline values need to be normalized by the caller.
* Multivalues are not supported in the syntax!
* </p>
*
* @see <a href="https://fletcher.github.io/MultiMarkdown-5/metadata.html">Multimarkdown Metadata</a>
*/
private static final Pattern METADATA_ENTRY_PATTERN = Pattern.compile(
"^([^:\\r\\n]+?)[ \\t]*:([\\S\\s]+?)(?=(?:^(?:[^:\\r\\n]+?)[ \\t]*:)|^[ \\t]*$)", Pattern.MULTILINE);
/**
* The parser of the HTML produced by Flexmark, which is
* used to convert this HTML to Sink events.
* The instance is provided through dependency injection.
*/
@Inject
private MarkdownHtmlParser parser;
/**
* Flexmark's Markdown parser (one static instance fits all).
* It parses the document body once the metadata section has been removed from the source.
*/
private static final com.vladsch.flexmark.parser.Parser FLEXMARK_PARSER;
/**
* Flexmark's Markdown Metadata parser.
* It is only used on documents starting with {@code ---} in order to read their YAML front matter.
*/
private static final com.vladsch.flexmark.parser.Parser FLEXMARK_METADATA_PARSER;
/**
* Flexmark's HTML renderer (its output will be re-parsed and converted to Sink events)
*/
private static final HtmlRenderer FLEXMARK_HTML_RENDERER;
// One-time setup of the shared Flexmark parsers and of the HTML renderer
static {
    // Options shared by the Markdown parser and the HTML renderer.
    // The extension list is the set that used to be enabled with Pegdown.
    MutableDataSet options = new MutableDataSet();
    options.set(
            com.vladsch.flexmark.parser.Parser.EXTENSIONS,
            Arrays.asList(
                    EscapedCharacterExtension.create(),
                    AbbreviationExtension.create(),
                    AutolinkExtension.create(),
                    DefinitionExtension.create(),
                    TypographicExtension.create(),
                    TablesExtension.create(),
                    WikiLinkExtension.create(),
                    StrikethroughExtension.create()));

    // Disable wrong apostrophe replacement: unmatched single quotes are emitted as &apos;
    options.set(TypographicExtension.SINGLE_QUOTE_UNMATCHED, "&apos;");

    // Additional options on the HTML rendering
    options.set(HtmlRenderer.HTML_BLOCK_OPEN_TAG_EOL, false)
            .set(HtmlRenderer.HTML_BLOCK_CLOSE_TAG_EOL, false)
            .set(HtmlRenderer.MAX_TRAILING_BLANK_LINES, -1);

    // The Markdown parser for the document body
    FLEXMARK_PARSER = com.vladsch.flexmark.parser.Parser.builder(options).build();

    // The metadata parser only needs the YAML front matter extension
    MutableDataSet metadataOptions = new MutableDataSet();
    metadataOptions.set(
            com.vladsch.flexmark.parser.Parser.EXTENSIONS, Arrays.asList(YamlFrontMatterExtension.create()));
    FLEXMARK_METADATA_PARSER =
            com.vladsch.flexmark.parser.Parser.builder(metadataOptions).build();

    // The HTML renderer, with the Doxia-specific link resolver plugged in
    FLEXMARK_HTML_RENDERER = HtmlRenderer.builder(options)
            .linkResolverFactory(new FlexmarkDoxiaLinkResolver.Factory())
            .build();
}
/**
 * {@inheritDoc}
 * <p>
 * The Markdown source is first converted to an intermediate HTML document, which is then handed
 * to the injected HTML parser in order to emit the Sink events.
 * </p>
 */
@Override
public void parse(Reader source, Sink sink, String reference) throws ParseException {
    try {
        // TODO: add locator for the markdown source (not the intermediate HTML format)
        // this requires writing a custom renderer not leveraging the XHTML parser

        // Step 1 (first argument): Markdown to HTML, using the flexmark-java library.
        // Step 2 (the call itself): HTML to Sink API.
        parser.parse(toHtml(source), getWrappedSink(sink), "Intermediate HTML from " + reference);
    } catch (IOException ioFailure) {
        throw new ParseException("Failed reading Markdown source document", ioFailure);
    }
}
/**
 * Extracts the metadata section found at the start of the Markdown source, writes it into the HTML
 * head under construction and removes it from the source.
 * <p>
 * Two flavours of metadata are supported: YAML front matter (for sources starting with {@code ---})
 * and Multimarkdown metadata.
 * </p>
 *
 * @param html the HTML buffer receiving the {@code <title>} and {@code <meta>} elements
 * @param source the Markdown source; the metadata section, if any, is deleted from its beginning
 * @return {@code true} if a title was written to {@code html}
 */
private boolean processMetadataForHtml(StringBuilder html, StringBuilder source) {
    // snapshot of the source as it is before any trimming
    final String markdown = source.toString();
    final Map<String, List<String>> metadata;
    final int metadataEnd; // offset in the source right after the metadata section, 0 if there is none

    if (markdown.startsWith("---")) {
        // 1. YAML front matter (https://github.com/vsch/flexmark-java/wiki/Extensions#yaml-front-matter)
        Node frontMatterRoot = FLEXMARK_METADATA_PARSER.parse(markdown);
        YamlFrontMatterVisitor frontMatter = new YamlFrontMatterVisitor();
        frontMatter.visit(frontMatterRoot);
        metadata = frontMatter.getData();
        metadataEnd = frontMatter.getEndOffset();
    } else {
        // 2. Multimarkdown metadata (https://fletcher.github.io/MultiMarkdown-5/metadata.html), not yet supported
        // by Flexmark (https://github.com/vsch/flexmark-java/issues/550)
        metadata = new LinkedHashMap<>();
        Matcher section = METADATA_SECTION_PATTERN.matcher(markdown);
        if (!section.find()) {
            metadataEnd = 0;
        } else {
            // a line separator is appended to the section text before the entries are extracted from it
            Matcher entries = METADATA_ENTRY_PATTERN.matcher(section.group(0) + EOL);
            while (entries.find()) {
                metadata.put(
                        entries.group(1), Collections.singletonList(normalizeMultilineValue(entries.group(2))));
            }
            metadataEnd = section.end(0);
        }
    }

    // Trim the metadata from the source
    if (metadataEnd > 0) {
        source.delete(0, metadataEnd);
    }
    return writeHtmlMetadata(html, metadata);
}
/**
 * Matches a run of line breaks together with the spaces and tabs directly around it.
 * Compiled once, as the normalization is executed for every metadata entry.
 */
private static final Pattern LINE_BREAK_WITH_PADDING = Pattern.compile("[ \\t]*[\\r\\n]+[ \\t]*");

/**
 * Collapses a (potentially multi-line) metadata value into a single line.
 * <p>
 * Leading and trailing whitespace is removed, then every run of line breaks, including the spaces
 * and tabs directly before and after it, is replaced by a single space.
 * </p>
 *
 * @param value the raw metadata value, must not be {@code null}
 * @return the value on a single line
 */
static String normalizeMultilineValue(String value) {
    return LINE_BREAK_WITH_PADDING.matcher(value.trim()).replaceAll(" ");
}
/**
 * Writes all given metadata entries into the HTML head under construction.
 *
 * @param html the HTML buffer to append to
 * @param data the metadata, as a map from key to values
 * @return {@code true} if at least one of the entries was written as the document title
 */
private boolean writeHtmlMetadata(StringBuilder html, Map<String, List<String>> data) {
    boolean titleWritten = false;
    for (Entry<String, List<String>> metadataEntry : data.entrySet()) {
        // non-short-circuit OR on purpose: every entry is written, even once a title has been seen
        titleWritten |= writeHtmlMetadata(html, metadataEntry.getKey(), metadataEntry.getValue());
    }
    return titleWritten;
}
/**
 * Writes a single metadata entry into the HTML head under construction.
 * <ul>
 * <li>{@code title} (case-insensitive) becomes a {@code <title>} element, multiple values being
 * joined with {@code ", "}</li>
 * <li>{@code author} with more than one value becomes one {@code <meta>} element per value</li>
 * <li>anything else becomes a single {@code <meta>} element, multiple values being joined with
 * {@code ","} for {@code keywords} and with {@code EOL} otherwise</li>
 * </ul>
 *
 * @param html the HTML buffer to append to
 * @param key the metadata key
 * @param values the values of the metadata entry
 * @return {@code true} if the entry was written as the document title, {@code false} otherwise
 */
private boolean writeHtmlMetadata(StringBuilder html, String key, List<String> values) {
    if ("title".equalsIgnoreCase(key)) {
        html.append("<title>")
                .append(HtmlTools.escapeHTML(String.join(", ", values), false))
                .append("</title>");
        return true;
    }

    if (key.equalsIgnoreCase("author") && values.size() > 1) {
        // for multiple authors emit multiple meta tags
        for (String author : values) {
            writeHtmlMetadata(html, key, Collections.singletonList(author));
        }
        return false;
    }

    // every other multi-value should just be concatenated and emitted in a single meta tag
    final String separator = key.equalsIgnoreCase("keywords") ? "," : EOL;
    html.append("<meta name='")
            .append(HtmlTools.escapeHTML(key))
            .append("' content='")
            .append(HtmlTools.escapeHTML(String.join(separator, values)))
            .append("' />");
    return false;
}
/**
 * Converts the Markdown source to a complete HTML document using the flexmark-java library.
 * <p>
 * The metadata section of the source, if any, is written into the HTML head. If the metadata does
 * not provide a title, the first node of the document that is not an HTML comment is used as title,
 * provided that it is a heading.
 * </p>
 *
 * @param source the Markdown source
 * @return HTML content generated by flexmark-java
 * @throws IOException passed through
 */
String toHtml(Reader source) throws IOException {
    // Slurp the whole source, the metadata section gets cut out of this buffer
    StringBuilder markdown = new StringBuilder(IOUtils.toString(source));

    StringBuilder out = new StringBuilder(1000).append("<html>").append("<head>");
    boolean titleFromMetadata = processMetadataForHtml(out, markdown);

    // Parse the Markdown only now: the metadata has been trimmed out,
    // and the parsed tree is needed to look for the first heading
    Node root = FLEXMARK_PARSER.parse(markdown.toString());

    if (!titleFromMetadata) {
        // Fall back on the first heading as the document title, ignoring the leading comment nodes
        // (instanceof is false for null, which covers an empty document and the end of the siblings)
        Node candidate = root.getFirstChild();
        while (candidate instanceof HtmlCommentBlock) {
            candidate = candidate.getNext();
        }
        if (candidate instanceof Heading) {
            String headingText = new TextCollectingVisitor().collectAndGetText(candidate);
            out.append("<title>")
                    .append(HtmlTools.escapeHTML(headingText, false))
                    .append("</title>");
        }
    }

    out.append("</head>").append("<body>");
    // Render the Markdown document as HTML straight into the body
    FLEXMARK_HTML_RENDERER.render(root, out);
    return out.append("</body>").append("</html>").toString();
}
/**
 * Internal parser for the HTML generated by the Markdown library.
 * <p>
 * It extends the regular {@link Xhtml5Parser} with one special treatment: start and end tags of
 * DIV elements which were not handled by the base parser are passed to {@code handleUnknown}, i.e.
 * they are translated as Unknown Sink events.
 * </p>
 * <p>
 * Background on PRE elements: they need to be "source" because the Xhtml5Sink will surround the
 * corresponding verbatim() Sink event with a DIV element with class="source", which is how most
 * Maven Skins (incl. Fluido) recognize a block of code, which needs to be highlighted accordingly.
 * NOTE(review): no PRE-specific code is visible in this class; confirm where this is implemented.
 * </p>
 */
@Named
public static class MarkdownHtmlParser extends Xhtml5Parser {

    /** Creates the parser, nothing specific is configured here. */
    public MarkdownHtmlParser() {
        super();
    }

    /** Delegates to the inherited initialization without adding anything. */
    @Override
    protected void init() {
        super.init();
    }

    /**
     * Handles an end tag: a closing DIV not consumed by the base parser is reported as unknown.
     *
     * @return {@code true} if the tag was handled, either by the base parser or as an unknown DIV
     */
    @Override
    protected boolean baseEndTag(XmlPullParser parser, Sink sink) {
        if (super.baseEndTag(parser, sink)) {
            return true;
        }
        if (!parser.getName().equals(HtmlMarkup.DIV.toString())) {
            return false;
        }
        handleUnknown(parser, sink, TAG_TYPE_END);
        return true;
    }

    /**
     * Handles a start tag: an opening DIV not consumed by the base parser is reported as unknown.
     *
     * @return {@code true} if the tag was handled, either by the base parser or as an unknown DIV
     */
    @Override
    protected boolean baseStartTag(XmlPullParser parser, Sink sink) {
        if (super.baseStartTag(parser, sink)) {
            return true;
        }
        if (!parser.getName().equals(HtmlMarkup.DIV.toString())) {
            return false;
        }
        handleUnknown(parser, sink, TAG_TYPE_START);
        return true;
    }
}
}