blob: 000c86cc78d00355cf063637cfb73d1350cc81ee [file] [log] [blame]
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package generator
import groovy.transform.CompileStatic
/**
 * This class is responsible for downloading a documentation page as generated by the Asciidoctor task
 * of the Groovy build, then filtering its contents in order to return only the body of the documentation, as HTML.
 */
class DocumentationHTMLCleaner {
    // Markers locating the regions of interest in the downloaded page. Each one is the
    // beginning of a tag without its closing '>', so extractBetween() skips forward to
    // the next '>' after a start marker before it begins copying.
    private final static String BODY_START = /<body/
    private final static String BODY_END = /<\/body/
    private final static String TOC_START = /<div id="toc"/
    private final static String MAIN_START = /<div id="content"/
    private final static String MAIN_END = /<div id="footer"/

    /**
     * A list of links which are badly generated, but we know how to fix them.
     * Keys are regular expressions, values are the replacement text passed to
     * {@code String.replaceAll}.
     */
    private final static Map<String,String> KNOWN_REPLACEMENTS = [
        /docs\.groovy-lang\.org\/(latest|next)\/html\/documentation\/gdk\.html/: '',
        /\/maven\/groovy-/: '/maven/apache-groovy-'
    ]

    /**
     * Downloads the page at the given location as UTF-8 text and keeps only what lies
     * between the opening {@code <body ...>} tag and the closing {@code </body} marker.
     *
     * @param location the URL of the page, as a string
     * @return the body markup, or {@code null} if the page does not exist or the body
     *         markers cannot be found
     */
    private static String cleanupPage(String location) {
        def url = location.toURL()
        try {
            def fullHTML = url.getText('utf-8')
            return extractBetween(fullHTML, BODY_START, BODY_END)
        } catch (FileNotFoundException ignored) {
            // 404 not found: reported to the caller as "no contents"
            return null
        }
    }

    /**
     * Returns the text located after the tag opened by {@code startString} and before the
     * first following occurrence of {@code endString}.
     *
     * @param html the text to search
     * @param startString the beginning of the opening tag; copying starts after the next '>'
     * @param endString the marker at which copying stops (excluded from the result)
     * @return the extracted text, or {@code null} if a marker is missing or nothing lies between them
     */
    private static String extractBetween(String html, String startString, String endString) {
        int markerIndex = html.indexOf(startString)
        // indexOf signals "absent" with -1; index 0 is a legitimate match position
        if (markerIndex < 0) {
            return null
        }
        int tagClose = html.indexOf('>', markerIndex)
        if (tagClose < 0) {
            return null
        }
        int start = tagClose + 1
        int end = html.indexOf(endString, start)
        if (end > start) {
            return html.substring(start, end)
        }
        return null
    }

    /**
     * Downloads a documentation page and splits it into its table of contents and its main content.
     *
     * @param location the URL of the page, as a string
     * @return a {@link DocPage}; when the page or its main section cannot be found, the
     *         {@code content} holds an explanatory message instead
     */
    public static DocPage parsePage(String location) {
        String contents = cleanupPage(location)
        if (contents == null) {
            return new DocPage(content: "Contents not found for <a href='$location'>$location</a>, most likely because this section has not yet been written.")
        }
        String toc = extractTOC(contents) ?: ''
        String main = extractBetween(contents, MAIN_START, MAIN_END) ?: "Main body not found for <a href='$location'>$location</a>"
        main = replaceInternalLinks(main)
        new DocPage(toc: toc, content: main)
    }

    /**
     * Rewrites {@code a href} and {@code img src} attributes so that relative targets point
     * below {@code DocUtils.DOCS_BASEURL}, then applies {@link #KNOWN_REPLACEMENTS}.
     *
     * @param html the markup to process
     * @return the markup with its links rewritten
     */
    private static String replaceInternalLinks(String html) {
        // Receives the match groups: [0] whole match, [1] tag name, [2] attribute name plus '=', [3] URL
        def replacer = { List<String> groups ->
            String tag = groups[1]
            String attr = groups[2]
            // strip the "x" prefix and ".pagespeed..." suffix from the URL
            String url = groups[3].replaceAll(/x(.+)\.(?:pagespeed.+)/, '$1')
            if (!url.startsWith('http') && !url.startsWith('#') && 'target.html' != url) {
                "$tag $attr'${DocUtils.DOCS_BASEURL}/html/documentation/$url'"
            } else {
                // URLs starting with 'http' or '#', and 'target.html', are left exactly as they were
                groups[0]
            }
        }
        String result = html.replaceAll(/(a)\s+(href=)["'](.+?)["']/, replacer)
        result = result.replaceAll(/(img)\s+(src=)["'](.+?)["']/, replacer)
        for (Map.Entry<String, String> fix : KNOWN_REPLACEMENTS.entrySet()) {
            result = result.replaceAll(fix.key, fix.value)
        }
        return result
    }

    /**
     * Extracts the table of contents: everything from the TOC marker up to the main content
     * marker, without the "Table of Contents" title element and cut just before the last
     * {@code </div>} of that range.
     *
     * @param html the markup to search
     * @return the table of contents markup, or {@code null} if either marker is missing
     */
    private static String extractTOC(final String html) {
        int start = html.indexOf(TOC_START)
        if (start < 0) {
            return null
        }
        // look for the content marker after the TOC so the range can never be inverted
        int end = html.indexOf(MAIN_START, start)
        if (end < 0) {
            return null
        }
        String fragment = html.substring(start, end).replace("<div id=\"toctitle\">Table of Contents</div>", "")
        int lastClose = fragment.lastIndexOf('</div>')
        // without any closing div there is nothing to trim
        return lastClose >= 0 ? fragment.substring(0, lastClose) : fragment
    }

    /**
     * Result of {@link #parsePage}: the table of contents and the main content of a page, as HTML.
     */
    static class DocPage {
        String toc = ''
        String content
    }
}