generator/src/main/groovy/generator/DocumentationHTMLCleaner.groovy - groovy-website - Git at Google

 /*
  *  Licensed to the Apache Software Foundation (ASF) under one
  *  or more contributor license agreements.  See the NOTICE file
  *  distributed with this work for additional information
  *  regarding copyright ownership.  The ASF licenses this file
  *  to you under the Apache License, Version 2.0 (the
  *  "License"); you may not use this file except in compliance
  *  with the License.  You may obtain a copy of the License at
  *
  *    http://www.apache.org/licenses/LICENSE-2.0
  *
  *  Unless required by applicable law or agreed to in writing,
  *  software distributed under the License is distributed on an
  *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  *  KIND, either express or implied.  See the License for the
  *  specific language governing permissions and limitations
  *  under the License.
  */
 package generator

 import groovy.transform.CompileStatic

 /**
  * Downloads a documentation page as generated through the Asciidoctor task of the Groovy build,
  * then filters its contents in order to return only the body of the documentation, as HTML.
  * <p>
  * The only public entry point is {@link #parsePage(String)}; it returns a {@link DocPage} holding
  * the table of contents fragment and the main content fragment of the page.
  *
  * @author Cédric Champeau
  */
 @CompileStatic
 class DocumentationHTMLCleaner {
     // Markers are opening-tag prefixes: the extraction helpers skip to the next '>' after a start marker.
     private final static String BODY_START = /<body/
     private final static String BODY_END = /<\/body/
     private final static String TOC_START = /<div id="toc"/
     private final static String MAIN_START = /<div id="content"/
     private final static String MAIN_END = /<div id="footer"/

     /**
      * A list of links which are badly generated, but we know how to fix them.
      * Keys are regular expressions, values are the replacement text.
      */
     private final static Map<String,String> KNOWN_REPLACEMENTS = [
         /docs\.groovy-lang\.org\/(latest|next)\/html\/documentation\/gdk\.html/: 'groovy-lang.org/gdk.html',
         /\/maven\/groovy-/: '/maven/apache-groovy-'
     ]

     /**
      * Fetches the page at the given location and keeps only what is inside its body element.
      *
      * @param location the URL of the page, as a string
      * @return the HTML found between the body tags, or {@code null} if the page does not exist
      *         or no body could be extracted
      */
     private static String cleanupPage(String location) {
         def url = location.toURL()
         try {
             def fullHTML = url.getText('utf-8')
             return extractBetween(fullHTML, BODY_START, BODY_END)
         } catch (FileNotFoundException ignored) {
             // 404 not found: deliberately swallowed, the caller reports the missing page
         }

         null
     }

     /**
      * Returns the text located after the tag opened by {@code startString} and before the next
      * occurrence of {@code endString}.
      *
      * @param html the text to search
      * @param startString the beginning of the opening tag; extraction starts after the first
      *        {@code '>'} that follows it
      * @param endString the marker ending the extracted region (not included in the result)
      * @return the extracted text, or {@code null} if a marker is missing or the region is empty
      */
     private static String extractBetween(String html, String startString, String endString) {
         int markerIndex = html.indexOf(startString)
         if (markerIndex < 0) {
             // index 0 is a valid match, only a negative index means "not found"
             return null
         }
         int tagClose = html.indexOf('>', markerIndex)
         if (tagClose < 0) {
             return null
         }
         int start = tagClose + 1
         int end = html.indexOf(endString, start)
         end > start ? html.substring(start, end) : null
     }

     /**
      * Downloads the page at the given location and splits it into a table of contents and a main body.
      *
      * @param location the URL of the documentation page, as a string
      * @return a page whose content is either the cleaned-up main body or an HTML message explaining
      *         what could not be found; the table of contents is empty when none was found
      */
     public static DocPage parsePage(String location) {
         String contents = cleanupPage(location)
         if (contents==null) {
             return new DocPage(content: "Contents not found for <a href='$location'>$location</a>, most likely because this section has not yet been written.")
         }
         String toc = extractTOC(contents)?:''
         String main = extractBetween(contents, MAIN_START, MAIN_END)?:"Main body not found for <a href='$location'>$location</a>"
         main = replaceInternalLinks(main)
         new DocPage(toc: toc, content: main)
     }

     /**
      * Rewrites relative {@code a href} and {@code img src} targets so that they point below
      * {@code DocUtils.DOCS_BASEURL}, then applies {@link #KNOWN_REPLACEMENTS}.
      *
      * @param html the HTML fragment to process
      * @return the fragment with rewritten links
      */
     private static String replaceInternalLinks(String html) {
         def replacer = { List<String> groups ->
             String tag = groups[1]
             String attr = groups[2]
             String url = groups[3]
             // NOTE(review): looks like this undoes resource names rewritten by a pagespeed filter - confirm
             url = url.replaceAll(/x(.+)\.(?:pagespeed.+)/, '$1')
             if (!url.startsWith('http') && !url.startsWith('#') && 'target.html'!=url) {
                 // the rewritten attribute is always emitted with single quotes
                 "$tag $attr'${DocUtils.DOCS_BASEURL}/html/documentation/$url'"
             } else {
                 // absolute links, anchors and 'target.html' are kept exactly as matched
                 groups[0]
             }
         }
         String result = html.replaceAll(/(a)\s+(href=)["'](.+?)["']/,replacer)
         result = result.replaceAll(/(img)\s+(src=)["'](.+?)["']/,replacer)
         for (Map.Entry<String, String> fix : KNOWN_REPLACEMENTS.entrySet()) {
             result = result.replaceAll(fix.key, fix.value)
         }
         result
     }

     /**
      * Extracts the table of contents, i.e. the text from the TOC marker up to the main content marker,
      * without the "Table of Contents" title and without the last closing {@code div} of that region.
      *
      * @param html the body of the page
      * @return the table of contents fragment, or {@code null} if either marker is missing
      */
     private static String extractTOC(final String html) {
         int start = html.indexOf(TOC_START)
         if (start < 0) {
             return null
         }
         // search from the TOC position so that the end can never precede the start
         int end = html.indexOf(MAIN_START, start)
         if (end < 0) {
             return null
         }
         String toc = html.substring(start, end).replace("<div id=\"toctitle\">Table of Contents</div>", "")
         // drop the last closing div and whatever follows it; if there is none, nothing is trimmed
         int lastClose = toc.lastIndexOf('</div>')
         lastClose >= 0 ? toc.substring(0, lastClose) : toc
     }

     /**
      * Result of parsing a documentation page: HTML fragments for the table of contents and the content.
      */
     static class DocPage {
         String toc = ''
         String content
     }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/
	package generator

	import groovy.transform.CompileStatic

	/**
	 * Downloads a documentation page as generated through the Asciidoctor task of the Groovy build,
	 * then filters its contents in order to return only the body of the documentation, as HTML.
	 * <p>
	 * The only public entry point is {@link #parsePage(String)}; it returns a {@link DocPage} holding
	 * the table of contents fragment and the main content fragment of the page.
	 *
	 * @author Cédric Champeau
	 */
	@CompileStatic
	class DocumentationHTMLCleaner {
	    // Markers are opening-tag prefixes: the extraction helpers skip to the next '>' after a start marker.
	    private final static String BODY_START = /<body/
	    private final static String BODY_END = /<\/body/
	    private final static String TOC_START = /<div id="toc"/
	    private final static String MAIN_START = /<div id="content"/
	    private final static String MAIN_END = /<div id="footer"/

	    /**
	     * A list of links which are badly generated, but we know how to fix them.
	     * Keys are regular expressions, values are the replacement text.
	     */
	    private final static Map<String,String> KNOWN_REPLACEMENTS = [
	        // unescaped '|' so that the group is an alternation between "latest" and "next"
	        /docs\.groovy-lang\.org\/(latest|next)\/html\/documentation\/gdk\.html/: 'groovy-lang.org/gdk.html',
	        /\/maven\/groovy-/: '/maven/apache-groovy-'
	    ]

	    /**
	     * Fetches the page at the given location and keeps only what is inside its body element.
	     *
	     * @param location the URL of the page, as a string
	     * @return the HTML found between the body tags, or {@code null} if the page does not exist
	     *         or no body could be extracted
	     */
	    private static String cleanupPage(String location) {
	        def url = location.toURL()
	        try {
	            def fullHTML = url.getText('utf-8')
	            return extractBetween(fullHTML, BODY_START, BODY_END)
	        } catch (FileNotFoundException ignored) {
	            // 404 not found: deliberately swallowed, the caller reports the missing page
	        }

	        null
	    }

	    /**
	     * Returns the text located after the tag opened by {@code startString} and before the next
	     * occurrence of {@code endString}.
	     *
	     * @param html the text to search
	     * @param startString the beginning of the opening tag; extraction starts after the first
	     *        {@code '>'} that follows it
	     * @param endString the marker ending the extracted region (not included in the result)
	     * @return the extracted text, or {@code null} if a marker is missing or the region is empty
	     */
	    private static String extractBetween(String html, String startString, String endString) {
	        int markerIndex = html.indexOf(startString)
	        if (markerIndex < 0) {
	            // index 0 is a valid match, only a negative index means "not found"
	            return null
	        }
	        int tagClose = html.indexOf('>', markerIndex)
	        if (tagClose < 0) {
	            return null
	        }
	        int start = tagClose + 1
	        int end = html.indexOf(endString, start)
	        end > start ? html.substring(start, end) : null
	    }

	    /**
	     * Downloads the page at the given location and splits it into a table of contents and a main body.
	     *
	     * @param location the URL of the documentation page, as a string
	     * @return a page whose content is either the cleaned-up main body or an HTML message explaining
	     *         what could not be found; the table of contents is empty when none was found
	     */
	    public static DocPage parsePage(String location) {
	        String contents = cleanupPage(location)
	        if (contents==null) {
	            return new DocPage(content: "Contents not found for <a href='$location'>$location</a>, most likely because this section has not yet been written.")
	        }
	        String toc = extractTOC(contents)?:''
	        String main = extractBetween(contents, MAIN_START, MAIN_END)?:"Main body not found for <a href='$location'>$location</a>"
	        main = replaceInternalLinks(main)
	        new DocPage(toc: toc, content: main)
	    }

	    /**
	     * Rewrites relative {@code a href} and {@code img src} targets so that they point below
	     * {@code DocUtils.DOCS_BASEURL}, then applies {@link #KNOWN_REPLACEMENTS}.
	     *
	     * @param html the HTML fragment to process
	     * @return the fragment with rewritten links
	     */
	    private static String replaceInternalLinks(String html) {
	        def replacer = { List<String> groups ->
	            String tag = groups[1]
	            String attr = groups[2]
	            String url = groups[3]
	            // NOTE(review): looks like this undoes resource names rewritten by a pagespeed filter - confirm
	            url = url.replaceAll(/x(.+)\.(?:pagespeed.+)/, '$1')
	            if (!url.startsWith('http') && !url.startsWith('#') && 'target.html'!=url) {
	                // the rewritten attribute is always emitted with single quotes
	                "$tag $attr'${DocUtils.DOCS_BASEURL}/html/documentation/$url'"
	            } else {
	                // absolute links, anchors and 'target.html' are kept exactly as matched
	                groups[0]
	            }
	        }
	        String result = html.replaceAll(/(a)\s+(href=)["'](.+?)["']/,replacer)
	        result = result.replaceAll(/(img)\s+(src=)["'](.+?)["']/,replacer)
	        for (Map.Entry<String, String> fix : KNOWN_REPLACEMENTS.entrySet()) {
	            result = result.replaceAll(fix.key, fix.value)
	        }
	        result
	    }

	    /**
	     * Extracts the table of contents, i.e. the text from the TOC marker up to the main content marker,
	     * without the "Table of Contents" title and without the last closing {@code div} of that region.
	     *
	     * @param html the body of the page
	     * @return the table of contents fragment, or {@code null} if either marker is missing
	     */
	    private static String extractTOC(final String html) {
	        int start = html.indexOf(TOC_START)
	        if (start < 0) {
	            return null
	        }
	        // search from the TOC position so that the end can never precede the start
	        int end = html.indexOf(MAIN_START, start)
	        if (end < 0) {
	            return null
	        }
	        String toc = html.substring(start, end).replace("<div id=\"toctitle\">Table of Contents</div>", "")
	        // drop the last closing div and whatever follows it; if there is none, nothing is trimmed
	        int lastClose = toc.lastIndexOf('</div>')
	        lastClose >= 0 ? toc.substring(0, lastClose) : toc
	    }

	    /**
	     * Result of parsing a documentation page: HTML fragments for the table of contents and the content.
	     */
	    static class DocPage {
	        String toc = ''
	        String content
	    }

	}