| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| |
| from __future__ import print_function |
| |
| import codecs as _codecs |
| import fnmatch as _fnmatch |
| import markdown2 as _markdown2 |
| import os as _os |
| import re as _re |
| import runpy as _runpy |
| import sys as _sys |
| import tempfile as _tempfile |
| |
| from collections import defaultdict as _defaultdict |
| from xml.etree.ElementTree import XML as _XML |
| |
| try: |
| from urllib.request import urlopen as _urlopen |
| except: |
| from urllib2 import urlopen as _urlopen |
| |
| try: |
| from urllib.parse import urlsplit as _urlsplit |
| except: |
| from urlparse import urlsplit as _urlsplit |
| |
| try: |
| from urllib.parse import urljoin as _urljoin |
| except: |
| from urlparse import urljoin as _urljoin |
| |
| _title_regex = _re.compile(r"<([hH][12]).*?>(.*?)</\1>") |
| _tag_regex = _re.compile(r"<.+?>") |
| _page_extensions = ".md", ".html.in", ".html", ".css", ".js" |
| _buffer_size = 128 * 1024 |
| |
| class Transom: |
| def __init__(self, site_url, input_dir, output_dir, home_dir=None): |
| self.site_url = site_url |
| self.input_dir = input_dir |
| self.output_dir = output_dir |
| self.home_dir = home_dir |
| |
| self.verbose = False |
| |
| self.template_path = _join(self.input_dir, "_transom_template.html") |
| self.config_path = _join(self.input_dir, "_transom_config.py") |
| |
| self.template_content = None |
| self.config_env = None |
| |
| extras = { |
| "code-friendly": True, |
| "footnotes": True, |
| "header-ids": True, |
| "markdown-in-html": True, |
| "metadata": True, |
| "tables": True, |
| } |
| |
| self.markdown = _markdown2.Markdown(extras=extras) |
| |
| self.files = list() |
| self.files_by_path = dict() |
| |
| self.resources = list() |
| self.pages = list() |
| |
| self.links = _defaultdict(set) |
| self.link_targets = set() |
| |
| def init(self): |
| if not _is_file(self.template_path): |
| if self.home_dir is not None: |
| path = _join(self.home_dir, "resources", "template.html") |
| self.template_path = path |
| |
| if not _is_file(self.template_path): |
| raise Exception("No template found") |
| |
| self.template_content = _read_file(self.template_path) |
| |
| init_globals = {"site_url": self.site_url} |
| |
| if _is_file(self.config_path): |
| self.config_env = _runpy.run_path(self.config_path, init_globals) |
| else: |
| self.config_env = init_globals |
| |
| self.traverse_input_pages("", None) |
| self.traverse_input_resources("") |
| |
| for file in self.files: |
| file.init() |
| |
| def render(self): |
| for page in self.pages: |
| page.load_input() |
| |
| for page in self.pages: |
| page.convert() |
| |
| for page in self.pages: |
| page.process() |
| |
| for page in self.pages: |
| page.render() |
| |
| for page in self.pages: |
| page.save_output() |
| |
| for resource in self.resources: |
| resource.save_output() |
| |
| if self.home_dir is not None: |
| self.copy_default_resources() |
| |
| def copy_default_resources(self): |
| from_dir = _join(self.home_dir, "resources") |
| to_dir = _join(self.output_dir, "transom") |
| subpaths = list() |
| |
| for root, dirs, files in _os.walk(from_dir): |
| dir = root[len(from_dir) + 1:] |
| |
| for file in files: |
| subpaths.append(_join(dir, file)) |
| |
| for subpath in subpaths: |
| from_file = _join(from_dir, subpath) |
| to_file = _join(to_dir, subpath) |
| |
| _copy_file(from_file, to_file) |
| |
| def check_output_files(self): |
| expected_files = set() |
| found_files = set() |
| |
| for file in self.files: |
| expected_files.add(file.output_path) |
| |
| self.traverse_output_files("", found_files) |
| |
| missing_files = expected_files.difference(found_files) |
| extra_files = found_files.difference(expected_files) |
| |
| if missing_files: |
| print("Missing files:") |
| |
| for path in sorted(missing_files): |
| print(" {}".format(path)) |
| |
| if extra_files: |
| print("Extra files:") |
| |
| for path in sorted(extra_files): |
| print(" {}".format(path)) |
| |
| return len(missing_files), len(extra_files) |
| |
| def traverse_output_files(self, subdir, files): |
| output_dir = _join(self.output_dir, subdir) |
| names = set(_os.listdir(output_dir)) |
| |
| for name in names: |
| path = _join(subdir, name) |
| output_path = _join(self.output_dir, path) |
| |
| if _is_file(output_path): |
| files.add(output_path) |
| elif _is_dir(output_path): |
| if name == ".svn": |
| continue |
| |
| if name == "transom": |
| continue |
| |
| self.traverse_output_files(path, files) |
| |
| def check_links(self, internal=True, external=False): |
| for page in self.pages: |
| page.load_output() |
| |
| for page in self.pages: |
| page.find_links() |
| |
| errors_by_link = _defaultdict(list) |
| links = self.filter_links(self.links) |
| |
| for i, link in enumerate(links): |
| if internal and link.startswith(self.site_url): |
| if link[len(self.site_url):].startswith("/transom"): |
| continue |
| |
| if link not in self.link_targets: |
| errors_by_link[link].append("Link has no target") |
| |
| if external and not link.startswith(self.site_url): |
| code, error = self.check_external_link(link) |
| |
| if code >= 400: |
| msg = "HTTP error code {}".format(code) |
| errors_by_link[link].append(msg) |
| |
| if error: |
| errors_by_link[link].append(error.message) |
| |
| _sys.stdout.write(".") |
| |
| if (i + 1) % 100 == 0: |
| _sys.stdout.write("\n") |
| |
| _sys.stdout.flush() |
| |
| print() |
| |
| for link in errors_by_link: |
| print("Link: {}".format(link)) |
| |
| for error in errors_by_link[link]: |
| print(" Error: {}".format(error)) |
| |
| for source in self.links[link]: |
| print(" Source: {}".format(source)) |
| |
| return len(errors_by_link) |
| |
| def filter_links(self, links): |
| config_path = _join(self.input_dir, "_transom_ignore_links") |
| |
| if _is_file(config_path): |
| ignore_patterns = _read_file(config_path).splitlines() |
| |
| def retain(link): |
| for pattern in ignore_patterns: |
| pattern = pattern.strip() |
| path = link[len(self.site_url) + 1:] |
| |
| if _fnmatch.fnmatch(path, pattern): |
| return False |
| |
| return True |
| |
| return filter(retain, links) |
| |
| return links |
| |
| def check_external_link(self, link): |
| sock, code, error = None, None, None |
| |
| try: |
| sock = _urlopen(link, timeout=5) |
| code = sock.getcode() |
| except IOError as e: |
| error = e |
| finally: |
| if sock: |
| sock.close() |
| |
| return code, error |
| |
| def traverse_input_pages(self, subdir, parent_page): |
| input_dir = _join(self.input_dir, subdir) |
| names = set(_os.listdir(input_dir)) |
| |
| if "_transom_ignore_pages" in names: |
| return |
| |
| for name in ("index.md", "index.html", "index.html.in"): |
| if name in names: |
| names.remove(name) |
| parent_page = _Page(self, _join(subdir, name), parent_page) |
| break |
| |
| for name in sorted(names): |
| if name.startswith("_transom_"): |
| continue |
| |
| if name == ".svn": |
| continue |
| |
| path = _join(subdir, name) |
| input_path = _join(self.input_dir, path) |
| |
| if _is_file(input_path): |
| if input_path.endswith(".html.in"): |
| ext = ".html.in" |
| else: |
| stem, ext = _os.path.splitext(name) |
| |
| if ext in _page_extensions: |
| _Page(self, path, parent_page) |
| elif _is_dir(input_path): |
| self.traverse_input_pages(path, parent_page) |
| |
| def traverse_input_resources(self, subdir): |
| input_dir = _join(self.input_dir, subdir) |
| names = set(_os.listdir(input_dir)) |
| |
| if "_transom_ignore_resources" in names: |
| return |
| |
| for name in sorted(names): |
| if name.startswith("_transom_"): |
| continue |
| |
| if name == ".svn": |
| continue |
| |
| path = _join(subdir, name) |
| input_path = _join(self.input_dir, path) |
| |
| if _is_file(input_path): |
| if path not in self.files_by_path: |
| _Resource(self, path) |
| elif _is_dir(input_path): |
| self.traverse_input_resources(path) |
| |
| def get_url(self, output_path): |
| path = output_path[len(self.output_dir) + 1:] |
| path = path.replace(_os.path.sep, "/") |
| |
| return "{}/{}".format(self.site_url, path) |
| |
| def info(self, message, *args): |
| if self.verbose: |
| print(message.format(*args)) |
| |
| def warn(self, message, *args): |
| message = message.format(*args) |
| print("Warning! {}".format(message)) |
| |
| class _File(object): |
| def __init__(self, site, path): |
| self.site = site |
| self.path = path |
| |
| self.input_path = _join(self.site.input_dir, self.path) |
| self.output_path = _join(self.site.output_dir, self.path) |
| self.url = self.site.get_url(self.output_path) |
| |
| self.site.files.append(self) |
| self.site.files_by_path[self.path] = self |
| |
| def init(self): |
| self.site.link_targets.add(self.url) |
| |
| if self.url.endswith("/index.html"): |
| self.site.link_targets.add(self.url[:-10]) |
| self.site.link_targets.add(self.url[:-11]) |
| |
| def replace_placeholders(self, content, page_vars): |
| out = list() |
| tokens = _re.split("({{.+?}})", content) |
| |
| for token in tokens: |
| if token[:2] != "{{" or token[-2:] != "}}": |
| out.append(token) |
| continue |
| |
| token_content = token[2:-2] |
| |
| if page_vars and token_content in page_vars: |
| out.append(page_vars[token_content]) |
| continue |
| |
| expr = token_content |
| env = self.site.config_env |
| |
| try: |
| result = eval(expr, env) |
| except Exception as e: |
| msg = "Expression '{}'; file '{}'; {}" |
| args = expr, self.input_path, e |
| |
| print(msg.format(*args)) |
| |
| out.append(token) |
| continue |
| |
| if result is not None: |
| out.append(str(result)) |
| |
| return "".join(out) |
| |
| def __repr__(self): |
| return _format_repr(self, self.path) |
| |
| class _Resource(_File): |
| def __init__(self, site, path): |
| super(_Resource, self).__init__(site, path) |
| |
| self.site.resources.append(self) |
| |
| def save_output(self): |
| _copy_file(self.input_path, self.output_path) |
| |
| class _Page(_File): |
| def __init__(self, site, path, parent): |
| super(_Page, self).__init__(site, path) |
| |
| self.parent = parent |
| |
| self.content = None |
| self.template_content = None |
| |
| self.title = None |
| self.attributes = dict() |
| |
| self.site.pages.append(self) |
| |
| def init(self): |
| if self.output_path.endswith(".md"): |
| self.output_path = "{}.html".format(self.output_path[:-3]) |
| elif self.output_path.endswith(".html.in"): |
| self.output_path = self.output_path[:-3] |
| |
| self.url = self.site.get_url(self.output_path) |
| |
| super(_Page, self).init() |
| |
| self.template_content = self.site.template_content |
| |
| input_dir, name = _split(self.input_path) |
| template_path = _join(input_dir, "_transom_template.html") |
| |
| if _is_file(template_path): |
| self.template_content = _read_file(template_path) |
| |
| def load_input(self): |
| self.site.info("Loading {}", self) |
| self.content = _read_file(self.input_path) |
| |
| def save_output(self, path=None): |
| self.site.info("Saving {} to {}", self, self.output_path) |
| |
| if path is None: |
| path = self.output_path |
| |
| _write_file(self.output_path, self.content) |
| |
| def load_output(self): |
| self.content = _read_file(self.output_path) |
| |
| def convert(self): |
| if self.path.endswith(".md"): |
| self.convert_from_markdown() |
| elif self.path.endswith(".html.in"): |
| self.convert_from_html_in() |
| |
| def convert_from_markdown(self): |
| self.site.info("Converting {} from markdown", self) |
| |
| # Strip out comments |
| content_lines = self.content.splitlines() |
| content_lines = [x for x in content_lines if not x.startswith(";;")] |
| |
| content = _os.linesep.join(content_lines) |
| content = self.site.markdown.convert(content) |
| |
| self.content = self.apply_template(content) |
| self.attributes.update(content.metadata) |
| |
| def convert_from_html_in(self): |
| self.site.info("Converting {} from html.in", self) |
| self.content = self.apply_template(self.content) |
| |
| def apply_template(self, content): |
| return self.template_content.replace("{{content}}", content) |
| |
| def process(self): |
| self.site.info("Processing {}", self) |
| |
| # Restore previous behavior |
| if self.parent is None: |
| self.title = "Home" |
| return |
| |
| dir, name = _split(self.output_path) |
| self.title = name |
| |
| if isinstance(self.title, bytes): |
| self.title = self.title.decode("utf8") |
| |
| match = _title_regex.search(self.content) |
| |
| if match: |
| self.title = match.group(2) |
| |
| self.title = _tag_regex.sub("", self.title) |
| self.title = self.title.strip() |
| |
| def render(self): |
| self.site.info("Rendering {}", self) |
| |
| page_vars = { |
| "title": self.title, |
| "path_navigation": self.render_path_navigation(), |
| "extra_headers" : self.attributes.get("extra_headers", ""), |
| } |
| |
| self.content = self.replace_placeholders(self.content, page_vars) |
| |
| def render_link(self): |
| return u"<a href=\"{}\">{}</a>".format(self.url, self.title) |
| |
| def render_path_navigation(self): |
| links = list() |
| page = self.parent |
| |
| links.append(self.title) |
| |
| while page: |
| links.append(page.render_link()) |
| page = page.parent |
| |
| links = u"".join((u"<li>{}</li>".format(x) for x in reversed(links))) |
| |
| return u"<ul id=\"-path-navigation\">{}</ul>".format(links) |
| |
| def find_links(self): |
| if not self.output_path.endswith(".html"): |
| return |
| |
| self.site.info("Finding links in {}", self) |
| |
| try: |
| root = self.parse_xml(self.content) |
| except Exception as e: |
| self.site.warn(str(e)) |
| return |
| |
| links = self.gather_links(root) |
| link_targets = self.gather_link_targets(root) |
| |
| for link in links: |
| if link == "?": |
| continue |
| |
| scheme, netloc, path, query, fragment = _urlsplit(link) |
| |
| if scheme and scheme not in ("file", "http", "https", "ftp"): |
| continue |
| |
| if netloc in ("issues.apache.org", "bugzilla.redhat.com"): |
| continue |
| |
| if (fragment and not path) or not path.startswith("/"): |
| link = _urljoin(self.url, link) |
| |
| self.site.links[link].add(self.url) |
| |
| self.site.link_targets.update(link_targets) |
| |
| def parse_xml(self, xml): |
| try: |
| return _XML(xml) |
| except Exception as e: |
| path = _tempfile.mkstemp(".xml")[1] |
| msg = "{} fails to parse; {}; see {}".format(self, str(e), path) |
| |
| with _open_file(path, "w") as file: |
| file.write(xml) |
| |
| raise Exception(msg) |
| |
| def gather_links(self, root_elem): |
| links = set() |
| |
| for elem in root_elem.iter("*"): |
| for name in ("href", "src", "action"): |
| try: |
| link = elem.attrib[name] |
| except KeyError: |
| continue |
| |
| links.add(link) |
| |
| return links |
| |
| def gather_link_targets(self, root_elem): |
| link_targets = set() |
| |
| for elem in root_elem.iter("*"): |
| try: |
| id = elem.attrib["id"] |
| except KeyError: |
| continue |
| |
| target = "{}#{}".format(self.url, id) |
| |
| if target in link_targets: |
| self.site.warn("Duplicate link target in '{}'", target) |
| |
| link_targets.add(target) |
| |
| return link_targets |
| |
| _join = _os.path.join |
| _split = _os.path.split |
| _is_file = _os.path.isfile |
| _is_dir = _os.path.isdir |
| |
| def _make_dir(dir): |
| if not _os.path.exists(dir): |
| _os.makedirs(dir) |
| |
| def _open_file(path, mode): |
| return _codecs.open(path, mode, "utf8", "replace", _buffer_size) |
| |
| def _read_file(path): |
| with _open_file(path, "r") as file: |
| return file.read() |
| |
| def _write_file(path, content): |
| _make_dir(_split(path)[0]) |
| |
| with _open_file(path, "w") as file: |
| return file.write(content) |
| |
| # Adapted from http://stackoverflow.com/questions/22078621/python-how-to-copy-files-fast |
| |
| _read_flags = _os.O_RDONLY |
| _write_flags = _os.O_WRONLY | _os.O_CREAT | _os.O_TRUNC |
| _eof = b"" |
| |
| def _copy_file(src, dst): |
| _make_dir(_split(dst)[0]) |
| |
| try: |
| fin = _os.open(src, _read_flags) |
| fout = _os.open(dst, _write_flags) |
| |
| for x in iter(lambda: _os.read(fin, _buffer_size), _eof): |
| _os.write(fout, x) |
| finally: |
| _os.close(fin) |
| _os.close(fout) |
| |
| def _format_repr(obj, *args): |
| cls = obj.__class__.__name__ |
| strings = [str(x) for x in args] |
| return "{}({})".format(cls, ",".join(strings)) |