| # Licensed to the Apache Software Foundation (ASF) under one or more |
| # contributor license agreements. See the NOTICE file distributed with |
| # this work for additional information regarding copyright ownership. |
| # The ASF licenses this file to You under the Apache License, Version 2.0 |
| # (the "License"); you may not use this file except in compliance with |
| # the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| |
| import traceback |
| import os |
| import sys |
| import re |
| from html.parser import HTMLParser |
| import urllib.parse as urlparse |
| |
# Matches an opening <a ...> tag (case-insensitive) and captures the raw
# attribute text that follows the tag name.
reHyperlink = re.compile(r'<a(\s+.*?)>', re.I)

# Matches a run of name=value attributes.  A value may be double-quoted,
# single-quoted or unquoted.  The single-quoted form previously used
# '[^']?' and so only accepted values of at most one character; it now
# mirrors the double-quoted form.
reAtt = re.compile(r"""(?:\s+([a-z]+)\s*=\s*("[^"]*"|'[^']*'|[^'"\s]+))+""", re.I)

# Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] /* any Unicode character, excluding the surrogate blocks, FFFE, and FFFF. */
# The pattern accepts a string only if it contains none of the control
# characters U+0000-U+0008, U+000B-U+000C, U+000E-U+001F, nor U+FFFE/U+FFFF.
reValidChar = re.compile("^[^\u0000-\u0008\u000B-\u000C\u000E-\u001F\uFFFE\uFFFF]*$")
| |
| # silly emacs: ' |
| |
class FindHyperlinks(HTMLParser):
    """HTML parser that records the anchors and hyperlinks of one page.

    While the page is fed through the parser it:
      * collects every <a name="..."> into the ``anchors`` set,
      * collects every <a href="..."> (resolved against ``baseURL``) into
        the ``links`` list,
      * keeps a stack of open tags and raises RuntimeError when a closing
        tag does not match the most recently opened one.
    """

    # Tags that are neither pushed on nor popped from the nesting stack.
    # NOTE: I don't think 'a' should be in here. But try debugging
    # NumericRangeQuery.html. (Could be javadocs bug, it's a generic type...)
    _UNTRACKED_TAGS = ('link', 'meta', 'frame', 'br', 'hr', 'p', 'li', 'img', 'col', 'a')

    # Anchor names that may legitimately repeat inside serialized-form.html.
    _REPEATABLE_ANCHORS = ('serializedForm',
                           'serialized_methods',
                           'readObject(java.io.ObjectInputStream)',
                           'writeObject(java.io.ObjectOutputStream)')

    def __init__(self, baseURL):
        """Create a parser for the page located at ``baseURL``."""
        super().__init__()
        self.stack = []        # currently open (tracked) tags, innermost last
        self.anchors = set()   # values of <a name=...> seen so far
        self.links = []        # absolute URLs built from <a href=...>
        self.baseURL = baseURL
        self.printed = False   # True once printFile() has emitted the header

    def handle_starttag(self, tag, attrs):
        """Track nesting for ``tag`` and, for <a>, record its name or href.

        Raises RuntimeError when an anchor name is repeated (outside the
        known serialized-form.html exceptions) or when an <a> tag carries
        neither a name nor an href.
        """
        if tag not in self._UNTRACKED_TAGS:
            self.stack.append(tag)
        if tag != 'a':
            return

        name = None
        href = None
        for attName, attValue in attrs:
            if attName == 'name':
                name = attValue
            elif attName == 'href':
                href = attValue

        if name is not None:
            # An <a> element is expected to be either an anchor or a link.
            assert href is None
            if name not in self.anchors:
                self.anchors.add(name)
            elif name in self._REPEATABLE_ANCHORS \
                    and self.baseURL.endswith('/serialized-form.html'):
                # Seems like a bug in Javadoc generation... you can't have
                # same anchor name more than once...
                pass
            else:
                self.printFile()
                raise RuntimeError('anchor "%s" appears more than once' % name)
        elif href is not None:
            href = href.strip()
            self.links.append(urlparse.urljoin(self.baseURL, href))
        elif self.baseURL.endswith('/AttributeSource.html'):
            # LUCENE-4010: AttributeSource's javadocs has an unescaped <A> generics!! Seems to be a javadocs bug... (fixed in Java 7)
            pass
        else:
            raise RuntimeError('couldn\'t find an href nor name in link in %s: only got these attrs: %s' % (self.baseURL, attrs))

    def handle_endtag(self, tag):
        """Pop ``tag`` from the nesting stack, raising RuntimeError on mismatch."""
        if tag in self._UNTRACKED_TAGS:
            return

        lineNo, offset = self.getpos()

        if not self.stack:
            # Nothing is open, so there is no stack entry to report; name the
            # tag itself (indexing the empty stack here raised IndexError).
            raise RuntimeError('%s %s:%s: saw </%s> no opening <%s>' % (self.baseURL, lineNo, offset, tag, tag))

        if self.stack[-1] != tag:
            raise RuntimeError('%s %s:%s: saw </%s> but expected </%s>' % (self.baseURL, lineNo, offset, tag, self.stack[-1]))

        self.stack.pop()

    def printFile(self):
        """Print the page URL once, as a header for subsequent messages."""
        if not self.printed:
            print()
            print(' ' + self.baseURL)
            self.printed = True
| |
def parse(baseURL, html):
    """Parse one HTML page and return its hyperlinks and anchors.

    baseURL is the URL of the page; it is used to resolve relative links
    and in diagnostics.  html is the page content as a string.

    Returns ``(links, anchors)`` as collected by FindHyperlinks.  If the
    content contains invalid characters, or parsing raises, a warning is
    printed, the module-level ``failures`` flag is set and ``([], [])`` is
    returned.
    """
    global failures

    # look for broken unicode
    if not reValidChar.match(html):
        print(' WARNING: invalid characters detected in: %s' % baseURL)
        failures = True
        return [], []

    parser = FindHyperlinks(baseURL)
    try:
        parser.feed(html)
        parser.close()
    except Exception:
        # Catch Exception rather than everything so that KeyboardInterrupt
        # and SystemExit still stop the run instead of being reported as a
        # parse failure.
        # TODO: Python's html.parser is now always lenient, which is no good for us: we want correct HTML in our javadocs
        parser.printFile()
        print(' WARNING: failed to parse %s:' % baseURL)
        traceback.print_exc(file=sys.stdout)
        failures = True
        return [], []

    return parser.links, parser.anchors
| |
# Module-wide flag: set to True as soon as parsing or link verification
# reports any problem.
failures = False


def checkAll(dirName):
    """
    Checks *.html (recursively) under this directory.

    dirName may also be the path of a single HTML file.  Every *.htm/*.html
    file found (except editor lock files starting with '.#' and
    deprecated-list.html) is parsed, then each collected link is verified:
    file: links must point at an existing file and, when they carry a
    fragment, at an anchor that file defines; selected apache.org http(s)
    and mailto links are flagged; unknown URL schemes are reported.

    Returns the module-level ``failures`` flag, which is True if any
    problem was reported during parsing or verification.
    """
    global failures

    # External URLs that are accepted as-is, by exact match ...
    okExternalExact = (
        'http://lucene.apache.org/core/',
        'http://lucene.apache.org/solr/',
        'http://lucene.apache.org/solr/resources.html',
    )
    # ... or by containing one of these fragments.
    okExternalParts = (
        'lucene.apache.org/java/docs/mailinglists.html',
        'lucene.apache.org/java/docs/discussion.html',
        'lucene.apache.org/core/discussion.html',
        'lucene.apache.org/solr/mirrors-solr-latest-redir.html',
        'lucene.apache.org/solr/quickstart.html',
    )

    # Find/parse all HTML files first
    print()
    print('Crawl/parse...')
    allFiles = {}

    if os.path.isfile(dirName):
        # Mimic a single os.walk() entry so one file can be checked alone.
        root, fileName = os.path.split(dirName)
        walker = ((root, [], [fileName]),)
    else:
        walker = os.walk(dirName)

    for root, dirs, files in walker:
        for f in files:
            main, ext = os.path.splitext(f)

            # maybe?:
            # and main not in ('serialized-form'):
            if ext.lower() not in ('.htm', '.html'):
                continue
            if f.startswith('.#') or main in ('deprecated-list',):
                # Somehow even w/ java 7 generated javadocs,
                # deprecated-list.html can fail to escape generics types
                continue

            # os.path.join keeps a bare file name relative when root is ''
            # (joining with '/' by hand turned it into '/name.html').
            localPath = os.path.join(root, f)
            fullPath = 'file:%s' % urlparse.quote(localPath.replace(os.path.sep, '/'))
            # parse and unparse the URL to "normalize" it
            fullPath = urlparse.urlunparse(urlparse.urlparse(fullPath))
            with open(localPath, encoding='UTF-8') as htmlFile:
                allFiles[fullPath] = parse(fullPath, htmlFile.read())

    # ... then verify:
    print()
    print('Verify...')
    for fullPath, (links, anchors) in allFiles.items():
        # Messages for this file; printed together under one header below.
        problems = []

        for origLink in links:
            # TODO: use urlparse?
            link, hashMark, anchor = origLink.partition('#')
            if not hashMark:
                anchor = None

            # remove any whitespace from the middle of the link
            link = ''.join(link.split())

            # drop any query string
            link = link.partition('?')[0]

            # TODO: normalize path sep for windows...
            if link.startswith(('http://', 'https://')):
                # don't check external links
                if link in okExternalExact or any(part in link for part in okExternalParts):
                    # OK
                    pass
                elif ('svn.apache.org' in link or 'lucene.apache.org' in link) \
                        and os.path.basename(fullPath) != 'Changes.html':
                    problems.append(' BAD EXTERNAL LINK: %s' % link)
            elif link.startswith('mailto:'):
                if '@lucene.apache.org' not in link and '@apache.org' in link:
                    problems.append(' BROKEN MAILTO (?): %s' % link)
            elif link.startswith('javascript:'):
                # ok...?
                pass
            elif 'org/apache/solr/client/solrj/beans/Field.html' in link:
                # see LUCENE-4011: this is a javadocs bug for constants
                # on annotations it seems?
                pass
            elif link.startswith('file:'):
                if link not in allFiles:
                    # Not one of the parsed pages: accept it if the target
                    # exists on disk (with or without the leading '/').
                    filepath = urlparse.unquote(urlparse.urlparse(link).path)
                    if not (os.path.exists(filepath) or os.path.exists(filepath[1:])):
                        problems.append(' BROKEN LINK: %s' % link)
                elif anchor is not None and anchor not in allFiles[link][1]:
                    problems.append(' BROKEN ANCHOR: %s' % origLink)
            else:
                problems.append(' BROKEN URL SCHEME: %s' % origLink)

        if problems:
            print()
            print(fullPath)
            for problem in problems:
                print(problem)
            failures = True

    return failures
| |
# Script entry point: check the path given as the first command-line
# argument and exit with status 1 if any problem was reported, 0 otherwise.
if __name__ == '__main__':
    foundProblems = checkAll(sys.argv[1])
    if foundProblems:
        print()
        print('Broken javadocs links were found!')
    sys.exit(1 if foundProblems else 0)
| |