| #!/usr/bin/env ruby |
| |
| # Licensed to the Apache Software Foundation (ASF) under one or more |
| # contributor license agreements. See the NOTICE file distributed with |
| # this work for additional information regarding copyright ownership. |
| # The ASF licenses this file to You under the Apache License, Version 2.0 |
| # (the "License"); you may not use this file except in compliance with |
| # the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| |
| =begin |
| Checks a download page URL for compliance with ASF guidelines. |
| |
| |
| Note: the GUI interface is currently at www/members/download_check.cgi |
| |
| =end |
| |
| $LOAD_PATH.unshift '/srv/whimsy/lib' |
| require 'whimsy/asf' |
| require 'wunderbar' |
| require 'net/http' |
| require 'nokogiri' |
| require 'time' |
| |
| =begin |
| Checks performed: (F=fatal, E=error, W=warn) |
| TBA |
| =end |
| |
| $CLI = false |
| $VERBOSE = false |
| |
| $ARCHIVE_CHECK = false |
| $ALWAYS_CHECK_LINKS = false |
| $NO_CHECK_LINKS = false |
| $NOFOLLOW = false # may be reset |
| $ALLOW_HTTP = false # http links generate Warning, not Error |
| $FAIL_FAST = false |
| $SHOW_LINKS = false |
| |
| $VERSION = nil |
| |
| # Check archives have hash and sig |
| $vercheck = {} # key = archive name, value = array of [type, hash/sig...] |
| # collect versions for summary display |
| $versions = Hash.new {|h1, k1| h1[k1] = Hash.new {|h2, k2| h2[k2] = Array.new} } # key = version, value = Hash, key = arch basename, value = array of [extensions] |
| |
| # match an artifact |
| # TODO detect artifacts by URL as well if possible |
| # $1 = base, $2 = extension |
| # OOO SF links end in /download |
| ARTIFACT_RE = %r{/([^/]+\.(pom|crate|tar|tar\.xz|tar\.gz|deb|nbm|dmg|sh|zip|tgz|far|tar\.bz2|jar|whl|war|msi|exe|rar|rpm|nar|xml|vsix))([&?]action=download|/download)?$} |
| |
| def init |
| # build a list of validation errors |
| @tests = [] |
| @fails = 0 |
| if $NO_CHECK_LINKS |
| $NOFOLLOW = true |
| I "Will not check links" |
| elsif $ALWAYS_CHECK_LINKS |
| I "Will check links even if download page has errors" |
| else |
| I "Will check links if download page has no errors" |
| end |
| I "Will %s archive.apache.org links in checks" % ($ARCHIVE_CHECK ? 'include' : 'not include') |
| end |
| |
| # save the result of a test |
| def test(severity, txt) |
| @tests << {severity => txt} |
| unless severity == :I or severity == :W |
| @fails += 1 |
| if $FAIL_FAST |
| puts txt |
| caller.each {|c| puts c} |
| exit! |
| end |
| end |
| end |
| |
| def F(txt) |
| test(:F, txt) |
| end |
| |
| def E(txt) |
| test(:E, txt) |
| end |
| |
| def W(txt) |
| test(:W, txt) |
| end |
| |
| def I(txt) |
| test(:I, txt) |
| end |
| |
| # extract test entries with key k |
| def tests(k) |
| @tests.map {|t| t[k]}.compact |
| end |
| |
| # extract test entries with key k |
| def testentries(k) |
| @tests.select {|t| t[k]}.compact |
| end |
| |
| def showList(list, header) |
| unless list.empty? |
| _h2_ header |
| _ul do |
| list.each { |item| _li item } |
| end |
| end |
| end |
| |
| def displayHTML |
| fatals = tests(:F) |
| errors = tests(:E) |
| warns = tests(:W) |
| |
| if !fatals.empty? |
| _h2_.bg_danger "The page at #{@url} failed our checks:" |
| elsif !errors.empty? |
| _h2_.bg_warning "The page at #{@url} has some problems:" |
| elsif !warns.empty? |
| _h2_.bg_warning "The page at #{@url} has some minor issues" |
| else |
| _h2_.bg_success "The page at #{@url} looks OK, thanks for using this service" |
| end |
| |
| if @fails > 0 |
| showList(fatals, "Fatal errors:") |
| showList(errors, "Errors:") |
| end |
| |
| showList(warns, "Warnings:") |
| |
| _h2_ 'Tests performed' |
| _ol do |
| @tests.each { |t| t.map {|k, v| _li "#{k}: - #{v}"}} |
| end |
| _h4_ 'F: fatal, E: Error, W: warning, I: info (success)' |
| end |
| |
| def check_url(url) |
| uri = URI.parse(url) |
| unless uri.scheme |
| W "No scheme for URL #{url}, assuming http" |
| uri = URI.parse("http:" + url) |
| end |
| return uri if %w{http https}.include? uri.scheme |
| raise ArgumentError.new("Unexpected url: #{url}") |
| end |
| |
| # Return uri, code|nil, response|error |
| def fetch_url(url, method=:head, depth=0, followRedirects=true) # string input |
| uri = URI.parse(url) |
| begin |
| Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == 'https') do |https| |
| case method |
| when :head |
| request = Net::HTTP::Head.new(uri.request_uri) |
| when :get |
| request = Net::HTTP::Get.new(uri.request_uri) |
| else |
| raise "Invalid method #{method}" |
| end |
| response = https.request(request) |
| if followRedirects and response.code =~ /^3\d\d/ |
| return uri, nil, "Too many redirects: #{depth} > 3" if depth > 3 |
| fetch_url response['location'], method, depth + 1 # string |
| else |
| return uri, response.code, response |
| end |
| end |
| rescue Exception => e |
| return uri, nil, e |
| end |
| end |
| |
| # Head an HTTP URL => uri, code, response |
| def HEAD(url) |
| puts ">> HEAD #{url}" if $VERBOSE |
| fetch_url(url, :head) |
| end |
| |
| # get an HTTP URL => response |
| def GET(url, followRedirects=true) |
| puts ">> GET #{url}" if $VERBOSE |
| fetch_url(url, :get, 0, followRedirects)[2] |
| end |
| |
| # Check page exists => uri, code, response|nil |
| def check_head_3(path, severity = :E, log=true) |
| uri, code, response = HEAD(path) |
| if code == '403' # someone does not like Whimsy? |
| W "HEAD #{path} - HTTP status: #{code} - retry" |
| uri, code, response = HEAD(path) |
| end |
| unless code == '200' |
| test(severity, "HEAD #{path} - HTTP status: #{code}") unless severity.nil? |
| return uri, code, nil |
| end |
| I "Checked HEAD #{path} - OK (#{code})" if log |
| return uri, code, response |
| end |
| |
| # Check page exists => response or nil |
| def check_head(path, severity = :E, log=true) |
| check_head_3(path, severity, log)[2] |
| end |
| |
| # check page can be read => body or response or nil |
| def check_page(path, severity=:E, log=true, returnRes=false, followRedirects=true) |
| response = GET(path, followRedirects) |
| code = response.code || '?' |
| unless code == '200' or (!followRedirects and code =~ /^3\d\d/) |
| test(severity, "GET #{path} - HTTP status: #{code}") unless severity.nil? |
| return nil |
| end |
| I "Checked GET #{path} - OK (#{code})" if log |
| puts "Fetched #{path} - OK (#{code})" if $CLI |
| return returnRes ? response : response.body |
| end |
| |
| def WE(msg) |
| if $ALLOW_HTTP |
| W msg |
| else |
| E msg |
| end |
| end |
| |
| # returns www|archive, stem and the hash extension |
| def check_hash_loc(h, tlp) |
| tlpQE = Regexp.escape(tlp) # in case of meta-chars |
| tlpQE = "(?:ooo|#{tlpQE})" if tlp == 'openoffice' |
| tlpQE = "(?:lucene|#{tlpQE})" if tlp == 'solr' # temporary override |
| tlpQE = "(?:tubemq|inlong)" if tlp == 'inlong' # renamed |
| tlpQE = "(?:hadoop/)?ozone" if tlp == 'ozone' # moved |
| if h =~ %r{^(https?)://(?:(archive|www)\.)?apache\.org/dist/(?:incubator/)?#{tlpQE}/.*?([^/]+)\.(\w{3,6})$} |
| WE "HTTPS! #{h}" unless $1 == 'https' |
| return $2 || '', $3, $4 # allow for no host before apache.org |
| # Allow // after .org (pulsar) |
| elsif h =~ %r{^(https?)://(downloads)\.apache\.org//?(?:incubator/)?#{tlpQE}/.*?([^/]+)\.(\w{3,6})$} |
| WE "HTTPS! #{h}" unless $1 == 'https' |
| return $2, $3, $4 |
| # https://repo1.maven.org/maven2/org/apache/shiro/shiro-spring/1.1.0/shiro-spring-1.1.0.jar.asc |
| elsif h =~ %r{^(https?)://repo1?\.(maven)(?:\.apache)?\.org/maven2/org/apache/#{tlpQE}/.+/([^/]+\.(?:jar|xml))\.(\w{3,6})$} # Maven |
| WE "HTTPS! #{h}" unless $1 == 'https' |
| W "Unexpected hash location #{h} for #{tlp}" unless ($vercheck[$3][0] rescue '') == 'maven' |
| return $2, $3, $4 |
| else |
| if h =~ %r{-bin-} |
| W "Unexpected bin hash location #{h} for #{tlp}" |
| else |
| E "Unexpected hash location #{h} for #{tlp}" |
| end |
| nil |
| end |
| end |
| |
| # get the https? links as Array of [href, text] |
| def get_links(path, body, checkSpaces=false) |
| doc = Nokogiri::HTML(body) |
| nodeset = doc.css('a[href]') # Get anchors w href attribute via css |
| nodeset.map { |node| |
| tmp = node.attribute("href").to_s |
| href = tmp.strip |
| if checkSpaces && tmp != href |
| W "Spurious space(s) in '#{tmp}'" |
| end |
| if href =~ %r{^?Preferred=https?://} |
| href = path + URI.decode_www_form_component(href) |
| end |
| text = node.text.gsub(/[[:space:]]+/, ' ').strip |
| [href, text] unless href =~ %r{/httpcomponents.+/xdoc/downloads.xml} # breadcrumb link to source |
| }.select {|x, _y| x =~ %r{^(https?:)?//} } |
| end |
| |
| VERIFY_TEXT = [ |
| 'the integrity of the downloaded files', |
| 'Verify Authenticity of Downloads', |
| 'verify the integrity', # commons has this as a link; perhaps try converting page to text only? |
| 'verify that checksums and signatures are correct', |
| '#verifying-signature', |
| 'check that the download has completed OK', |
| 'You should verify your download', |
| 'downloads can be verified', |
| 'www.apache.org/info/verification', |
| 'www.apache.org/dyn/closer.cgi#verify', |
| 'verify your mirrored downloads', |
| 'verify your downloads', |
| 'verify the downloaded files', |
| 'All downloads should be verified', |
| 'verification instructions', |
| ' encouraged to verify ', |
| 'To check a GPG signature', |
| 'To verify Hadoop', |
| 'Instructions for verifying your mirrored downloads', # fineract |
| 'How to verify the download?', # OOO |
| ] |
| |
| ALIASES = { |
| 'sig' => 'asc', |
| 'pgp' => 'asc', |
| 'gpg' => 'asc', |
| 'pgpasc' => 'asc', |
| 'sign' => 'asc', |
| 'signature' => 'asc', |
| 'signature(.asc)' => 'asc', |
| 'ascsignature' => 'asc', |
| 'pgpsignature' => 'asc', |
| 'pgpsignatures' => 'asc', |
| 'gpgsignature' => 'asc', |
| 'openpgpsignature' => 'asc', |
| } |
| |
| # Need to be able to check if download is for a PMC or podling |
| # parameter is the website URL |
| # Also want to convert site to TLP |
| URL2TLP = {} # URL host to TLP conversion |
| URL2TLP['jspwiki-wiki'] = 'jspwiki' # https://jspwiki-wiki.apache.org/Wiki.jsp?page=Downloads |
| URL2TLP['xmlbeans'] = 'poi' # xmlbeans now being maintained by POI |
| PMCS = Set.new # is this a TLP? |
| ASF::Committee.pmcs.map do |p| |
| site = p.site[%r{//(.+?)\.apache\.org}, 1] |
| name = p.name |
| URL2TLP[site] = name unless site == name |
| PMCS << name |
| end |
| |
| # Convert text reference to extension |
| # e.g. SHA256 => sha256; [SIG] => asc |
| def text2ext(txt) |
| # need to strip twice to handle ' [ asc ] ' |
| # TODO: perhaps just remove all white-space? |
| tmp = txt.downcase.strip.sub(%r{^\.}, '').sub(%r{^\[(.+)\]$}, '\1').sub('-', ''). |
| sub(/ ?(digest|checksum)/, '').sub(/ \(tar\.gz\)| \(zip\)| /, ''). |
| sub('(opens new window)', ''). # doris |
| strip |
| return 'sha256' if tmp =~ %r{\A[A-Fa-f0-9]{64}\z} |
| return 'sha512' if tmp =~ %r{\A[A-Fa-f0-9]{128}\z} |
| ALIASES[tmp] || tmp |
| end |
| |
| # Suite: perform all the HTTP checks |
| def checkDownloadPage(path, tlp, version) |
| begin |
| _checkDownloadPage(path.strip, tlp, version) |
| rescue Exception => e |
| F e |
| if $CLI |
| p e |
| puts e.backtrace |
| end |
| end |
| end |
| |
| def _checkDownloadPage(path, tlp, version) |
| isTLP = PMCS.include? tlp |
| if version == '' |
| I "Checking #{path} [#{tlp}] TLP #{isTLP} ..." |
| else |
| I "Checking #{path} [#{tlp}] TLP #{isTLP} for version #{version} only ..." |
| end |
| |
| # check the main body |
| if $ALLOW_JS |
| body = `/srv/whimsy/tools/render-page.js #{path}` |
| else |
| body = check_page(path) |
| end |
| |
| return unless body |
| |
| hasDisclaimer = body.gsub(%r{\s+}, ' ').include? 'Incubation is required of all newly accepted' |
| |
| if isTLP |
| W "#{tlp} has Incubator disclaimer" if hasDisclaimer |
| elsif hasDisclaimer |
| I "#{tlp} has Incubator disclaimer" |
| else |
| E "#{tlp} does not have Incubator disclaimer" |
| end |
| |
| # Some pages are mainly a single line (e.g. Hop) |
| # This make matching the appropriate match context tricky without traversing the DOM |
| body.scan(%r{(^.*?([^<>]+?(nightly|snapshot)[^<>]+?)).*$}i) do |m| |
| m.each do |n| |
| if n.size < 160 |
| if n =~ %r{API |/api/|-docs-} # snapshot docs Datasketches (Flink)? |
| W "Found reference to NIGHTLY or SNAPSHOT docs?: #{n}" |
| else |
| # ignore trafficcontrol bugfix message |
| unless n.include? "Fixed TO log warnings when generating snapshots" or |
| n.include? "Kafka Raft support for snapshots" or |
| n.include? "zkSnapshotC" or # ZooKeepeer |
| n.include? "/issues.apache.org/jira/browse/" # Daffodil |
| W "Found reference to NIGHTLY or SNAPSHOT builds: #{n}" |
| end |
| end |
| break |
| end |
| end |
| end |
| |
| if body.include? 'dist.apache.org' |
| E 'Page must not link to dist.apache.org' |
| else |
| I 'Page does not reference dist.apache.org' |
| end |
| |
| if body.include? 'repository.apache.org' |
| E 'Page must not link to repository.apache.org' |
| else |
| I 'Page does not reference repository.apache.org' |
| end |
| |
| deprecated = Time.parse('2018-01-01') |
| |
| links = get_links(path, body, true) |
| if links.size < 6 # source+binary * archive+sig+hash |
| E "Page does not have enough links: #{links.size} < 6 -- perhaps it needs JavaScript?" |
| end |
| |
| if $CLI |
| puts "Checking link syntax" |
| links.each do |h, t| |
| if h =~ %r{^([a-z]{3,6})://} |
| W "scheme? %s %s" % [h, t] unless %w(http https).include? $1 |
| else |
| W "syntax? %s %s" % [h, t] unless h.start_with? '//' |
| end |
| end |
| end |
| if $SHOW_LINKS |
| links.each {|l| p l} |
| end |
| |
| tlpQE = Regexp.escape(tlp) # in case of meta-chars |
| tlpQE = "(?:lucene|#{tlpQE})" if tlp == 'solr' # temporary override |
| # check KEYS link |
| # TODO: is location used by hc allowed, e.g. |
| # https://www.apache.org/dist/httpcomponents/httpclient/KEYS |
| expurl = "https://[downloads.|www.]apache.org/[dist/][incubator/]#{tlp}/KEYS" |
| expurlre = %r{^https://((www\.)?apache\.org/dist|downloads\.apache\.org)/(incubator/)?#{tlpQE}/KEYS$} |
| keys = links.select {|h, _v| h =~ expurlre} |
| if keys.size >= 1 |
| keyurl = keys.first.first |
| keytext = keys.first[1] |
| if keytext.include? 'KEYS' |
| I 'Found KEYS link' |
| else |
| W "Found KEYS: '#{keytext}'" |
| end |
| check_head(keyurl, :E) # log |
| else |
| keys = links.select {|h, v| h.end_with? 'KEYS' || v.strip == 'KEYS' || v == 'KEYS file' || v == '[KEYS]'} |
| if keys.size >= 1 |
| I 'Found KEYS link' |
| keyurl = keys.first.first |
| if keyurl =~ expurlre |
| I "KEYS links to #{expurl} as expected" |
| elsif keyurl =~ %r{^https://www\.apache\.org/dist/#{tlpQE}/[^/]+/KEYS$} |
| W "KEYS: expected: #{expurl}\n actual: #{keyurl}" |
| elsif keyurl =~ %r{^https://downloads\.apache\.org/#{tlpQE}/[^/]+/KEYS$} |
| W "KEYS: expected: #{expurl}\n actual: #{keyurl}" |
| else |
| E "KEYS: expected: #{expurl}\n actual: #{keyurl}" |
| end |
| check_head(keyurl, :E) # log |
| else |
| E 'Could not find KEYS link' |
| end |
| end |
| |
| hasGPGverify = false |
| # Check if GPG verify has two parameters |
| body.scan(%r{^.+gpg --verify.+$}) { |m| |
| hasGPGverify = true |
| unless m =~ %r{gpg --verify\s+\S+\.asc\s+\S+} |
| W "gpg verify should specify second param: #{m.strip} see:\nhttps://www.apache.org/info/verification.html#specify_both" |
| end |
| } |
| |
| # Look for incorrect gpg qualifiers |
| body.scan(%r{(gpg[[:space:]]+(.+?)(?:import|verify))}) { |m| |
| pfx = m[1] |
| unless pfx.sub(%r{<span[^>]*>}, '') == '--' |
| W "gpg requires -- before qualifiers, not #{pfx.inspect}: #{m[0].strip}" |
| end |
| } |
| |
| # check for verify instructions |
| bodytext = body.gsub(/\s+/, ' ') # single line |
| if VERIFY_TEXT.any? {|text| bodytext.include? text} |
| I 'Found reference to download verification' |
| elsif hasGPGverify |
| W 'Found reference to GPG verify; assuming this is part of download verification statement' |
| else |
| E 'Could not find statement of the need to verify downloads' |
| end |
| |
| # check if page refers to md5sum |
| body.scan(%r{^.+md5sum.+$}) {|m| |
| W "Found md5sum: #{m.strip}" |
| } |
| |
| links.each do |h, t| |
| # These might also be direct links to mirrors |
| if h =~ ARTIFACT_RE |
| base = File.basename($1) |
| # puts "base: " + base |
| if $vercheck[base] # might be two links to same archive |
| W "Already seen link for #{base}" |
| else |
| ext = $2 # save for use after RE match |
| $vercheck[base] = [h =~ %r{^https?://archive.apache.org/} ? 'archive' : (h =~ %r{https?://repo\d?\.maven(\.apache)?\.org/} ? 'maven' : 'live')] |
| unless $vercheck[base].first == 'archive' |
| stem = base[0..-(ext.size + 2)] |
| # version must include '.', e.g. xxx-m.n.oyyy |
| # Apache_OpenOffice-SDK_4.1.10_Linux_x86-64_install-deb_en-US |
| if stem =~ %r{^.+?[-_]v?(\d+(?:\.\d+)+)(.*)$} |
| # $1 = version |
| # $2 any suffix, e.g. -bin, -src (or other) |
| ver = $1 # main version |
| suff = $2 |
| # does version have a suffix such as beta1, M3 etc? |
| # jmeter needs _ here; brpc uses rc02 |
| if suff =~ %r{^(-RC\d+|-rc\d+|-incubating|-ALPHA|[-.]?M\d+|[-~]?(alpha|beta)\d?(?:-\d)?)} |
| ver += $1 |
| end |
| $versions[ver][stem] << ext |
| elsif stem =~ %r{netbeans-(\d+)-}i |
| $versions[$1][stem] << ext |
| else |
| W "Cannot parse #{stem} for version" |
| end |
| end |
| end |
| # Text must include a '.' (So we don't check 'Source') |
| if t.include?('.') and base != File.basename(t.sub(/[Mm]irrors? for /, '').strip) |
| # text might be short version of link |
| tmp = t.strip.sub(%r{.*/}, '') # |
| if base == tmp |
| W "Mismatch?: #{h} and '#{t}'" |
| elsif base.end_with? tmp |
| W "Mismatch?: #{h} and '#{tmp}'" |
| elsif base.sub(/-bin\.|-src\./, '.').end_with? tmp |
| W "Mismatch?: #{h} and '#{tmp}'" |
| else |
| W "Mismatch2: #{h}\n link: '#{base}'\n text: '#{tmp}'" |
| end |
| end |
| end |
| end |
| |
| links.each do |h, t| |
| # Must occur before mirror check below |
| # match all hashes and sigs here (invalid locations are detected later) |
| if h =~ %r{^https?://.+?/([^/]+\.(asc|sha\d+|md5|sha|mds))$} |
| base = File.basename($1) |
| ext = $2 |
| stem = base[0..-(2 + ext.length)] |
| if $vercheck[stem] |
| $vercheck[stem] << ext |
| else |
| E "Bug: found hash #{h} for missing artifact #{stem}" |
| end |
| t.sub!('âžš', '').strip! # age |
| next if t == '' # empire-db |
| tmp = text2ext(t) |
| next if ext == tmp # i.e. link is just the type or [TYPE] |
| next if ext == 'sha' and tmp == 'sha1' # historic |
| next if %w(sha256 md5 mds sha512 sha1).include?(ext) and %w(SHA digest Digest CheckSum checksums).include?(t) # generic |
| next if ext == 'mds' and (tmp == 'hashes' or t == 'Digests') |
| if base != t |
| if t == 'Download' # MXNet |
| W "Mismatch: #{h} and '#{t}'" |
| elsif not %w{checksum Hash}.include? t |
| if h =~ %r{^https?://archive\.apache\.org/dist/} # only warn for archives |
| W "Mismatch: #{h} and '#{t}'" |
| else |
| E "Mismatch: #{h} and '#{t}'" |
| end |
| end |
| end |
| end |
| end |
| |
| |
| # did we find all required elements? |
| $vercheck.each do |k, w| |
| v = w.dup |
| typ = v.shift |
| unless v.include? "asc" and v.any? {|e| e =~ /^sha\d+$/ or e == 'md5' or e == 'sha' or e == 'mds'} |
| if typ == 'live' |
| E "#{k} missing sig/hash: (found only: #{v.inspect})" |
| elsif typ == 'archive' || typ == 'maven' # Maven does not include recent hash types; so warn only |
| W "#{k} missing sig/hash: (found only: #{v.inspect})" |
| else |
| E "#{k} missing sig/hash: (found only: #{v.inspect}) TYPE=#{typ}" |
| end |
| end |
| W "#{k} Prefer SHA* over MDS #{v.inspect}" if typ == 'live' && v.include?('mds') && v.none? {|e| e =~ /^sha\d+$/} |
| end |
| |
| if @fails > 0 and not $ALWAYS_CHECK_LINKS |
| W "** Not checking links **" |
| $NOFOLLOW = true |
| end |
| |
| # Still check links if versions not seen |
| if $versions.size == 0 |
| E "Could not detect any artifact versions -- perhaps it needs JavaScript?" |
| end |
| |
| # Check if the links can be read |
| |
| links.each do |h, t| |
| if h =~ %r{\.(asc|sha256|sha512)$} |
| host, _stem, _ext = check_hash_loc(h, tlp) |
| if host == 'archive' |
| if $ARCHIVE_CHECK |
| check_head(h, :E) # log |
| else |
| I "Ignoring archived hash #{h}" |
| end |
| elsif host |
| if $NOFOLLOW |
| I "Skipping artifact hash #{h}" |
| else |
| uri, _code, _response = check_head_3(h, :E) # log |
| unless uri.to_s == h |
| h1 = h.sub(%r{//(www\.)?apache\.org/dist/}, '//downloads.apache.org/') |
| unless uri.to_s == h1 |
| W "Redirected hash: #{h} => #{uri}" |
| end |
| end |
| end |
| else |
| # will have been reported by check_hash_loc |
| end |
| elsif h =~ ARTIFACT_RE |
| name = $1 |
| _ext = $2 |
| if h =~ %r{https?://archive\.apache\.org/} |
| unless $ARCHIVE_CHECK |
| I "Ignoring archived artifact #{h}" |
| next |
| end |
| end |
| # Ideally would like to check for use of closer.lua/.cgi, but some projects pre-populate the pages |
| # TODO: would it help to check host against mirrors.list? |
| if h =~ %r{https?://(www\.)?apache\.org/dist} or h =~ %r{https?://downloads.apache.org/} |
| E "Must use mirror system #{h}" |
| next |
| elsif h =~ %r{https?://repo\d\.maven\.org/.+(-src|-source)} |
| E "Must use mirror system for source #{h}" |
| next |
| end |
| if $NOFOLLOW |
| I "Skipping artifact #{h}" |
| next |
| end |
| res = check_head(h, :E, false) # nolog |
| next unless res |
| # if HEAD returns content_type and length it's probably a direct link |
| ct = res.content_type |
| cl = res.content_length |
| if ct and cl |
| I "#{h} OK: #{ct} #{cl}" |
| else # need to try to download the mirror page |
| path = nil |
| # action=download needs special handling |
| if h =~ %r{^https?://(www\.)?apache\.org/dyn/.*action=download} |
| res = check_page(h, :E, false, true, false) |
| next unless res |
| unless res.code =~ /^3\d\d$/ |
| E "Expected redirect, got #{res.code}" |
| next |
| end |
| path = res['Location'] or E("Could not extract Location from #{h}") |
| else |
| bdy = check_page(h, :E, false) |
| if bdy |
| lks = get_links(path, bdy) |
| lks.each do |l, _t| |
| # Don't want to match archive server (closer.cgi defaults to it if file is not found) |
| if l.end_with?(name) and l !~ %r{//archive\.apache\.org/} |
| path = l |
| break |
| end |
| end |
| if bdy.include? 'The object is in our archive' |
| W "File is archived: '#{name}' in page: '#{h}'" |
| next |
| end |
| end |
| end |
| if path |
| res = check_head(path, :E, false) # nolog |
| next unless res |
| ct = res.content_type |
| cl = res.content_length |
| if ct and cl |
| I "OK: #{ct} #{cl} #{path}" |
| elsif cl |
| I "NAK: ct='#{ct}' cl='#{cl}' #{path}" |
| else |
| E "NAK: ct='#{ct}' cl='#{cl}' #{path}" |
| end |
| else |
| E "Could not find link for '#{name}' in page: '#{h}' (missing)" |
| end |
| end |
| elsif h =~ %r{\.(md5|sha\d*)$} |
| host, stem, _ext = check_hash_loc(h, tlp) |
| if $NOFOLLOW |
| I "Skipping deprecated hash #{h}" |
| next |
| end |
| if %w{www downloads archive maven}.include?(host) or host == '' |
| next unless $ARCHIVE_CHECK or host != 'archive' |
| res = check_head(h, :E, false) # nolog |
| next unless res |
| lastmod = res['last-modified'] |
| date = Time.parse(lastmod) |
| # Check if older than 2018? |
| if date < deprecated |
| I "Deprecated hash found #{h} #{t}; however #{lastmod} is older than #{deprecated}" |
| # OK |
| else |
| unless host == 'maven' and stem.end_with? '.jar' # Maven has yet to be upgraded... |
| W "Deprecated hash found #{h} #{t} - do not use for current releases #{lastmod}" |
| end |
| end |
| else |
| E "Unhandled host: #{host} in #{h}" |
| end |
| elsif h =~ %r{/KEYS$} or t == 'KEYS' |
| # already handled |
| elsif h =~ %r{^https?://www\.apache\.org/?(licenses/.*|foundation/.*|events/.*)?$} |
| # standard links |
| elsif h =~ %r{https?://people.apache.org/phonebook.html} |
| elsif h.start_with? 'https://cwiki.apache.org/confluence/' |
| # Wiki |
| elsif h.start_with? 'https://wiki.apache.org/' |
| # Wiki |
| elsif h.start_with? 'https://svn.apache.org/' |
| # E "Public download pages should not link to unreleased code: #{h}" # could be a sidebar/header link |
| elsif h =~ %r{^https?://(archive|www)\.apache\.org/dist/} |
| W "Not yet handled #{h} #{t}" unless h =~ /RELEASE[-_]NOTES/ or h =~ %r{^https?://archive.apache.org/dist/#{tlpQE}/} |
| else |
| # Ignore everything else? |
| end |
| end |
| |
| end |
| |
| def getTLP(url) # convert URL to TLP/podling |
| if url =~ %r{^https?://cwiki\.apache\.org/confluence/display/(\S+)/} |
| tlp = $1.downcase |
| elsif url =~ %r{^https?://([^.]+)(\.incubator|\.us|\.eu)?\.apache\.org/} |
| tlp = URL2TLP[$1] || $1 |
| elsif url =~ %r{^https?://([^.]+)\.openoffice\.org/} |
| tlp = 'openoffice' |
| else |
| tlp = nil |
| F "Unknown TLP for URL #{url}" |
| end |
| tlp |
| end |
| |
| # Called by GUI when POST is pushed |
| def doPost(options) |
| $ALWAYS_CHECK_LINKS = options[:checklinks] |
| $NO_CHECK_LINKS = options[:nochecklinks] |
| $ARCHIVE_CHECK = options[:archivecheck] |
| init |
| url = options[:url] |
| tlp = options[:tlp] |
| tlp = getTLP(url) if tlp == '' |
| if tlp |
| checkDownloadPage(url, tlp, options[:version]) |
| end |
| displayHTML |
| end |
| |
| |
| if __FILE__ == $0 |
| $CLI = true |
| $VERBOSE = true |
| $ALWAYS_CHECK_LINKS = ARGV.delete '--always' |
| $NO_CHECK_LINKS = ARGV.delete '--nolinks' |
| $ARCHIVE_CHECK = ARGV.delete '--archivecheck' |
| $ALLOW_HTTP = ARGV.delete '--http' |
| $FAIL_FAST = ARGV.delete '--ff' |
| $SHOW_LINKS = ARGV.delete '--show-links' |
| $ALLOW_JS = ARGV.delete '--js-allow' |
| |
| # check for any unhandled options |
| ARGV.each do |arg| |
| if arg.start_with? '--' |
| raise ArgumentError.new("Invalid option #{arg}; expecting always|nolinks|archivecheck|http|ff|show-links") |
| end |
| end |
| |
| init |
| |
| version = '' |
| url = ARGV[0] |
| if ARGV.size == 1 |
| tlp = getTLP(url) |
| else |
| tlp = ARGV[1] |
| version = ARGV[2] || '' |
| end |
| |
| checkDownloadPage(url, tlp, version) |
| |
| # display the test results as text |
| puts "" |
| puts "=================" |
| puts "" |
| @tests.each { |t| t.map {|k, v| puts "#{k}: - #{v}"}} |
| puts "" |
| testentries(:W).each { |t| t.map {|k, v| puts "#{k}: - #{v}"}} |
| testentries(:E).each { |t| t.map {|k, v| puts "#{k}: - #{v}"}} |
| testentries(:F).each { |t| t.map {|k, v| puts "#{k}: - #{v}"}} |
| puts "" |
| |
| # Only show in CLI version for now |
| puts "Version summary" |
| $versions.sort.each do |k, v| |
| puts k |
| v.sort.each do |l, w| |
| puts " #{l} #{w}" |
| end |
| end |
| puts "" |
| |
| if @fails > 0 |
| puts "NAK: #{url} had #{@fails} errors" |
| else |
| puts "OK: #{url} passed all the tests" |
| end |
| puts "" |
| |
| end |