Add initial draft of download page checker
diff --git a/tools/download_check.rb b/tools/download_check.rb
new file mode 100755
index 0000000..ccfcb80
--- /dev/null
+++ b/tools/download_check.rb
@@ -0,0 +1,576 @@
+#!/usr/bin/env ruby
+
+=begin
+Checks a download page URL for compliance with ASF guidelines.
+
+Note: the GUI interface is currently at www/members/download_check.cgi
+
+=end
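+
+# Example CLI usage (illustrative URL; options are parsed at the bottom of this file):
+#   tools/download_check.rb https://tomcat.apache.org/download-90.cgi --nolinks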
+
+require 'wunderbar'
+require 'net/http'
+require 'nokogiri'
+require 'time'
+
+=begin
+Checks performed: (F=fatal, E=error, W=warn)
+TBA
+=end
+
+$SAFE = 1 unless __FILE__ == $0 # enable taint checking when loaded by the CGI
+
+$CLI = false
+$VERBOSE = false
+
+$ARCHIVE_CHECK = false
+$ALWAYS_CHECK_LINKS = false
+$NO_CHECK_LINKS = false
+$NOFOLLOW = false # may be reset
+
+$VERSION = nil
+
+# match an artifact
+ARTIFACT_RE = %r{/([^/]+\.(tar|tar\.gz|zip|tgz|tar\.bz2|jar|war))$}
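+# e.g. '/foo/bar-1.0-src.tar.gz' matches, with $1 = 'bar-1.0-src.tar.gz' and $2 = 'tar.gz'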
+
+def init
+ # build a list of validation errors
+ @tests = []
+ @fails = 0
+ if $NO_CHECK_LINKS
+ $NOFOLLOW = true
+ I "Will not check links"
+ else
+ if $ALWAYS_CHECK_LINKS
+ I "Will check links even if download page has errors"
+ else
+ I "Will check links if download page has no errors"
+ end
+ end
+ I "Will %s archive.apache.org links in checks" % ($ARCHIVE_CHECK ? 'include' : 'not include')
+end
+
+# save the result of a test
+def test(severity, txt)
+ @tests << {severity => txt}
+  @fails += 1 unless severity == :I or severity == :W
+end
+
+def F(txt)
+ test(:F, txt)
+end
+
+def E(txt)
+ test(:E, txt)
+end
+
+def W(txt)
+ test(:W, txt)
+end
+
+def I(txt)
+ test(:I, txt)
+end
+
+# extract the messages of test entries with severity k
+def tests(k)
+ @tests.map{|t| t[k]}.compact
+end
+
+# extract the full test entries with severity k
+def testentries(k)
+  @tests.select{|t| t[k]}
+end
+
+def showList(list, header)
+ unless list.empty?
+ _h2_ header
+ _ul do
+ list.each { |item| _li item }
+ end
+ end
+end
+
+def displayHTML
+ fatals = tests(:F)
+ errors = tests(:E)
+ warns = tests(:W)
+
+ if !fatals.empty?
+ _h2_.bg_danger "The page at #@url failed our checks:"
+ elsif !errors.empty?
+ _h2_.bg_warning "The page at #@url has some problems:"
+ elsif !warns.empty?
+ _h2_.bg_warning "The page at #@url has some minor issues"
+ else
+ _h2_.bg_success "The page at #@url looks OK, thanks for using this service"
+ end
+
+ if @fails > 0
+ showList(fatals, "Fatal errors:")
+ showList(errors, "Errors:")
+    # Footnote-style links are used so the text remains usable when copy/pasted into e.g. a JIRA issue or e-mail
+ _p do
+ _ 'Please see the Apache download page configuration instructions [1] for further details on configuring your mirror server.'
+ end
+ _p do
+ _ '[1] '
+ _a 'TBA', href: 'TBA'
+ end
+ end
+
+ showList(warns, "Warnings:")
+
+ _h2_ 'Tests performed'
+ _ol do
+ @tests.each { |t| t.map{|k,v| _li "#{k}: - #{v}"}}
+ end
+  _h4_ 'F: fatal, E: error, W: warning, I: info (success)'
+end
+
+# HEAD an HTTP URL => response
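+# N.B. Net::HTTP does not follow redirects; check_closer_down relies on this
+# to see the expected 302 response from the mirror download URL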
+def HEAD(url)
+ puts ">> HEAD #{url}" if $VERBOSE
+ url.untaint
+ uri = URI.parse(url)
+ unless uri.scheme
+ W "No scheme for URL #{url}, assuming http"
+ uri = URI.parse("http:"+url)
+ end
+ http = Net::HTTP.new(uri.host, uri.port)
+ http.use_ssl = uri.scheme == 'https'
+ request = Net::HTTP::Head.new(uri.request_uri)
+ http.request(request)
+end
+
+# GET an HTTP URL => response
+def GET(url)
+ puts ">> GET #{url}" if $VERBOSE
+ url.untaint
+ uri = URI.parse(url).untaint
+ unless uri.scheme
+ W "No scheme for URL #{url}, assuming http"
+ uri = URI.parse("http:"+url).untaint
+ end
+ http = Net::HTTP.new(uri.host, uri.port)
+ http.use_ssl = uri.scheme == 'https'
+ request = Net::HTTP::Get.new(uri.request_uri)
+ http.request(request.untaint)
+end
+
+# Check page exists
+def check_head(path, severity = :E, expectedStatus = "200", log=true)
+ response = HEAD(path)
+ code = response.code || '?'
+ if code != expectedStatus
+ test(severity, "HEAD #{path} - HTTP status: #{code} expected: #{expectedStatus}") unless severity == nil
+ return nil
+ end
+ I "Checked HEAD #{path} - OK (#{code})" if log
+ response
+end
+
+# check page can be read => body (or the raw response when a non-200 status is expected)
+def check_page(path, severity=:E, expectedStatus="200", log=true)
+ response = GET(path)
+ code = response.code || '?'
+ if code != expectedStatus
+ test(severity, "Fetched #{path} - HTTP status: #{code} expected: #{expectedStatus}") unless severity == nil
+ return nil
+ end
+ I "Fetched #{path} - OK (#{code})" if log
+ puts "Fetched #{path} - OK (#{code})" if $CLI
+ if code == '200'
+ return response.body
+ else
+ return response
+ end
+end
+
+# Check closer/download page
+def check_closer_down(url)
+ # N.B. HEAD does not work; it returns success
+  res = check_page(url, :E, "302", false)
+  return unless res # status mismatch has already been reported
+  loc = res['location']
+  res = check_head(loc, :E, "200", false)
+ return unless res
+ ct = res.content_type
+ cl = res.content_length
+ if ct and cl
+ I "Checked #{url} OK - ct=#{ct} cl=#{cl}"
+  elsif cl.to_i > 0 # content_length may be nil
+ W "Possible issue with #{url} ct=#{ct} cl=#{cl}"
+ else
+ E "Problem with #{url} ct=#{ct} cl=#{cl}"
+ end
+end
+
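+# For example (hypothetical project 'foo'):
+#   check_hash_loc('https://www.apache.org/dist/foo/1.0/bar.tar.gz.sha512', 'foo')
+#   # => ['www', 'bar.tar.gz', '.sha512']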
+# returns the host prefix ('www', 'archive' or '' if none), the stem and the hash extension
+def check_hash_loc(h,tlp)
+  if h =~ %r{^(https?)://(?:(archive|www)\.)?apache\.org/dist/(?:incubator/)?#{tlp}/.*?([^/]+)(\.(\w{3,6}))$}
+    E "Not HTTPS: #{h}" unless $1 == 'https'
+    return $2 || '', $3, $4
+ else
+ E "Unexpected hash location #{h} for #{tlp}"
+ nil
+ end
+end
+
+# get the https? links as Array of [href,text]
+def get_links(body)
+  doc = Nokogiri::HTML(body)
+  nodeset = doc.css('a[href]') # get anchors with an href attribute via CSS
+  nodeset.map {|node|
+    href = node.attribute("href").to_s
+    text = node.text.gsub(/[[:space:]]+/,' ')
+    [href,text]
+  }.select{|href,_| href =~ %r{^(https?:)?//} }
+end
+
+VERIFY_TEXT = [
+ 'the integrity of the downloaded files',
+ 'verify the integrity', # commons has this as a link; perhaps try converting page to text only?
+ 'verify that checksums and signatures are correct',
+ '#verifying-signature',
+ 'check that the download has completed OK',
+ 'You should verify your download',
+ 'downloads can be verified',
+ 'www.apache.org/info/verification.html',
+ 'verify your mirrored downloads',
+ 'verify your downloads',
+ 'All downloads should be verified',
+]
+
+ALIASES = {
+ 'sig' => 'asc',
+ 'pgp' => 'asc',
+}
+# Convert text reference to extension
+# e.g. SHA256 => sha256; [SIG] => asc
+def text2ext(txt)
+ if txt.size <= 10
+ tmp = txt.downcase.sub(%r{^\[(.+)\]$},'\1')
+ ALIASES[tmp] || tmp
+ else
+ txt
+ end
+end
+
+# Suite: perform all the HTTP checks
+def checkDownloadPage(path, tlp, version)
+ begin
+ _checkDownloadPage(path.strip, tlp, version)
+  rescue Exception => e # deliberately broad: report any failure as fatal
+ F e
+ if $CLI
+ p e
+ puts e.backtrace
+ end
+ end
+end
+
+def _checkDownloadPage(path, tlp, version)
+ if version != ''
+ I "Checking #{path} [#{tlp}] for version #{version} only ..."
+ else
+ I "Checking #{path} [#{tlp}] ..."
+ end
+
+ # check the main body
+ if path.start_with? 'http'
+ body = check_page(path)
+ else
+ file = path
+ if file.start_with? '~'
+ file = ENV['HOME'] + file[1..-1]
+ end
+ body = File.read(file.untaint)
+ end
+
+ return unless body
+
+ if body.include? 'dist.apache.org'
+ E 'Page must not link to dist.apache.org'
+ else
+ I 'Page does not reference dist.apache.org'
+ end
+
+ if body.include? 'repository.apache.org'
+ E 'Page must not link to repository.apache.org'
+ else
+ I 'Page does not reference repository.apache.org'
+ end
+
+ deprecated = Time.parse('2018-01-01')
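+  # MD5 and SHA-1 hashes are deprecated; hash files last modified before this
+  # date predate the policy change, so they are reported as info, not errors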
+
+ links = get_links(body)
+
+ # check KEYS link
+ # TODO: is location used by hc allowed, e.g.
+ # https://www.apache.org/dist/httpcomponents/httpclient/KEYS
+ expurl = "https://[www.]apache.org/dist/[incubator/]#{tlp}/KEYS"
+ expurlre = %r{^https://(www\.)?apache\.org/dist/(incubator/)?#{tlp}/KEYS$}
+ keys = links.select{|h,v| v.strip == 'KEYS' || v == 'KEYS file' || v == '[KEYS]'}
+ if keys.size >= 1
+ I 'Found KEYS link'
+ keyurl = keys.first.first
+ if keyurl =~ expurlre
+ I "KEYS links to #{expurl} as expected"
+ else
+ if keyurl =~ %r{^https://www\.apache\.org/dist/#{tlp}/[^/]+/KEYS$}
+ W "KEYS: expected: #{expurl}\n actual: #{keyurl}"
+ else
+ E "KEYS: expected: #{expurl}\n actual: #{keyurl}"
+ end
+ end
+ else
+ E 'Could not find KEYS link'
+ end
+
+ # check for verify instructions
+ bodytext = body.gsub(/\s+/,' ') # single line
+ if VERIFY_TEXT.any? {|text| bodytext.include? text}
+ I 'Found reference to download verification'
+ else
+ E 'Could not find statement of the need to verify downloads'
+ end
+
+ # Check if GPG verify has two parameters
+ body.scan(%r{^.+gpg --verify.+$}){|m|
+ unless m =~ %r{gpg --verify\s+\S+\.asc\s+\S+}
+ W "gpg verify without second param: #{m.strip}"
+ end
+ }
+
+ # check if page refers to md5sum
+ body.scan(%r{^.+md5sum.+$}){|m|
+ W "Found md5sum: #{m.strip}"
+ }
+
+ # Check archives have hash and sig
+ vercheck = Hash.new() # key = archive name, value = array of hash/sig
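+  # e.g. {'bar-1.0.tar.gz' => ['asc', 'sha512']} once both links have been seen (illustrative)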
+ links.each do |h,t|
+ # Must occur before mirror check below
+    if h =~ %r{^https?://www\.apache\.org/dist/(.+\.(asc|sha\d+|md5))$}
+ base = File.basename($1)
+ ext = $2
+ stem = base[0..-(2+ext.length)]
+ if vercheck[stem]
+ vercheck[stem] << ext
+ else
+ E "Bug: found hash for missing artifact #{stem}"
+ end
+ tmp = text2ext(t)
+ next if ext == tmp # i.e. link is just the type or [TYPE]
+      unless base == t
+        E "Mismatch: #{h} and #{t}"
+      end
+ # These might also be direct links to mirrors
+ elsif h =~ ARTIFACT_RE
+ base = File.basename($1)
+ # puts "base: " + base
+ if vercheck[base] # might be two links to same archive
+ W "Already seen link for #{base}"
+ else
+ vercheck[base] = []
+ end
+ # Text must include a '.' (So we don't check 'Source')
+ if t.include?('.') and not base == t
+ # text might be short version of link
+        tmp = t.strip.sub(%r{.*/},'')
+ if base == tmp
+ W "Mismatch?: #{h} and #{t}"
+ else
+ E "Mismatch: #{h} and '#{tmp}'"
+ end
+ end
+ end
+ end
+
+ # did we find all required elements?
+ vercheck.each do |k,v|
+ unless v.include? "asc" and v.any? {|e| e =~ /^sha\d+$/ or e == 'md5'}
+ E "#{k} missing sig/hash: #{v.inspect}"
+ end
+ end
+
+ if @fails > 0 and not $ALWAYS_CHECK_LINKS
+ W "** Not checking links **"
+ $NOFOLLOW = true
+ end
+
+ links.each do |h,t|
+ if h =~ %r{\.(asc|sha256|sha512)$}
+ host, stem, ext = check_hash_loc(h,tlp)
+ if host == 'archive'
+ I "Ignoring archive hash #{h}"
+ elsif host
+ if $NOFOLLOW
+        I "Skipping hash #{h}"
+ else
+ check_head(h, :E, "200", true)
+ end
+ else
+ # will have been reported by check_hash_loc
+ end
+ # mirror downloads need to be treated differently
+    elsif h =~ %r{^https?://www\.apache\.org/dyn/.*action=download}
+ if $NOFOLLOW
+ I "Skipping download artifact #{h}"
+ else
+ check_closer_down(h)
+ end
+ elsif h =~ ARTIFACT_RE
+ if $NOFOLLOW
+        I "Skipping artifact #{h}"
+ next
+ end
+ name = $1
+ ext = $2
+ if h =~ %r{https?://archive\.apache\.org/}
+ I "Ignoring archive artifact #{h}"
+ next
+ end
+ if h =~ %r{https?://(www\.)?apache\.org/dist}
+ E "Must use mirror system #{h}"
+ next
+ end
+ res = check_head(h, :E, "200", false)
+ next unless res
+ # if HEAD returns content_type and length it's probably a direct link
+ ct = res.content_type
+ cl = res.content_length
+ if ct and cl
+ I "#{h} OK: #{ct} #{cl}"
+ else # need to try to download the mirror page
+ path = nil
+ bdy = check_page(h, :E, "200", false)
+ if bdy
+ lks = get_links(bdy)
+ lks.each do |l,t|
+ if l.end_with? name
+ path = l
+ break
+ end
+ end
+ end
+ if path
+ res = check_head(path, :E, "200", false)
+ next unless res
+ ct = res.content_type
+ cl = res.content_length
+ if ct and cl
+ I "OK: #{ct} #{cl} #{path}"
+ else
+ E "NAK: #{ct} #{cl} #{path}"
+ end
+ else
+ E "Could not find link for #{name} in #{h}"
+ end
+ end
+ elsif h =~ %r{\.(md5|sha.*)$}
+ host,_,_ = check_hash_loc(h,tlp)
+ if $NOFOLLOW
+ I "Skipping deprecated hash #{h}"
+ next
+ end
+ if host == 'www' or host == ''
+ res = check_head(h,:E, "200", false)
+ next unless res
+ lastmod = res['last-modified']
+ date = Time.parse(lastmod)
+ # Check if older than 2018?
+ if date < deprecated
+ I "Deprecated hash found #{h} #{t}; however #{lastmod} is older than #{deprecated}"
+ # OK
+ else
+ E "Deprecated hash found #{h} #{t} - do not use for current releases #{lastmod}"
+ end
+ end
+ elsif h =~ %r{/KEYS$} or t == 'KEYS'
+ # already handled
+ elsif h =~ %r{^https?://www\.apache\.org/?(licenses/.*|foundation/.*|events/.*)?$}
+ # standard links
+    elsif h =~ %r{https?://people\.apache\.org/phonebook\.html}
+ elsif h.start_with? 'https://cwiki.apache.org/confluence/'
+ # Wiki
+ elsif h.start_with? 'https://wiki.apache.org/'
+ # Wiki
+ elsif h.start_with? 'https://svn.apache.org/'
+ # E "Public download pages should not link to unreleased code: #{h}" # could be a sidebar/header link
+ elsif h =~ %r{^https?://(archive|www)\.apache\.org/dist/}
+      W "Not yet handled #{h} #{t}" unless h =~ /RELEASE[-_]NOTES/ or h =~ %r{^https?://archive\.apache\.org/dist/#{tlp}/}
+ else
+ # Ignore everything else?
+ end
+ end
+
+end
+
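+# Work out the TLP name from a project URL
+# e.g. getTLP('https://tomcat.apache.org/download.cgi') => 'tomcat'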
+def getTLP(url)
+ if url =~ %r{^https?://([^.]+)(\.incubator)?\.apache\.org/}
+ tlp = $1
+ tlp = 'httpcomponents' if tlp == 'hc'
+ elsif url =~ %r{^https?://([^.]+)\.openoffice\.org/}
+ tlp = 'openoffice'
+ else
+ tlp = nil
+ F "Unknown TLP for URL #{url}"
+ end
+ tlp
+end
+
+# Called by GUI when POST is pushed
+def doPost(options)
+ $ALWAYS_CHECK_LINKS = options[:checklinks]
+ $NO_CHECK_LINKS = options[:nochecklinks]
+ $ARCHIVE_CHECK = options[:archivecheck]
+ init
+ url = options[:url]
+ tlp = getTLP(url)
+ if tlp
+ checkDownloadPage(url, tlp, options[:version])
+ end
+ displayHTML
+end
+
+
+if __FILE__ == $0
+  $CLI = true
+  $VERBOSE = true
+ $ALWAYS_CHECK_LINKS = ARGV.delete '--always'
+ $NO_CHECK_LINKS = ARGV.delete '--nolinks'
+ $ARCHIVE_CHECK = ARGV.delete '--archivecheck'
+
+  if ARGV.empty?
+    puts "Usage: download_check.rb URL [TLP [VERSION]] [--always] [--nolinks] [--archivecheck]"
+    exit 1
+  end
+
+  version = ''
+  if ARGV.size == 1
+    url = ARGV[0]
+    tlp = getTLP(url)
+  else
+    url = ARGV[0]
+    tlp = ARGV[1]
+    version = ARGV[2] || ''
+  end
+
+ init
+
+ checkDownloadPage(url, tlp, version)
+
+ # display the test results as text
+ puts ""
+ puts "================="
+ puts ""
+ @tests.each { |t| t.map{|k, v| puts "#{k}: - #{v}"}}
+ puts ""
+ testentries(:W).each { |t| t.map{|k, v| puts "#{k}: - #{v}"}}
+ testentries(:E).each { |t| t.map{|k, v| puts "#{k}: - #{v}"}}
+ testentries(:F).each { |t| t.map{|k, v| puts "#{k}: - #{v}"}}
+ puts ""
+ if @fails > 0
+ puts "NAK: #{url} had #{@fails} errors"
+ else
+ puts "OK: #{url} passed all the tests"
+ end
+ puts ""
+end
diff --git a/www/members/download_check.cgi b/www/members/download_check.cgi
new file mode 100755
index 0000000..3d57939
--- /dev/null
+++ b/www/members/download_check.cgi
@@ -0,0 +1,99 @@
+#!/usr/bin/env ruby
+PAGETITLE = "ASF Download Page Checker - BETA"
+$LOAD_PATH.unshift '/srv/whimsy/lib'
+require 'wunderbar'
+require 'wunderbar/bootstrap'
+require 'whimsy/asf'
+require_relative '../../tools/download_check'
+
+_html do
+ _body? do
+ _whimsy_body(
+ title: PAGETITLE,
+ related: {
+ 'https://www.apache.org/legal/release-policy.html#release-announcements' => 'Release announcements',
+ 'https://www.apache.org/dev/release-distribution#download-links' => 'Download links and cryptographic files',
+ 'https://www.apache.org/dev/release-download-pages.html#download-page' => 'KEYS file and download verification',
+ 'https://www.apache.org/dev/release-distribution#sigs-and-sums' => 'MD5 and SHA1 are deprecated',
+ },
+ helpblock: -> {
+ _p do
+ _b '*** BETA ***'
+ end
+ _p 'This page can be used to check that an Apache download page has been set up correctly.'
+ _p do
+          _ 'The download page is checked for the following:'
+ _ul do
+          _li 'Does not link to dist.apache.org'
+          _li 'Does not link to repository.apache.org'
+          _li 'Has a link to the KEYS file'
+          _li 'References the need to verify downloads'
+          _li 'If a gpg verify example is given, it should include the second parameter'
+          _li 'Each artifact has a signature and a hash, which should not be MD5 or SHA1'
+# _li 'If a version is specified, there must be an artifact link with that version'
+ end
+        _p 'If any errors are found, no further checks are made unless "Always check links" is enabled.'
+        _p 'Links are checked using HTTP HEAD requests; links to the archive server are only checked if "Check links to archive server" is selected.'
+ end
+ }
+ ) do
+ _whimsy_panel('Check Download page', style: 'panel-success') do
+ _form.form_horizontal method: 'post' do
+ _div.form_group do
+ _label.control_label.col_sm_2 'Page URL', for: 'url'
+ _div.col_sm_10 do
+ _input.form_control.name name: 'url', required: true,
+ value: ENV['QUERY_STRING'],
+ placeholder: 'download URL',
+ size: 50
+ end
+ end
+# _div.form_group do
+# _label.control_label.col_sm_2 'Version to check', for: 'version'
+# _div.col_sm_10 do
+# _input.form_control.name name: 'version', required: false,
+# placeholder: 'optional version to check',
+# size: 50
+# end
+# end
+ _div.form_group do
+ _label.control_label.col_sm_2 'Always check links', for: 'checklinks'
+ _div.col_sm_10 do
+ _input name: 'checklinks', type: 'checkbox', value: 'true', checked: false
+ end
+ end
+ _div.form_group do
+ _label.control_label.col_sm_2 'Never check links', for: 'nochecklinks'
+ _div.col_sm_10 do
+ _input name: 'nochecklinks', type: 'checkbox', value: 'true', checked: false
+ end
+ end
+ _div.form_group do
+ _label.control_label.col_sm_2 'Check links to archive server', for: 'archivecheck'
+ _div.col_sm_10 do
+ _input name: 'archivecheck', type: 'checkbox', value: 'true', checked: false
+ end
+ end
+ _div.form_group do
+ _div.col_sm_offset_2.col_sm_10 do
+ _input.btn.btn_default type: 'submit', value: 'Check Page'
+ end
+ end
+ end
+ end
+ _div.well.well_lg do
+ if _.post?
+ doPost(
+ {
+ url: @url,
+ version: '', # TODO @version when implemented
+ checklinks: @checklinks == 'true',
+ nochecklinks: @nochecklinks == 'true',
+ archivecheck: @archivecheck == 'true'
+ })
+ end
+ end
+ end
+ end
+end
+