blob: 5153d94ed5ed1d84fd6ea765f5e06c59bd08b2c0 [file] [log] [blame]
#!/usr/bin/env ruby
=begin
Checks a mirror URL for compliance with ASF mirroring guidelines.
Derived partly from
https://svn.apache.org/repos/asf/infrastructure/site-tools/trunk/mirrors/check_mirror.pl
TODO this is a work in progress...
Ideally the causes of some of the problems should be reported ...
Note: the GUI interface is currently at www/members/mirror_check.cgi
=end
require 'wunderbar'
require 'net/http'
=begin
Checks performed: (F=fatal, E=error, W=warn)
F: zzz/time.txt is readable
F: its content is a number followed by text
W: test whether time is more than 1 day old
W: test its content-type (missing or text/plain)
F: BASE is readable and non-empty
W: has html + body headers and body/html trailers
W: body matches m!<(img|IMG) (src|SRC)="/icons/!
E: check index against TLP list m!> ?$dir/?<!
E: tlp dir: check can be read (mirrors sometimes have incorrect protections)
W: 'favicon.ico' and 'zzz/' must both be in page
W: favicon.ico must appear after zzz/ to show folders first
E: 'harmony/' should generate 404
E: 'zzz/___' should generate 404
W: 'zzz/README' content-type text/plain
E: header must match /<h\d>Apache Software Foundation Distribution Meta-Directory</h\d>/
E: footer must match /This directory contains meta-data for the ASF mirroring system./
E: mirror-tests/ must exist
W: its files must not have content-encoding:
1mb.img.7z 1mb.img.bz2 1mb.img.tar.gz 1mb.img.tgz 1mb.img.zip
W: zzz/mirror-tests/redirect-test/ should redirect to http://www.apache.org/ (302)
TODO - any more checks?
=end
# Safe level used while the checks run; the fetch helpers in this file
# untaint the URL before using it.
$SAFE = 1

# Accepted syntax for the mirror base URL: http(s) scheme, a host and an
# optional path, always ending in '/'.
# Anchored with \A and \z so the WHOLE string has to match; ^ and $ only
# anchor a single line and would accept input with an embedded newline.
URLMATCH = %r!\Ahttps?://[^/]+/(\S+/)?\z!i
HTTPDIRS = %w(zzz/ zzz/mirror-tests/).freeze # must exist
HDRMATCH = %r!<h\d>Apache Software Foundation Distribution Meta-Directory</h\d>! # must be on the zzz index page
FTRMATCH = %r!This directory contains meta-data for the ASF mirroring system.! # must be on the zzz index page
# Loose detection of the opening (<html ...> ... <body>) and closing
# (</body> ... </html>) markup of an index page
HASHDR = %r!<html( [^>]+)?>.+?<body>!im
HASFTR = %r!</body>.*?</html>!im
HTTPDIR = 'zzz/'.freeze # must appear in index page
HTTP404 = 'zzz/___'.freeze # Non-existent URL; should generate 404
HTTPTEXT = 'zzz/README'.freeze # text file (without extension) should generate Content-Type text/plain or none
MIRRORTEST = 'zzz/mirror-tests/'.freeze
MIRRORTEST_FILES = %w(1mb.img.7z 1mb.img.bz2 1mb.img.tar.gz 1mb.img.tgz 1mb.img.zip).freeze # no Content-Encoding !
# Record the outcome of one check.
# severity - one of :F, :E, :W or :I
# txt      - human readable description of the outcome
# Every outcome is appended to @tests; everything except :I (info) also
# bumps the @fails counter.
def test(severity, txt)
  @tests.push({severity => txt})
  @fails += 1 if severity != :I
end
# Shorthand: record +txt+ with severity :F (fatal).
def F(txt)
  test :F, txt
end
# Shorthand: record +txt+ with severity :E (error).
def E(txt)
  test :E, txt
end
# Shorthand: record +txt+ with severity :W (warning).
def W(txt)
  test :W, txt
end
# Shorthand: record +txt+ with severity :I (info, i.e. a passed check).
def I(txt)
  test :I, txt
end
# Collect the texts of all recorded results whose severity is +k+,
# in the order they were recorded. Entries without that key (or with a
# nil text) are left out.
def tests(k)
  @tests.each_with_object([]) do |entry, found|
    text = entry[k]
    found << text unless text.nil?
  end
end
# Issue an HTTP HEAD request for +url+ and return the response object.
# url - absolute http:// or https:// URL (String)
# TLS is enabled when the scheme is https. Connection and parse errors are
# not rescued here and propagate to the caller.
def getHTTPHdrs(url)
  # Taint tracking was removed in Ruby 3.2, where calling untaint raises
  # NoMethodError; only untaint on interpreters that still provide it.
  url.untaint if url.respond_to?(:untaint)
  uri = URI.parse(url)
  http = Net::HTTP.new(uri.host, uri.port)
  http.use_ssl = uri.scheme == 'https'
  request = Net::HTTP::Head.new(uri.request_uri)
  http.request(request)
end
# Check that base+page answers with a redirect to the expected location.
# base, page       - concatenated to form the URL that is requested (HEAD)
# expectedLocation - required value of the Location header
# severity         - severity used to record a failure; nil records nothing
# expectedStatus   - required HTTP status code, as a String
# log              - when true, a successful check is recorded as info
# Returns the response on success, nil otherwise.
def check_redirect(base, page, expectedLocation, severity=:W, expectedStatus = "302", log=true)
  path = base + page
  response = getHTTPHdrs(path)
  # Work out what (if anything) is wrong: status first, then location
  problem =
    if response.code != expectedStatus
      "HTTP status #{response.code} for '#{path}'"
    elsif response['location'] != expectedLocation
      "HTTP location #{response['location']} for '#{path}' - expected '#{expectedLocation}'"
    end
  if problem
    test(severity, problem) unless severity.nil?
    return nil
  end
  I "Fetched #{path} - redirected OK to #{response['location']}" if log
  response
end
# Report the Content-Type of base+page and warn if the server adds a
# Content-Encoding header to it.
# base, page     - concatenated to form the URL that is requested (HEAD)
# severity       - severity used to record an unexpected status; nil records nothing
# expectedStatus - required HTTP status code, as a String
# Returns nil when the status does not match.
def check_CT(base, page, severity=:E, expectedStatus = "200")
  target = base + page
  reply = getHTTPHdrs(target)
  unless reply.code == expectedStatus
    test(severity, "HTTP status #{reply.code} for '#{target}'") unless severity.nil?
    return nil
  end
  content_type = reply['Content-Type'] || 'unknown'
  content_encoding = reply['Content-Encoding']
  # TODO also check CT - some mirrors return text/plain for img??
  return I("Checking #{target} - Content-Type: #{content_type}") unless content_encoding
  W "Checking #{target} - Content-Type: #{content_type} WARN: Content-Encoding: #{content_encoding}"
end
# Issue an HTTP GET request for +url+ and return the response object
# (status, headers and body).
# url - absolute http:// or https:// URL (String)
# TLS is enabled when the scheme is https. Connection and parse errors are
# not rescued here and propagate to the caller.
def getHTTP(url)
  # Taint tracking was removed in Ruby 3.2, where calling untaint raises
  # NoMethodError; only untaint on interpreters that still provide it.
  url.untaint if url.respond_to?(:untaint)
  uri = URI.parse(url)
  http = Net::HTTP.new(uri.host, uri.port)
  http.use_ssl = uri.scheme == 'https'
  request = Net::HTTP::Get.new(uri.request_uri)
  http.request(request)
end
# Fetch base+page with GET and verify the HTTP status.
# base, page     - concatenated to form the URL that is requested
# severity       - severity used to record an unexpected status; nil records nothing
# expectedStatus - required HTTP status code, as a String
# log            - when true, a successful fetch is recorded as info
# Returns the response body when the status matches, otherwise nil.
def check_page(base, page, severity=:E, expectedStatus="200", log=true)
  target = base + page
  reply = getHTTP(target)
  status = reply.code || '?'
  if status == expectedStatus
    I "Fetched #{target} - OK" if log
    reply.body
  else
    test(severity, "Fetched #{target} - HTTP status: #{status} expected: #{expectedStatus}") unless severity.nil?
    nil
  end
end
# Compare the entries of a mirror index page with the ASF reference list
# of the same kind and record a warning for every difference.
# page - HTML of the mirror's index page, or nil if it could not be fetched
# type - key into @pages selecting the reference list (:tlps or :podlings)
# For :tlps it also checks that 'zzz' is listed before 'favicon.ico',
# i.e. that folders are shown before files.
def checkIndex(page, type)
  # Nothing to compare when the page was not fetched; without this guard
  # the parser would be handed nil and raise.
  return if page.nil?
  asfData = @pages[type]
  links = parseIndexPage(page)
  if type == :tlps
    fav = links.index('favicon.ico')
    zzz = links.index('zzz')
    if fav && zzz
      if fav < zzz
        W "Index for #{type}: incorrect #{type} page order - found favicon.ico before zzz/; folders should be listed before files"
      else
        I "Index for #{type}: found favicon.ico and zzz/ in the page in the correct order (i.e. folders are listed before files)"
      end
    else
      W "Index for #{type}: expected to find favicon.ico #{fav} and zzz/ #{zzz} in the page, but at least one is missing"
    end
  end
  # Entries the mirror has but the ASF site does not
  links.each do |l|
    W "Index for #{type}: the link #{l} is not shown on ASF site" unless asfData.include?(l)
  end
  # Entries the ASF site has but the mirror does not; 'openoffice' is
  # allowed to be missing
  asfData.each do |l|
    next if l == 'openoffice' || links.include?(l)
    W "Index for #{type}: the link #{l} is not shown on the mirror site"
  end
end
# nginx <tr><td><a href="activemq/" title="activemq">activemq/</a></td><td>-</td><td>2019-Nov-25 18:00</td></tr>
# ASF <tr><td valign="top"><img src="/icons/folder.gif" alt="[DIR]"></td><td><a href="accumulo/">accumulo/</a></td><td align="right">2019-08-07 23:42 </td><td align="right"> - </td><td>&nbsp;</td></tr>
# Parse an HTTP server index page into the list of file/folder names it
# links to.
# page - HTML of the index page; nil (page could not be fetched) is
#        treated like an empty page
# Returns an Array of names without the trailing '/'. Only links whose
# href and link text name the same entry are included.
def parseIndexPage(page)
  return [] if page.nil?
  # ASF main page references currently look like this: <a href="abdera/">abdera/</a>
  # the Perl script looked for this match: m!> ?$dir/?<!
  links = page.scan(%r{<a href=['"]([.a-z0-9-]+)/?['"](?: title=['"][.a-z0-9-]+/?['"])?>([.a-z0-9-]+)/?</a>})
  links.select { |href, text| href == text }.map { |_href, text| text }
end
# Record whether a page has both its opening (<html>...<body>) and closing
# (</body>...</html>) markup.
# path - URL of the page, used in the recorded message
# body - page content; nil is reported as having neither part
def checkHdrFtr(path, body)
  header_found = !HASHDR.match(body).nil?
  footer_found = !HASFTR.match(body).nil?
  case [header_found, footer_found]
  when [true, true]
    I "#{path} has header and footer"
  when [true, false]
    W "#{path} is incomplete - no footer found"
  when [false, true]
    W "#{path} is incomplete - no header found"
  else
    W "#{path} is incomplete - no header or footer found"
  end
end
# Suite: perform all the HTTP checks against one mirror.
# base - mirror URL as entered by the user; surrounding whitespace, a
#        missing trailing '/' and a missing scheme are tolerated
# Every outcome is recorded through F/E/W/I. Returns early (nil) when the
# URL is invalid or zzz/time.txt cannot be read.
def checkHTTP(base)
  # We don't check the pattern on the form for two reasons:
  # - not all browsers support it
  # - allows the input to be more flexible
  # Fix up the URL (on a copy, so the caller's string is never modified
  # and a frozen string is accepted)
  base = base.strip
  base += '/' unless base.end_with? '/'
  # Only a real scheme prefix counts; a bare host such as 'httpd.example.org'
  # also starts with 'http' but still needs the scheme added
  base = 'http://' + base unless base =~ %r{\Ahttps?://}i
  # Now check the syntax:
  I "Checking #{base} ..."
  unless URLMATCH.match(base)
    F "Invalid URL syntax: #{base}"
    return
  end
  setup
  response = getHTTPHdrs(base)
  server = response['server']
  if server =~ /Apache/
    I "Server: #{server}"
  else
    W "Server: '#{server}' - expected 'Apache' in server response"
  end
  # Check the mirror time (and that zzz/ is readable)
  time = check_page(base, 'zzz/time.txt', :F)
  return unless time # cannot process further (the error is already recorded)
  match = /^(\d+) \S+$/.match(time)
  if match
    now = Time.now.to_i
    stamp = match[1].to_i
    age = (now - stamp)/60 # minutes
    if age > 60*24
      W "Mirror is over 1 day old: #{age} minutes"
    else
      I "Mirror is less than 1 day old: #{age} minutes"
    end
  else
    F "Invalid time.txt contents: #{time}"
  end
  # check the main body
  body = check_page(base, '')
  checkHdrFtr(base, body)
  if %r{<(img|IMG) (src|SRC)="/icons/}.match(body)
    I "Index page has icons as expected"
  else
    W "Missing or unexpected img icon tags"
  end
  # The index comparison needs page content; skip it when the fetch failed
  checkIndex(body, :tlps) if body
  ibody = check_page(base, 'incubator/')
  checkHdrFtr(base+'incubator/', ibody)
  checkIndex(ibody, :podlings) if ibody
  check_page(base, 'harmony/', :E, "404")
  zbody = check_page(base, HTTPDIR)
  # Not sure this is useful on its own anymore
  # It was originally used to detect sites with advertising wrappers,
  # but most recent examples have been tables around directory listings
  # which is obviously OK as it does not affect the user experience.
  # if %r{<table}i.match(zbody)
  # W "#{HTTPDIR} - TABLE detected"
  # else
  # I "#{HTTPDIR} - No TABLE detected, OK"
  # end
  checkHdrFtr(base+HTTPDIR, zbody)
  if HDRMATCH.match(zbody)
    I "Index page for #{HTTPDIR} contains the expected header text"
  else
    W "Index page for #{HTTPDIR} does not contain the expected header text"
  end
  if FTRMATCH.match(zbody)
    I "Index page for #{HTTPDIR} contains the expected footer text"
  else
    W "Index page for #{HTTPDIR} does not contain the expected footer text"
  end
  check_page(base, HTTP404, :E, "404")
  # Check that archives don't have Content-Encoding
  MIRRORTEST_FILES.each do |file|
    check_CT(base, MIRRORTEST + file)
  end
  check_redirect(base, 'zzz/mirror-tests/redirect-test/xyz', 'http://www.apache.org/')
end
# Reset the state shared by all checks before a new run.
def init
  @tests = Array.new # one {severity => text} hash per recorded result
  @fails = 0         # number of recorded results that are not :I
end
# Fetch the reference listings from downloads.apache.org and store the
# parsed names in @pages (:tlps for the top level, :podlings for
# incubator/). A failed fetch is recorded as fatal by check_page and
# yields an empty list here instead of passing nil to the parser.
def setup
  asf = 'https://downloads.apache.org/'
  # successful reference fetches are not logged (last argument false)
  tlps = parseIndexPage(check_page(asf, '', :F, "200", false) || '')
  podlings = parseIndexPage(check_page(asf + 'incubator/', '', :F, "200", false) || '')
  @pages = {:tlps => tlps, :podlings => podlings}
end
# Render +list+ as a bulleted list under the heading +header+.
# Nothing at all is emitted for an empty list.
def showList(list, header)
  return if list.empty?
  _h2_ header
  _ul do
    list.each do |entry|
      _li entry
    end
  end
end
# Render the recorded results as HTML: a headline reflecting the worst
# severity seen, the fatal/error/warning lists with a pointer to the
# mirror configuration instructions (only when something failed), and
# finally the complete list of tests performed.
# NOTE(review): @url is not assigned anywhere in the code shown here;
# presumably the caller sets it - confirm.
def display
  fatals, errors, warns = [:F, :E, :W].map { |severity| tests(severity) }
  if fatals.size > 0
    _h2_.bg_danger "The mirror at #@url failed our checks:"
  elsif errors.size > 0
    _h2_.bg_warning "The mirror at #@url has some problems:"
  elsif warns.size > 0
    _h2_.bg_warning "The mirror at #@url has some minor issues"
  else
    _h2_.bg_success "The mirror at #@url looks OK, thanks for using this service"
  end
  if @fails > 0
    [[fatals, "Fatal errors:"], [errors, "Errors:"], [warns, "Warnings:"]].each do |items, heading|
      showList(items, heading)
    end
    # Cannot easily copy/paste URLs; use layout suitable for copy/paste into e.g. JIRA issue/e-mail
    _p do
      _ 'Please see the Apache mirror configuration instructions [1] for further details on configuring your mirror server.'
    end
    _p do
      _ '[1] '
      _a 'http://www.apache.org/info/how-to-mirror.html#Configuration', href: 'http://www.apache.org/info/how-to-mirror.html#Configuration'
    end
  end
  _h2_ 'Tests performed'
  _ol do
    @tests.each do |result|
      result.each { |severity, text| _li "#{severity}: - #{text}" }
    end
  end
  _h4_ 'F: fatal, E: Error, W: warning, I: info (success)'
end
# Entry point called by the GUI when the form is POSTed:
# reset the results, run every check against +url+, then render them.
def doPost(url)
  init            # start with an empty result list
  checkHTTP url   # run the checks, recording each outcome
  display         # render what was recorded
end
# Command-line use: check the mirror given as first argument and print
# every recorded result followed by a one-line summary.
if $PROGRAM_NAME == __FILE__
  $SAFE = 0
  init
  url = ARGV[0] || "localhost" # easier to test in an IDE
  checkHTTP(url+"") # allow url to be untainted later
  # display the test results
  @tests.each do |result|
    result.each { |severity, text| puts "#{severity}: - #{text}" }
  end
  puts(@fails > 0 ? "#{url} had #{@fails} errors" : "#{url} passed all the tests")
end