#!/usr/bin/env ruby

=begin
Checks a download page URL for compliance with ASF guidelines.


Note: the GUI interface is currently at www/members/download_check.cgi

=end

require 'wunderbar'
require 'net/http'
require 'nokogiri'
require 'time'

=begin
Checks performed: (F=fatal, E=error, W=warn)
TBA
=end

# NOTE(review): $SAFE taint levels were deprecated in Ruby 2.7 and have no
# effect from Ruby 3.0 - confirm which Ruby versions this script must support.
$SAFE = 1

# Set to true by the command-line section at the end of the file;
# enables extra console output (fetch confirmations, exception backtraces).
$CLI = false
# Gates the ">> HEAD"/">> GET" trace lines.
# NOTE(review): $VERBOSE is also Ruby's own warning-level switch, so this
# assignment changes interpreter warnings as well - confirm that is intended.
$VERBOSE = false

$ARCHIVE_CHECK = false # in this file only reported by init; not consulted by the checks themselves
$ALWAYS_CHECK_LINKS = false # follow links even when the page itself has errors
$NO_CHECK_LINKS = false # never follow links (init then sets $NOFOLLOW)
$NOFOLLOW = false # may be reset

$VERSION = nil # not read anywhere in this file

# match an artifact
# Capture 1 is the file name (the last path component),
# capture 2 is its extension without the leading dot (e.g. "tar.gz").
# TODO detect artifacts by URL as well if possible
ARTIFACT_RE = %r{/([^/]+\.(tar|tar\.gz|zip|tgz|tar\.bz2|jar|war|msi|rar|rpm|nar))$}

# Reset the recorded results and log which link-checking mode is in effect.
# Side effect: sets $NOFOLLOW when $NO_CHECK_LINKS is set.
def init
  @tests = [] # every recorded result, as {severity => text}
  @fails = 0  # number of results that count as failures

  if $NO_CHECK_LINKS
    $NOFOLLOW = true
    I "Will not check links"
  elsif $ALWAYS_CHECK_LINKS
    I "Will check links even if download page has errors"
  else
    I "Will check links if download page has no errors"
  end

  archive_mode = $ARCHIVE_CHECK ? 'include' : 'not include'
  I format("Will %s archive.apache.org links in checks", archive_mode)
end

# Record one check result as {severity => txt}.
# Every severity other than :I (info) and :W (warning) counts as a failure.
def test(severity, txt)
  @tests << { severity => txt }
  @fails += 1 unless %i[I W].include?(severity)
end

# Record a fatal problem.
def F(txt)
  test :F, txt
end

# Record an error.
def E(txt)
  test :E, txt
end

# Record a warning (does not count as a failure).
def W(txt)
  test :W, txt
end

# Record an informational/success message (does not count as a failure).
def I(txt)
  test :I, txt
end

# Collect the message text of every recorded result whose severity is k.
def tests(k)
  @tests.each_with_object([]) do |entry, texts|
    text = entry[k]
    texts << text unless text.nil?
  end
end

# Return the recorded result hashes ({severity => text}) whose severity is k.
# Unlike tests(k) this keeps the whole entry, not just the text.
def testentries(k)
  @tests.find_all do |entry|
    entry[k]
  end
end

# Emit header as an <h2> followed by a bulleted list of the items;
# emits nothing at all when list is empty.
def showList(list, header)
  return if list.empty?

  _h2_ header
  _ul do
    list.each do |entry|
      _li entry
    end
  end
end

# Render the recorded results as HTML using the Wunderbar helpers:
# a coloured headline for the worst severity found, the fatal/error and
# warning lists, then the full list of results in the order recorded.
def displayHTML
  fatals, errors, warns = %i[F E W].map { |severity| tests(severity) }

  # index of the most severe non-empty list; nil when there is nothing to report
  worst = [fatals, errors, warns].index { |found| !found.empty? }
  case worst
  when 0
    _h2_.bg_danger "The page at #@url failed our checks:"
  when 1
    _h2_.bg_warning "The page at #@url has some problems:"
  when 2
    _h2_.bg_warning "The page at #@url has some minor issues"
  else
    _h2_.bg_success "The page at #@url looks OK, thanks for using this service"
  end

  if @fails > 0
    showList(fatals, "Fatal errors:")
    showList(errors, "Errors:")
  end

  showList(warns, "Warnings:")

  _h2_ 'Tests performed'
  _ol do
    @tests.each do |result|
      result.each { |severity, text| _li "#{severity}: - #{text}" }
    end
  end
  _h4_ 'F: fatal, E: Error, W: warning, I: info (success)'
end

# Issue an HTTP HEAD request for url and return the Net::HTTP response.
# A URL without a scheme (e.g. "//host/path") gets "http:" prepended and a
# warning is recorded. HTTPS is used when the scheme is 'https'.
def HEAD(url)
  puts ">> HEAD #{url}" if $VERBOSE
  # Object#untaint was removed in Ruby 3.2; only call it where it still exists
  url.untaint if url.respond_to?(:untaint)
  uri = URI.parse(url)
  unless uri.scheme
    W "No scheme for URL #{url}, assuming http"
    uri = URI.parse("http:"+url)
  end
  http = Net::HTTP.new(uri.host, uri.port)
  http.use_ssl = uri.scheme == 'https'
  request = Net::HTTP::Head.new(uri.request_uri)
  http.request(request)
end

# Issue an HTTP GET request for url and return the Net::HTTP response.
# A URL without a scheme (e.g. "//host/path") gets "http:" prepended and a
# warning is recorded. HTTPS is used when the scheme is 'https'.
def GET(url)
  puts ">> GET #{url}" if $VERBOSE
  # Object#untaint was removed in Ruby 3.2; fall back to the object itself there
  clean = lambda { |obj| obj.respond_to?(:untaint) ? obj.untaint : obj }
  clean.call(url)
  uri = clean.call(URI.parse(url))
  unless uri.scheme
    W "No scheme for URL #{url}, assuming http"
    uri = clean.call(URI.parse("http:"+url))
  end
  http = Net::HTTP.new(uri.host, uri.port)
  http.use_ssl = uri.scheme == 'https'
  request = Net::HTTP::Get.new(uri.request_uri)
  http.request(clean.call(request))
end

# Check that a HEAD request for path answers with expectedStatus.
# A 403 is retried once (after recording a warning).
# Returns the response on success; otherwise records a result of the given
# severity (nothing when severity is nil) and returns nil.
# When log is true a success is recorded as an info message.
def check_head(path, severity = :E, expectedStatus = "200", log=true)
  reply = HEAD(path)
  status = reply.code || '?'

  if status == '403' # someone does not like Whimsy?
    W "HEAD #{path} - HTTP status: #{status} - retry"
    reply = HEAD(path)
    status = reply.code || '?'
  end

  if status == expectedStatus
    I "Checked HEAD #{path} - OK (#{status})" if log
    return reply
  end

  test(severity, "HEAD #{path} - HTTP status: #{status} expected: #{expectedStatus}") unless severity.nil?
  nil
end

# Fetch path with GET and compare the status with expectedStatus.
# On a mismatch a result of the given severity is recorded (nothing when
# severity is nil) and nil is returned.
# On a match the body is returned for a 200, the response object otherwise.
# When log is true a success is recorded as info; in CLI mode it is also printed.
def check_page(path, severity=:E, expectedStatus="200", log=true)
  reply = GET(path)
  status = reply.code || '?'

  unless status == expectedStatus
    test(severity, "Fetched #{path} - HTTP status: #{status} expected: #{expectedStatus}") unless severity.nil?
    return nil
  end

  summary = "Fetched #{path} - OK (#{status})"
  I summary if log
  puts summary if $CLI
  status == '200' ? reply.body : reply
end

# Check a closer/download link: the URL must answer with a 302 redirect whose
# target responds to HEAD with a content type and length.
# Records I/W/E results; returns early when a step fails.
def check_closer_down(url)
  # N.B. HEAD does not work; it returns success
  res = check_page(url, :E, "302", false)
  return unless res # check_page has already recorded the failure
  loc = res['location']
  unless loc
    E "No Location header in redirect from #{url}"
    return
  end
  res = check_head(loc, :E, "200", false)
  return unless res
  ct = res.content_type
  cl = res.content_length
  if ct and cl
    I "Checked #{url} OK - ct=#{ct} cl=#{cl}"
  elsif cl and cl > 0 # content_length is nil when the header is absent
    W "Possible issue with #{url} ct=#{ct} cl=#{cl}"
  else
    E "Problem with #{url} ct=#{ct} cl=#{cl}"
  end
end

# Validate the location of a hash/signature link for project tlp.
#
# Returns [host, stem, ext] when h is under apache.org/dist/[incubator/]tlp/:
#   host - 'www' or 'archive', or '' when the URL has no such host prefix
#   stem - file name without the final extension (e.g. "foo-1.0.tar.gz")
#   ext  - final extension without the dot (e.g. "sha512")
# An error is recorded when the link is not https.
# Returns nil (after recording an error) when h is not in the expected place.
def check_hash_loc(h,tlp)
  # tlp is escaped so that it is matched literally; \A..\z anchor the whole string
  pattern = %r{\A(https?)://(?:(archive|www)\.)?apache\.org/dist/(?:incubator/)?#{Regexp.escape(tlp.to_s)}/(?:.*/)?([^/]+)\.(\w{3,6})\z}
  found = pattern.match(h)
  unless found
    E "Unexpected hash location #{h} for #{tlp}"
    return nil
  end
  E "HTTPS! #{h}" unless found[1] == 'https'
  # '' (not nil) for a missing host prefix, so callers can tell it from "no match"
  [found[2] || '', found[3], found[4]]
end

# Parse body as HTML and return its absolute links as an Array of [href, text].
# Only hrefs starting with http://, https:// or // are kept; runs of
# whitespace in the link text are collapsed to a single space.
def get_links(body)
  anchors = Nokogiri::HTML(body).css('a[href]') # anchors that carry an href attribute
  pairs = anchors.map do |anchor|
    [anchor.attribute("href").to_s, anchor.text.gsub(/[[:space:]]+/,' ')]
  end
  pairs.select { |href, _text| href =~ %r{^(https?:)?//} }
end

# Phrases (or link fragments) that count as telling users to verify their
# download; the page passes when its text contains any one of them.
# Frozen so the shared list cannot be modified by accident.
VERIFY_TEXT = [
 'the integrity of the downloaded files',
 'verify the integrity', # commons has this as a link; perhaps try converting page to text only?
 'verify that checksums and signatures are correct',
 '#verifying-signature',
 'check that the download has completed OK',
 'You should verify your download',
 'downloads can be verified',
 'www.apache.org/info/verification.html',
 'verify your mirrored downloads',
 'verify your downloads',
 'All downloads should be verified',
 'verification instructions',
].freeze

# Link texts (lower-cased) that stand for a detached signature, mapped to the
# file extension they refer to. Frozen so the shared table cannot be
# modified by accident.
ALIASES = {
    'sig' => 'asc',
    'pgp' => 'asc',
    'signature' => 'asc',
    'pgp signature' => 'asc',
    'openpgp signature' => 'asc',
}.freeze
# Convert the text of a link to the file extension it stands for,
# e.g. SHA256 => sha256; [SIG] => asc
# Steps: lower-case, strip surrounding [ ], drop the first '-', drop
# ' checksum', then map signature aliases to 'asc'.
def text2ext(txt)
  key = txt.downcase
  key = key.sub(%r{^\[(.+)\]$},'\1')
  key = key.sub('-','')
  key = key.sub(' checksum','')
  ALIASES.fetch(key, key)
end

# Suite: perform all the HTTP checks
# Wraps _checkDownloadPage so that any exception is recorded as a fatal
# result instead of aborting; in CLI mode the exception and its backtrace
# are printed as well.
# N.B. rescues Exception (not just StandardError), so nothing raised by the
# checks escapes this method.
def checkDownloadPage(path, tlp, version)
  _checkDownloadPage(path.strip, tlp, version)
rescue Exception => failure
  F failure
  return unless $CLI

  p failure
  puts failure.backtrace
end

# Run every check against one download page, recording results via I/W/E.
#
# path    - page location; fetched over HTTP when it starts with 'http',
#           otherwise read as a local file (a leading '~' is replaced by ENV['HOME'])
# tlp     - project name as it appears under /dist/ in the expected URLs
# version - only used in the opening info message within this method
#
# Order of work: fetch the page; reject references to dist.apache.org and
# repository.apache.org; look for a KEYS link and for verification
# instructions; pair every artifact link with its signature/hash links; then,
# unless $NOFOLLOW is set, follow the links to check that they resolve.
def _checkDownloadPage(path, tlp, version)
  if version != ''
    I "Checking #{path} [#{tlp}] for version #{version} only ..."
  else
    I "Checking #{path} [#{tlp}] ..."
  end

  # check the main body
  if path.start_with? 'http'
    body = check_page(path)
  else
    # anything that does not start with 'http' is treated as a local file name
    file = path
    if file.start_with? '~'
      file = ENV['HOME'] + file[1..-1]
    end
    # NOTE(review): String#untaint no longer exists in Ruby 3.2+ - confirm the target Ruby version
    body = File.read(file.untaint)
  end
  
  return unless body # check_page returns nil (after recording an error) when the fetch fails

  if body.include? 'dist.apache.org'
    E 'Page must not link to dist.apache.org'
  else
    I 'Page does not reference dist.apache.org'
  end

  if body.include? 'repository.apache.org'
    E 'Page must not link to repository.apache.org'
  else
    I 'Page does not reference repository.apache.org'
  end

  # md5/sha hash files last modified before this date are tolerated (see the end of this method)
  deprecated = Time.parse('2018-01-01')
  
  links = get_links(body)
  
  # check KEYS link
  # TODO: is location used by hc allowed, e.g.
  #   https://www.apache.org/dist/httpcomponents/httpclient/KEYS
  # expurl is the human-readable form used in messages; expurlre is what is actually matched
  expurl = "https://[www.]apache.org/dist/[incubator/]#{tlp}/KEYS"
  expurlre = %r{^https://(www\.)?apache\.org/dist/(incubator/)?#{tlp}/KEYS$}
  keys = links.select{|h,v| h =~ expurlre}
  if keys.size >= 1
    # a link with the expected URL exists; only its text remains to be checked
    keytext = keys.first[1]
    if keytext.strip == 'KEYS'
        I 'Found KEYS link'
    else
        W "Found KEYS: '#{keytext}'"
    end
  else
    # no link with the expected URL: fall back to looking for a link by its text
    keys = links.select{|h,v| v.strip == 'KEYS' || v == 'KEYS file' || v == '[KEYS]'}
    if keys.size >= 1
      I 'Found KEYS link'
      keyurl = keys.first.first
      if keyurl =~ expurlre
        I "KEYS links to #{expurl} as expected"
      else
        # a KEYS file one directory below the project directory is only a warning
        if keyurl =~ %r{^https://www\.apache\.org/dist/#{tlp}/[^/]+/KEYS$}
          W "KEYS: expected: #{expurl}\n             actual: #{keyurl}"
        else
          E "KEYS: expected: #{expurl}\n             actual: #{keyurl}"
        end
      end
    else
      E 'Could not find KEYS link'
    end
  end
  
  # check for verify instructions
  bodytext = body.gsub(/\s+/,' ') # single line
  if VERIFY_TEXT.any? {|text| bodytext.include? text}
    I 'Found reference to download verification'
  else
    E 'Could not find statement of the need to verify downloads'
  end
  
  # Check if GPG verify has two parameters
  body.scan(%r{^.+gpg --verify.+$}){|m|
    unless m =~ %r{gpg --verify\s+\S+\.asc\s+\S+}
      W "gpg verify without second param: #{m.strip}"
    end
  }
  
  # check if page refers to md5sum
  body.scan(%r{^.+md5sum.+$}){|m|
    W "Found md5sum: #{m.strip}"
  }
  
  # Check archives have hash and sig
  # Relies on each artifact link appearing in the page before its sig/hash links
  vercheck = Hash.new() # key = archive name, value = array of hash/sig
  links.each do |h,t|
    # Must occur before mirror check below
    if h =~ %r{^https?://(?:archive|www)\.apache\.org/dist/(.+\.(asc|sha\d+|md5|sha))$}
        base = File.basename($1)
        ext = $2
        stem = base[0..-(2+ext.length)] # base without the trailing ".ext"
        if vercheck[stem]
          vercheck[stem] << ext
        else
          E "Bug: found hash for missing artifact #{stem}"
        end
        tmp = text2ext(t)
        next if ext == tmp # i.e. link is just the type or [TYPE]
        if not base == t and not t == 'checksum'
            E "Mismatch: #{h} and '#{t}'"
        end
    # These might also be direct links to mirrors
    elsif h =~ ARTIFACT_RE
        base = File.basename($1)
  #         puts "base: " + base
        if vercheck[base]  # might be two links to same archive
            W "Already seen link for #{base}"
        else
            vercheck[base] = []
        end
        # Text must include a '.' (So we don't check 'Source')
        if t.include?('.') and not base == t
          # text might be short version of link
          tmp = t.strip.sub(%r{.*/},'').downcase # drop any leading path and lower-case the text
          if base == tmp
            W "Mismatch?: #{h} and #{t}"
          elsif base.end_with? tmp
            W "Mismatch?: #{h} and '#{tmp}'"
          elsif base.sub(/-bin\.|-src\./,'.').end_with? tmp
            W "Mismatch?: #{h} and '#{tmp}'"
          else
            W "Mismatch2: #{h} and '#{tmp}'"
          end
        end        
    end
  end
  
  # did we find all required elements?
  # each artifact needs a signature (asc) plus at least one hash (shaNNN, md5 or sha)
  vercheck.each do |k,v|
    unless v.include? "asc" and v.any? {|e| e =~ /^sha\d+$/ or e == 'md5' or e == 'sha'}
      E "#{k} missing sig/hash: (found only: #{v.inspect})"
    end
  end

  # failures found on the page itself suppress link following unless $ALWAYS_CHECK_LINKS is set
  if @fails > 0 and not $ALWAYS_CHECK_LINKS
    W "** Not checking links **"
    $NOFOLLOW = true
  end

  # second pass over the links: classify each one and, where allowed, check that it resolves
  links.each do |h,t|
    if h =~ %r{\.(asc|sha256|sha512)$}
      host, stem, ext = check_hash_loc(h,tlp)
      if host == 'archive'
        I "Ignoring archive hash #{h}"
      elsif host
        if $NOFOLLOW
          # NOTE(review): message says 'archive' although this branch handles non-archive hosts
          I "Skipping archive hash #{h}"
        else
          check_head(h, :E, "200", true)
        end
      else
        # will have been reported by check_hash_loc
      end
    # mirror downloads need to be treated differently
    elsif h =~ %r{^https?://www.apache.org/dyn/.*action=download}
      if $NOFOLLOW
          I "Skipping download artifact #{h}"
      else
          check_closer_down(h)
      end
    elsif h =~ ARTIFACT_RE
      if $NOFOLLOW
        I "Skipping archive artifact #{h}"
        next
      end
      # $1/$2 still hold the ARTIFACT_RE captures from the elsif above;
      # they must be saved before the next regex match overwrites them
      name = $1
      ext = $2 # not used below
      if h =~ %r{https?://archive\.apache\.org/}
        I "Ignoring archive artifact #{h}"
        next
      end
      if h =~ %r{https?://(www\.)?apache\.org/dist}
        E "Must use mirror system #{h}"
        next
      end
      res = check_head(h, :E, "200", false)
      next unless res
      # if HEAD returns content_type and length it's probably a direct link
      ct = res.content_type
      cl = res.content_length
      if ct and cl
        I "#{h} OK: #{ct} #{cl}"
      else # need to try to download the mirror page
        path = nil # N.B. reuses (and overwrites) the method parameter
        bdy = check_page(h, :E, "200", false)
        if bdy
          lks = get_links(bdy)
          lks.each do |l,t|
             # Don't want to match archive server (closer.cgi defaults to it if file is not found)
             if l.end_with?(name) and l !~ %r{//archive\.apache\.org/}
                path = l
                break
             end
          end
        end
        if path
          res = check_head(path, :E, "200", false)
          next unless res
          ct = res.content_type
          cl = res.content_length
          if ct and cl
            I "OK: #{ct} #{cl} #{path}"
          elsif cl
            W "NAK: ct='#{ct}' cl='#{cl}' #{path}"
          else
            E "NAK: ct='#{ct}' cl='#{cl}' #{path}"
          end
        else
          E "Could not find link for #{name} in #{h}"
        end
      end
    # sha256/sha512 were handled by the first branch, so this catches md5 and the other sha* names
    elsif h =~ %r{\.(md5|sha.*)$}
      host,_,_ = check_hash_loc(h,tlp)
      if $NOFOLLOW
        I "Skipping deprecated hash #{h}"
        next
      end
      # NOTE(review): check_hash_loc yields nil (not '') when the URL has no
      # www/archive prefix, so the host == '' test looks unreachable - confirm
      if host == 'www' or host == ''
        res = check_head(h,:E, "200", false)
        next unless res
        lastmod = res['last-modified']
        date = Time.parse(lastmod)
        # Check if older than 2018?
        if date < deprecated
          I "Deprecated hash found #{h} #{t}; however #{lastmod} is older than #{deprecated}"
          # OK
        else
          W "Deprecated hash found #{h} #{t} - do not use for current releases #{lastmod}"
        end
      end
    elsif h =~ %r{/KEYS$} or t == 'KEYS'
      # already handled
    elsif h =~ %r{^https?://www\.apache\.org/?(licenses/.*|foundation/.*|events/.*)?$}
      # standard links
    elsif h =~ %r{https?://people.apache.org/phonebook.html}
      # phonebook link: nothing to check
    elsif h.start_with? 'https://cwiki.apache.org/confluence/'
      # Wiki
    elsif h.start_with? 'https://wiki.apache.org/'
      # Wiki
    elsif h.start_with? 'https://svn.apache.org/'
      #        E "Public download pages should not link to unreleased code: #{h}" # could be a sidebar/header link
    elsif h =~ %r{^https?://(archive|www)\.apache\.org/dist/}
      # any other /dist/ link: warn, except release notes and the project's own archive area
      W "Not yet handled #{h} #{t}" unless h =~ /RELEASE[-_]NOTES/ or h =~ %r{^https?://archive.apache.org/dist/#{tlp}/}
    else
      # Ignore everything else?
    end
  end

end

# Derive the project (TLP) name from a download page URL.
# For *.apache.org hosts (optionally .incubator/.us/.eu) it is the first host
# label, with a couple of known aliases mapped to the real project name;
# any *.openoffice.org host gives 'openoffice'.
# Returns nil, after recording a fatal result, when the URL is not recognised.
def getTLP(url)
  case url
  when %r{^https?://([^.]+)(\.incubator|\.us|\.eu)?\.apache\.org/}
    label = $1
    renamed = {
      'hc' => 'httpcomponents',
      'jspwiki-wiki' => 'jspwiki', # https://jspwiki-wiki.apache.org/Wiki.jsp?page=Downloads
    }
    renamed.fetch(label, label)
  when %r{^https?://([^.]+)\.openoffice\.org/}
    'openoffice'
  else
    F "Unknown TLP for URL #{url}"
    nil
  end
end

# Called by GUI when POST is pushed
# Copies the form options into the global flags, runs the checks and renders
# the result page. options is indexed with :checklinks, :nochecklinks,
# :archivecheck, :url, :tlp and :version.
# An empty :tlp is derived from the URL; the checks are skipped when no
# project name is available.
def doPost(options)
  $ALWAYS_CHECK_LINKS = options[:checklinks]
  $NO_CHECK_LINKS = options[:nochecklinks]
  $ARCHIVE_CHECK = options[:archivecheck]
  init # must run after the flags are set: it reports them and may set $NOFOLLOW

  url, tlp = options[:url], options[:tlp]
  tlp = getTLP(url) if tlp == ''
  checkDownloadPage(url, tlp, options[:version]) if tlp

  displayHTML
end


# Command-line entry point:
#   [--always] [--nolinks] [--archivecheck] URL [TLP [VERSION]]
# With only a URL the project name is derived from it.
if __FILE__ == $0
  $CLI = true
  $SAFE = 0
  $VERBOSE =true
  $ALWAYS_CHECK_LINKS = ARGV.delete '--always'
  $NO_CHECK_LINKS = ARGV.delete '--nolinks'
  $ARCHIVE_CHECK = ARGV.delete '--archivecheck'

  # without a URL there is nothing to check
  abort "Usage: #{$0} [--always] [--nolinks] [--archivecheck] URL [TLP [VERSION]]" if ARGV.empty?

  init

  url = ARGV[0]
  if ARGV.size == 1
    tlp = getTLP(url)
    version = ''
  else
    tlp = ARGV[1]
    version = ARGV[2] || ''
  end

  # getTLP records a fatal result and returns nil for an unrecognised URL;
  # as in doPost, the checks are skipped in that case
  checkDownloadPage(url, tlp, version) if tlp

  # display the test results as text
  show = lambda do |entries|
    entries.each do |entry|
      entry.each { |k, v| puts "#{k}: - #{v}" }
    end
  end

  puts ""
  puts "================="
  puts ""
  show.call(@tests)
  puts ""
  # repeat the problems, least severe first, so the worst end up last on screen
  show.call(testentries(:W))
  show.call(testentries(:E))
  show.call(testentries(:F))
  puts ""
  if @fails > 0
    puts "NAK: #{url} had #{@fails} errors"
  else
    puts "OK: #{url} passed all the tests"
  end
  puts ""
end
