#!/usr/bin/env ruby
# Scans Apache project homepages and captures text|urls for common links
# Gathers data that can be used to check for policy compliance:
# https://www.apache.org/foundation/marks/pmcs#navigation
# http://www.apache.org/events/README.txt
# See Also: lib/whimsy/sitestandards.rb
#
# Makes no value judgements. Simply extracts raw data for offline analysis.
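# Output is a JSON hash of committee/podling name => captured check data (see parse below)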
$LOAD_PATH.unshift '/srv/whimsy/lib'
require 'net/http'
require 'nokogiri'
require 'json'
require 'whimsy/asf'
require 'whimsy/cache'
require 'whimsy/sitestandards'
require_relative 'asf-site-check'
$stdout.sync = true
# Normalize spaces in text runs
def squash(text)
  return text.scrub.gsub(/[[:space:]]+/, ' ').strip
end
# Get text from a node; use the parent node if the text does not appear to be complete
# This is used when scanning for text that may be split across an image
# or other related nodes commonly found on websites
def getText(txt, node, match=/Apache Software Foundation/i)
  parent = nil # flag showing whether the parent text had to be fetched (for debugging)
  if txt !~ match # have we got all the text?
    if node.parent.name == 'a' # e.g. whimsical; such parents don't have extra text
      newnode = node.parent.parent
    else
      newnode = node.parent
    end
    # ensure <br> is treated as a separator when extracting the combined text
    newnode.css('br').each { |br| br.replace(" ") }
    txt = squash(newnode.text)
    parent = true
  end
  return txt, parent
end
# Helper for multiple event links: keep the first value found
# TODO should we show them all?
def save_events(data, value)
  prev = data[:events]
  if prev and prev != value
    puts "Events: already have '#{prev}', not storing '#{value}'"
  else
    data[:events] = value
  end
end
# Extract link text, skipping invisible content (assumed to be in a span whose class ends with 'sr-only')
def get_link_text(anode)
  bits = []
  anode.traverse do |node|
    if node.name == 'text'
      bits << node.text unless node.parent.name == 'span' and node.parent.attribute('class')&.value&.end_with? 'sr-only'
    end
  end
  bits.join(' ')
end
# Parse an Apache project website and return text|urls that match our checks
# @return Hash keyed by check symbol: the text|url found for each check
# @see SiteStandards for definitions of what we should scan for (in general)
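# Also sets :display_name, :uri, :errors (on failure), :image, :resources,
# plus :tradeparent/:copyparent debug flags when the parent node had to be used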
def parse(id, site, name)
  data = {}
  # force https to avoid issues with the cache (sites should use https anyway)
  site.sub!(%r{^http:}, 'https:')
  SiteStandards::COMMON_CHECKS.each_key do |k|
    data[k.to_sym] = nil
  end
  data[:display_name] = name
  data[:uri] = site
  uri = URI.parse(site)
  begin
    Socket.getaddrinfo(uri.host, uri.scheme)
  rescue SocketError => se
    data[:errors] = se.message
    return data
  end
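  # $cache.get is expected to return the final (possibly redirected) URI,
  # the response body (or a response object on error), and a cache status string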
  begin
    uri, response, status = $cache.get(site.to_s)
  rescue IOError => ioe
    data[:errors] = ioe.message
    return data
  end
  puts "#{id} #{uri} #{status}"
  # Bail and return if getting the site returns an error code
  if response.respond_to? :code and response.code =~ /^[45]/
    data[:errors] = "cache.get(#{site}) error code #{response.code}"
    return data
  end
  doc = Nokogiri::HTML(response)
  if $saveparse
    file = File.join('/tmp', "site-scan_#{$$}.txt")
    File.write(file, doc.to_s)
    $stderr.puts "Wrote parsed input to #{file}"
  end
  data[:uri] = uri.to_s
  # FIRST: scan each link's a_href to see if we need to capture it
  # also capture script src for events
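  # (relative src/href values are resolved against the page URI via URI#+)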
  doc.traverse do |a|
    if a.name == 'script'
      a_src = a['src'].to_s.strip
      if a_src =~ SiteStandards::COMMON_CHECKS['events'][SiteStandards::CHECK_CAPTURE]
        save_events data, uri + a_src
      end
    end
    next unless a.name == 'a'
    # Normalize the text and href for our capture purposes
    a_href = a['href'].to_s.strip
    a_text = get_link_text(a) # Not down-cased yet
    $stderr.puts "#{a_text.inspect} #{a_href}" if $verbose
    # Check the href urls for some patterns
    if a_href =~ SiteStandards::COMMON_CHECKS['foundation'][SiteStandards::CHECK_CAPTURE]
      img = a.at('img')
      if img
        # use the title (hover text) in preference to the source
        data[:foundation] = img['title'] ? squash(img['title']) : uri + img['src'].strip
      else
        data[:foundation] = squash(a_text)
      end
    end
    if a_href =~ SiteStandards::COMMON_CHECKS['events'][SiteStandards::CHECK_CAPTURE]
      # Hack to ignore hidden links on main site
      save_events data, uri + a_href unless a['class'] == 'visible-home' and uri.path != '/'
    end
    # Check the a_text strings for other patterns
    a_text = a_text.downcase.strip # needs to be downcased here
    # Note: this is an unusual case - both the link text and the href must match
    if (a_text =~ SiteStandards::COMMON_CHECKS['license'][SiteStandards::CHECK_TEXT]) and
       (a_href =~ SiteStandards::COMMON_CHECKS['license'][SiteStandards::CHECK_CAPTURE])
      begin
        data[:license] = uri + a_href
      rescue StandardError
        data[:license] = a_href
      end
    end
    %w(thanks security sponsorship privacy).each do |check|
      if a_text =~ SiteStandards::COMMON_CHECKS[check][SiteStandards::CHECK_CAPTURE]
        begin
          data[check.to_sym] = uri + a_href
        rescue StandardError
          data[check.to_sym] = a_href
        end
      end
    end
  end
  # SECOND: scan each text node to match and capture
  doc.traverse do |node|
    next unless node.is_a?(Nokogiri::XML::Text)
    txt = squash(node.text)
    # allow a later match to override an earlier one if the phrase looks like the full trademark sentence
    if (txt =~ SiteStandards::COMMON_CHECKS['trademarks'][SiteStandards::CHECK_CAPTURE] and not data[:trademarks]) or
       txt =~ /are trademarks of [Tt]he Apache Software/
      t, p = getText(txt, node)
      # drop preceding text if it looks like a Copyright sentence
      data[:trademarks] = t.sub(/^.*?Copyright .+? Foundation[.]?/, '').strip
      data[:tradeparent] = p if p
    end
    if txt =~ SiteStandards::COMMON_CHECKS['copyright'][SiteStandards::CHECK_CAPTURE]
      t, p = getText(txt, node)
      # drop text around the Copyright (or ©) sentence
      data[:copyright] = t.sub(/^.*?((Copyright|©) .+? Foundation[.]?).*/, '\1').strip
      data[:copyparent] = p if p
    end
    # Note: we also check for the incubator disclaimer (regardless of whether the site is a TLP or a podling)
    if txt =~ SiteStandards::PODLING_CHECKS['disclaimer'][SiteStandards::CHECK_CAPTURE]
      t, _p = getText(txt, node, / is an effort undergoing/)
      data[:disclaimer] = t
    end
  end
  # THIRD: see if an image has been uploaded
  data[:image] = ASF::SiteImage.find(id)
  # Check for resource loading from non-ASF domains
  if $skipresourcecheck
    data[:resources] = "Not checked"
  else
    cmd = ['node', '/srv/whimsy/tools/scan-page.js', site]
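    # scan-page.js is expected to print, one per line, the URL of each resource the page loads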
    out, err, status = exec_with_timeout(cmd, 60)
    if status
      ext_urls = out.split("\n").reject {|x| ASFDOMAIN.asfhost? x}.tally
      resources = ext_urls.values.sum
      data[:resources] = "Found #{resources} external resources: #{ext_urls}"
    else
      data[:resources] = err
    end
  end
  # TODO: does not find js references such as:
  # ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
  return data
end
require 'timeout'
# the node script appears to stall sometimes, so apply a timeout
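# Returns stdout, stderr, and a boolean success status; on timeout or failure,
# stderr is replaced with a short reason ('Timeout' or 'Scanning failed')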
def exec_with_timeout(cmd, timeout)
  begin
    # stdout, stderr pipes
    rout, wout = IO.pipe
    rerr, werr = IO.pipe
    stdout = stderr = nil
    status = false
    pid = Process.spawn(*cmd, pgroup: true, out: wout, err: werr)
    Timeout.timeout(timeout) do
      Process.waitpid(pid)
      status = $?.success?
      # close write ends so we can read from them
      wout.close
      werr.close
      stdout = rout.readlines.join
      stderr = rerr.readlines.join
      unless status
        $stderr.puts "WARN: #{Time.now} failed scanning #{cmd} #{pid} #{stderr}"
        stderr = 'Scanning failed'
      end
    end
  rescue Timeout::Error
    # Try to determine why the kill does not tidy the chrome processes
    # Also whether a kill was actually issued!
    puts "WARN: timeout scanning #{cmd[-1]} #{pid}"
    $stderr.puts "WARN: #{Time.now} timeout scanning #{cmd[-1]} #{pid}"
    stderr = 'Timeout'
    ret = ''
    # Try to show process tree
    cmd = "ps -lfg #{$$}"
    begin
      $stderr.puts "WARN: #{Time.now} #{cmd}:"
      $stderr.puts `#{cmd}`
      reaper = Process.detach(pid) # ensure the process is reaped
      # kill -pid responds with EINVAL - invalid argument
      $stderr.puts "WARN: #{Time.now} about to kill -15 #{pid}"
      ret = Process.kill(-15, pid) # SIGTERM
      $stderr.puts "WARN: #{Time.now} sent kill -15 #{pid} ret=#{ret}"
      thrd = reaper.join 30 # allow some time for process to exit
      if thrd # original process has finished
        $stderr.puts "WARN: #{Time.now} process completed #{thrd.value}"
      else # not yet finished, try a stronger kill
        $stderr.puts "WARN: #{Time.now} about to kill -9 #{pid}"
        ret = Process.kill(-9, pid) # SIGKILL
        $stderr.puts "WARN: #{Time.now} sent kill -9 #{pid} ret=#{ret}"
        thrd = reaper.join 5 # allow some time for process to exit
        if thrd
          $stderr.puts "WARN: #{Time.now} process completed #{thrd.value}"
        else
          $stderr.puts "ERROR: #{Time.now} failed to kill -9 #{pid}"
        end
      end
    rescue StandardError => e
      $stderr.puts "WARN: #{Time.now} ret=#{ret} exception: #{e}"
    end
    $stderr.puts "WARN: #{Time.now} #{cmd}:"
    $stderr.puts `#{cmd}`
  ensure
    wout.close unless wout.closed?
    werr.close unless werr.closed?
    # dispose of the read ends of the pipes
    rout.close
    rerr.close
  end
  return stdout, stderr, status
end
#########################################################################
# Main execution begins here
results = {}
podlings = {}
$cache = Cache.new(dir: 'site-scan')
$verbose = ARGV.delete '--verbose'
$saveparse = ARGV.delete '--saveparse'
$skipresourcecheck = ARGV.delete '--noresource'
puts "Started: #{Time.now}" # must agree with site-scan monitor
# USAGE:
# site-scan.rb https://whimsical.apache.org [Whimsy] [whimsy-scan.json] - to scan one project
# site-scan.rb [project-output.json] [podlings-output.json] [projname podlingname ...]
# If additional projname|podlingname arguments are provided, only those sites are scanned
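# With no arguments, all committee and current podling sites are scanned;
# when no output filenames are given, results are printed to stdout as JSON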
if ARGV.first =~ /^https?:\/\/\w/
  # Scan a single URL provided by user
  site = ARGV.shift
  name = ARGV.shift || site[/\/(\w[^.]*)/, 1].capitalize
  output_projects = ARGV.shift
  results[name] = parse(name, site, name)
else
  # Gather output filenames (if any) and scan various projects
  if ARGV.first =~ %r{[./]} # have we a file name?
    output_projects = ARGV.shift
    if ARGV.first =~ %r{[./]} # have we another file name?
      output_podlings = ARGV.shift
    else
      output_podlings = nil
    end
  else
    output_projects = nil
  end
  # Scan committees, including non-pmcs
  ASF::Committee.load_committee_info
  committees = (ASF::Committee.pmcs + ASF::Committee.nonpmcs).uniq
  committees.sort_by {|committee| committee.name}.each do |committee|
    next unless committee.site
    # if more parameters specified, parse only those names
    if ARGV.length > 0
      next unless ARGV.include? committee.name
    end
    results[committee.name] = parse(committee.name, committee.site, committee.display_name)
  end
  # Scan podlings that have a website
  ASF::Podling.list.sort_by(&:name).each do |podling|
    if podling.status == 'current' and podling.podlingStatus[:website]
      # if more parameters specified, parse only those names
      if ARGV.length > 0
        next unless ARGV.include? podling.name
      end
      podlings[podling.name] = parse(podling.name, podling.podlingStatus[:website], podling.display_name)
    end
  end
end
# Output all results
if output_projects
  File.write(output_projects, JSON.pretty_generate(results))
else
  puts JSON.pretty_generate(results)
end
if output_podlings
  File.write(output_podlings, JSON.pretty_generate(podlings))
else
  puts JSON.pretty_generate(podlings)
end
puts "Ended: #{Time.now}" # must agree with site-scan monitor