blob: 1346ddcd07220ed70c065c8dae7e41dbe6a68691 [file] [log] [blame]
#!/usr/bin/env ruby
# Scans Apache project homepages and captures text|urls for common links
# Gathers data that can be used to check for policy compliance:
# https://www.apache.org/foundation/marks/pmcs#navigation
# http://www.apache.org/events/README.txt
# See Also: lib/whimsy/sitestandards.rb
#
# Makes no value judgements. Simply extracts raw data for offline analysis.
$LOAD_PATH.unshift '/srv/whimsy/lib'
require 'net/http'
require 'nokogiri'
require 'json'
require 'whimsy/asf'
require 'whimsy/cache'
require 'whimsy/sitestandards'
# Collapse every run of whitespace in +text+ into a single space and trim
# both ends. Invalid byte sequences are scrubbed first so the regexp
# substitution cannot fail on badly encoded input.
def squash(text)
  single_spaced = text.scrub.gsub(/[[:space:]]+/, ' ')
  single_spaced.strip
end
# Return the text to use for a node, widening to an enclosing node when the
# text passed in does not yet contain the expected phrase.
# This is used when scanning for some links that may
# be in an image or other commonly related node on websites
# @param txt text already extracted from +node+
# @param node the node that +txt+ came from
# @param match pattern that indicates +txt+ is already complete
# @return two values: the text, and true if an enclosing node had to be
#   used (nil otherwise; useful for debugging where that was needed)
def getText(txt, node, match=/Apache Software Foundation/i)
  # have we got all the text already?
  return [txt, nil] if txt =~ match

  # e.g. whimsical: a parent that is an <a> has no extra text, so go one higher
  container = node.parent.name == 'a' ? node.parent.parent : node.parent
  # ensure <br> is treated as a separator when extracting the combined text
  container.css('br').each { |br| br.replace(" ") }
  [squash(container.text), true]
end
# Join +link+ onto +base+ and return the result.
# If the join raises (for instance URI rejects a link that contains a space)
# the raw +link+ is returned instead, so one odd link cannot abort the scan.
def resolve_link(base, link)
  base + link
rescue StandardError
  link
end

# Parse an Apache project website and return text|urls that match our checks
# @param id identifier used in the progress line on stderr and for the
#   ASF::SiteImage lookup
# @param site URL of the page to fetch
# @param name stored in the result as :display_name
# @return Hash of symbols: text|url found from a check made.
#   When the site URL is malformed, its host does not resolve, the cache
#   raises IOError, or the response code starts with 4 or 5, :errors holds
#   the reason and no scan data is collected.
# @see SiteStandards for definitions of what we should scan for (in general)
def parse(id, site, name)
  data = {}
  SiteStandards::COMMON_CHECKS.keys.each do |k|
    data[k.to_sym] = nil
  end
  data[:display_name] = name
  data[:uri] = site

  # Check the URL is well-formed and the host resolves before fetching;
  # both failures are reported through :errors rather than raised
  begin
    uri = URI.parse(site)
    Socket.getaddrinfo(uri.host, uri.scheme)
  rescue URI::InvalidURIError, SocketError => e
    data[:errors] = e.message
    return data
  end

  begin
    uri, response, status = $cache.get(site.to_s)
  rescue IOError => ioe
    data[:errors] = ioe.message
    return data
  end
  $stderr.puts "#{id} #{uri} #{status}"

  # Bail and return if getting the site returns an error code
  if response.respond_to?(:code) && response.code =~ /^[45]/
    data[:errors] = "cache.get(#{site.to_s}) error code #{response.code}"
    return data
  end

  doc = Nokogiri::HTML(response)
  # Record the uri returned by the cache in place of the one we were given
  data[:uri] = uri.to_s

  common = SiteStandards::COMMON_CHECKS
  capture = SiteStandards::CHECK_CAPTURE

  # FIRST: scan each link's a_href to see if we need to capture it
  doc.css('a').each do |a|
    # Normalize the text and href for our capture purposes
    a_href = a['href'].to_s.strip
    a_text = a.text.downcase.strip
    $stderr.puts "#{a_text} #{a_href}" if $verbose

    # Check the href urls for some patterns
    if a_href =~ common['foundation'][capture]
      img = a.at('img')
      title = img && img['title']
      src = img && img['src']
      data[:foundation] =
        if title
          # use the title (hover text) in preference to the source
          squash(title)
        elsif src
          resolve_link(uri, src.strip)
        else
          # no image, or an image with neither title nor src: use the link text
          squash(a.text)
        end
    end

    if a_href =~ common['events'][capture]
      img = a.at('img')
      src = img && img['src']
      # an image without a src attribute is treated the same as no image
      data[:events] = resolve_link(uri, src ? src.strip : a_href)
    end

    # Check the a_text strings for other patterns
    # Note this is an unusual case: both the text and the href must match
    if (a_text =~ common['license'][SiteStandards::CHECK_TEXT]) &&
       (a_href =~ common['license'][capture])
      data[:license] = resolve_link(uri, a_href)
    end

    %w(thanks security sponsorship).each do |check|
      if a_text =~ common[check][capture]
        data[check.to_sym] = resolve_link(uri, a_href)
      end
    end
  end

  # SECOND: scan each text node to match and capture
  doc.traverse do |node|
    next unless node.is_a?(Nokogiri::XML::Text)
    txt = squash(node.text)

    # the first generic match wins, but allow override if phrase looks good
    if (txt =~ common['trademarks'][capture] && !data[:trademarks]) ||
       txt =~ /are trademarks of [Tt]he Apache Software/
      text, from_parent = getText(txt, node)
      # drop previous text if it looks like Copyright sentence
      data[:trademarks] = text.sub(/^.*?Copyright .+? Foundation[.]?/,'').strip
      data[:tradeparent] = from_parent if from_parent
    end

    if txt =~ common['copyright'][capture]
      text, from_parent = getText(txt, node)
      # drop text around the Copyright (or the symbol)
      data[:copyright] = text.sub(/^.*?((Copyright|©) .+? Foundation[.]?).*/,'\1').strip
      data[:copyparent] = from_parent if from_parent
    end

    # Note we also check for incubator disclaimer (immaterial of tlp|podling)
    if txt =~ SiteStandards::PODLING_CHECKS['disclaimer'][capture]
      text, _from_parent = getText(txt, node, / is an effort undergoing/)
      data[:disclaimer] = text
    end
  end

  # THIRD: see if an image has been uploaded
  data[:image] = ASF::SiteImage.find(id)
  data
end
#########################################################################
# Main execution begins here
# Scan results keyed by name: committees (or the single URL) and podlings
results = {}
podlings = {}
# Output file names; nil means print that result set to stdout
output_projects = nil
output_podlings = nil
$cache = Cache.new(dir: 'site-scan')
$verbose = ARGV.delete '--verbose'

# USAGE:
# site-scan.rb https://whimsical.apache.org [whimsy] [whimsy-scan.json] - to scan one project
# site-scan.rb [project-output.json] [podlings-output.json] [projname podlingname ...]
# If additional projname|podlingname are provided, only scans those sites
if ARGV.first =~ /^https?:\/\/\w/
  # Scan a single URL provided by user; the name defaults to the capitalized
  # first label of the host
  site = ARGV.shift
  name = ARGV.shift || site[/\/(\w[^.]*)/, 1].capitalize
  output_projects = ARGV.shift
  results[name] = parse(name, site, name)
else
  # Gather output filenames (if any): an argument containing '.' or '/' is
  # taken to be a file name, and the second is only looked for after a first
  file_name = %r{[./]}
  output_projects = ARGV.shift if ARGV.first =~ file_name
  output_podlings = ARGV.shift if output_projects && ARGV.first =~ file_name

  # Scan committees, including non-pmcs
  ASF::Committee.load_committee_info
  all_committees = (ASF::Committee.pmcs + ASF::Committee.nonpmcs).uniq
  all_committees.sort_by(&:name).each do |committee|
    next unless committee.site
    # if more parameters specified, parse only those names
    next unless ARGV.empty? || ARGV.include?(committee.name)

    results[committee.name] = parse(committee.name, committee.site, committee.display_name)
  end

  # Scan podlings that have a website
  ASF::Podling.list.each do |podling|
    next unless podling.status == 'current'

    website = podling.podlingStatus[:website]
    next unless website
    # if more parameters specified, parse only those names
    next unless ARGV.empty? || ARGV.include?(podling.name)

    podlings[podling.name] = parse(podling.name, website, podling.display_name)
  end
end

# Output all results: each set goes to its file when one was named,
# otherwise to stdout
emit = lambda do |path, payload|
  json = JSON.pretty_generate(payload)
  if path
    File.write(path, json)
  else
    puts json
  end
end
emit.call(output_projects, results)
emit.call(output_podlings, podlings)