tools/site-scan.rb - whimsy - Git at Google

 #!/usr/bin/env ruby
 # Scans Apache project homepages and captures text|urls for common links
 # Gathers data that can be used to check for policy compliance:
 #   https://www.apache.org/foundation/marks/pmcs#navigation
 #   http://www.apache.org/events/README.txt
 #   See Also: lib/whimsy/sitestandards.rb
 #
 # Makes no value judgements.  Simply extracts raw data for offline analysis.
 $LOAD_PATH.unshift '/srv/whimsy/lib'
 require 'net/http'
 require 'nokogiri'
 require 'json'
 require 'whimsy/asf'
 require 'whimsy/cache'
 require 'whimsy/sitestandards'

 # Collapse whitespace in a run of text.
 # Invalid byte sequences are scrubbed first, every run of whitespace
 # becomes a single space, and the ends are trimmed.
 # @param text [String] raw text
 # @return [String] the normalized text
 def squash(text)
   cleaned = text.scrub
   single_spaced = cleaned.gsub(/[[:space:]]+/, ' ')
   single_spaced.strip
 end

 # Return the text for a node, widening to an enclosing node when the text
 # on its own does not satisfy +match+ (i.e. looks incomplete).
 # Used when scanning for phrases that may be split across an image, a link
 # or other commonly related nodes.
 # NOTE: when widening, any <br> below the enclosing node is replaced in the
 # document by a space so that it acts as a separator in the combined text.
 # @param txt [String] text already extracted from +node+
 # @param node [#parent] node the text came from (only used when widening)
 # @param match [Regexp] pattern that marks the text as complete
 # @return [Array] the text, then true if the enclosing node was used (else nil)
 def getText(txt, node, match=/Apache Software Foundation/i)
   # Text is already complete: nil flags that no parent was needed
   return [txt, nil] if txt =~ match

   container = node.parent
   # e.g. whimsical. A link parent does not have extra text, so go up once more.
   container = container.parent if container.name == 'a'
   container.css('br').each { |line_break| line_break.replace(" ") }
   [squash(container.text), true]
 end

 # Combine +ref+ with +base+ for capture purposes.
 # Falls back to the raw +ref+ when the two cannot be combined (for example a
 # malformed href), so that one bad link never aborts the scan of a site.
 # @param base [#+] the site uri as returned by the cache
 # @param ref [String] href or image source found on the page
 # @return [Object] base + ref, or +ref+ itself if that raised
 def resolve_link(base, ref)
   base + ref
 rescue StandardError
   ref
 end

 # Parse an Apache project website and return text|urls that match our checks
 # @param id identifier written to the progress log and passed to ASF::SiteImage.find
 # @param site URL of the homepage to scan
 # @param name display name, stored under :display_name
 # @return [Hash] of symbols: text|url found from a check made. Holds one key
 #   per SiteStandards::COMMON_CHECKS entry (nil when nothing was captured)
 #   plus :display_name, :uri and :image; :errors is set (and the scan is
 #   skipped) when the site cannot be parsed, resolved or fetched
 # @see SiteStandards for definitions of what we should scan for (in general)
 def parse(id, site, name)
   data = {}
   SiteStandards::COMMON_CHECKS.each_key { |k| data[k.to_sym] = nil }
   data[:display_name] = name
   data[:uri] = site

   # Cheap sanity checks before fetching: the URL must parse and the host resolve
   begin
     uri = URI.parse(site)
     Socket.getaddrinfo(uri.host, uri.scheme)
   rescue URI::InvalidURIError, SocketError => e
     data[:errors] = e.message
     return data
   end

   begin
     # NOTE(review): presumably the cache follows redirects and returns the
     # final uri along with the response; confirm against whimsy/cache
     uri, response, status = $cache.get(site.to_s)
   rescue IOError => ioe
     data[:errors] = ioe.message
     return data
   end
   $stderr.puts "#{id} #{uri} #{status}"
   # Bail and return if getting the site returns an error code
   if response.respond_to?(:code) && response.code =~ /^[45]/
     data[:errors] = "cache.get(#{site.to_s}) error code #{response.code}"
     return data
   end
   doc = Nokogiri::HTML(response)
   data[:uri] = uri.to_s

   common = SiteStandards::COMMON_CHECKS
   capture = SiteStandards::CHECK_CAPTURE

   # FIRST: scan each link's a_href to see if we need to capture it
   doc.css('a').each do |a|
     # Normalize the text and href for our capture purposes
     a_href = a['href'].to_s.strip
     a_text = a.text.downcase.strip
     $stderr.puts "#{a_text} #{a_href}" if $verbose

     # Check the href urls for some patterns
     if a_href =~ common['foundation'][capture]
       img = a.at('img')
       if img.nil?
         data[:foundation] = squash(a.text)
       elsif img['title']
         # use the title (hover text) in preference to the source
         data[:foundation] = squash(img['title'])
       else
         # an image without a src attribute falls back to the link target
         src = img['src'].to_s.strip
         data[:foundation] = resolve_link(uri, src.empty? ? a_href : src)
       end
     end

     if a_href =~ common['events'][capture]
       img = a.at('img')
       # prefer the image source; use the link target when there is none
       src = img ? img['src'].to_s.strip : ''
       data[:events] = resolve_link(uri, src.empty? ? a_href : src)
     end

     # Check the a_text strings for other patterns
     # Note this is an unusual case: both the text and the href must match
     if (a_text =~ common['license'][SiteStandards::CHECK_TEXT]) &&
         (a_href =~ common['license'][capture])
       data[:license] = resolve_link(uri, a_href)
     end

     %w(thanks security sponsorship).each do |check|
       next unless a_text =~ common[check][capture]
       data[check.to_sym] = resolve_link(uri, a_href)
     end
   end

   # SECOND: scan each text node to match and capture
   doc.traverse do |node|
     next unless node.is_a?(Nokogiri::XML::Text)
     txt = squash(node.text)
     # first match wins, but allow override if phrase looks good
     if (txt =~ common['trademarks'][capture] && !data[:trademarks]) ||
         txt =~ /are trademarks of [Tt]he Apache Software/
       text, from_parent = getText(txt, node)
       # drop previous text if it looks like Copyright sentence
       data[:trademarks] = text.sub(/^.*?Copyright .+? Foundation[.]?/, '').strip
       data[:tradeparent] = from_parent if from_parent
     end
     if txt =~ common['copyright'][capture]
       text, from_parent = getText(txt, node)
       # drop text around the Copyright (or the symbol)
       data[:copyright] = text.sub(/^.*?((Copyright|©) .+? Foundation[.]?).*/, '\1').strip
       data[:copyparent] = from_parent if from_parent
     end
     # Note we also check for incubator disclaimer (immaterial of tlp|podling)
     if txt =~ SiteStandards::PODLING_CHECKS['disclaimer'][capture]
       data[:disclaimer] = getText(txt, node, / is an effort undergoing/).first
     end
   end

   # THIRD: see if an image has been uploaded
   data[:image] = ASF::SiteImage.find(id)

   data
 end

 #########################################################################
 # Main execution begins here
 #
 # USAGE:
 # site-scan.rb https://whimsical.apache.org [whimsy] [whimsy-scan.json] - to scan one project
 # site-scan.rb [project-output.json] [podlings-output.json] [projname podlingname ...]
 # If additional projname|podlingname are provided, only scans those sites
 $cache = Cache.new(dir: 'site-scan')
 $verbose = ARGV.delete '--verbose'
 results = {}
 podlings = {}
 output_projects = nil
 output_podlings = nil

 if ARGV.first =~ /^https?:\/\/\w/
   # Single-site mode: URL, then optional name, then optional output file
   site = ARGV.shift
   name = ARGV.shift || site[/\/(\w[^.]*)/, 1].capitalize
   output_projects = ARGV.shift
   results[name] = parse(name, site, name)
 else
   # Leading arguments containing '.' or '/' are taken to be output file names
   if ARGV.first =~ %r{[./]}
     output_projects = ARGV.shift
     output_podlings = ARGV.shift if ARGV.first =~ %r{[./]}
   end

   # Scan committees, including non-pmcs
   ASF::Committee.load_committee_info
   (ASF::Committee.pmcs + ASF::Committee.nonpmcs).uniq.sort_by(&:name).each do |committee|
     next unless committee.site
     # any remaining arguments restrict the scan to the named sites
     next unless ARGV.empty? || ARGV.include?(committee.name)
     results[committee.name] = parse(committee.name, committee.site, committee.display_name)
   end

   # Scan current podlings that have a website
   ASF::Podling.list.each do |podling|
     next unless podling.status == 'current' && podling.podlingStatus[:website]
     next unless ARGV.empty? || ARGV.include?(podling.name)
     podlings[podling.name] = parse(podling.name, podling.podlingStatus[:website], podling.display_name)
   end
 end

 # Output all results: each set goes to its file if one was named, else to stdout
 emit = lambda do |path, payload|
   json = JSON.pretty_generate(payload)
   if path
     File.write(path, json)
   else
     puts json
   end
 end
 emit.call(output_projects, results)
 emit.call(output_podlings, podlings)
	#!/usr/bin/env ruby
	# Scans Apache project homepages and captures text|urls for common links
	# Gathers data that can be used to check for policy compliance:
	# https://www.apache.org/foundation/marks/pmcs#navigation
	# http://www.apache.org/events/README.txt
	# See Also: lib/whimsy/sitestandards.rb
	#
	# Makes no value judgements. Simply extracts raw data for offline analysis.
	$LOAD_PATH.unshift '/srv/whimsy/lib'
	require 'net/http'
	require 'nokogiri'
	require 'json'
	require 'whimsy/asf'
	require 'whimsy/cache'
	require 'whimsy/sitestandards'

	# Collapse whitespace in a run of text.
	# Invalid byte sequences are scrubbed first, every run of whitespace
	# becomes a single space, and the ends are trimmed.
	# @param text [String] raw text
	# @return [String] the normalized text
	def squash(text)
	  cleaned = text.scrub
	  single_spaced = cleaned.gsub(/[[:space:]]+/, ' ')
	  single_spaced.strip
	end

	# Return the text for a node, widening to an enclosing node when the text
	# on its own does not satisfy +match+ (i.e. looks incomplete).
	# Used when scanning for phrases that may be split across an image, a link
	# or other commonly related nodes.
	# NOTE: when widening, any <br> below the enclosing node is replaced in the
	# document by a space so that it acts as a separator in the combined text.
	# @param txt [String] text already extracted from +node+
	# @param node [#parent] node the text came from (only used when widening)
	# @param match [Regexp] pattern that marks the text as complete
	# @return [Array] the text, then true if the enclosing node was used (else nil)
	def getText(txt, node, match=/Apache Software Foundation/i)
	  parent = nil # debug to show where parent needed to be fetched
	  unless txt =~ match # have we got all the text?
	    if node.parent.name == 'a' # e.g. whimsical. such parents don't have extra text.
	      newnode = node.parent.parent
	    else
	      newnode = node.parent
	    end
	    # ensure <br> is treated as a separator when extracting the combined text
	    newnode.css('br').each { |br| br.replace(" ") }
	    txt = squash(newnode.text)
	    parent = true
	  end
	  return txt, parent
	end

	# Combine +ref+ with +base+ for capture purposes.
	# Falls back to the raw +ref+ when the two cannot be combined (for example a
	# malformed href), so that one bad link never aborts the scan of a site.
	# @param base [#+] the site uri as returned by the cache
	# @param ref [String] href or image source found on the page
	# @return [Object] base + ref, or +ref+ itself if that raised
	def resolve_link(base, ref)
	  base + ref
	rescue StandardError
	  ref
	end

	# Parse an Apache project website and return text|urls that match our checks
	# @param id identifier written to the progress log and passed to ASF::SiteImage.find
	# @param site URL of the homepage to scan
	# @param name display name, stored under :display_name
	# @return [Hash] of symbols: text|url found from a check made. Holds one key
	#   per SiteStandards::COMMON_CHECKS entry (nil when nothing was captured)
	#   plus :display_name, :uri and :image; :errors is set (and the scan is
	#   skipped) when the site cannot be parsed, resolved or fetched
	# @see SiteStandards for definitions of what we should scan for (in general)
	def parse(id, site, name)
	  data = {}
	  SiteStandards::COMMON_CHECKS.each_key { |k| data[k.to_sym] = nil }
	  data[:display_name] = name
	  data[:uri] = site

	  # Cheap sanity checks before fetching: the URL must parse and the host resolve
	  begin
	    uri = URI.parse(site)
	    Socket.getaddrinfo(uri.host, uri.scheme)
	  rescue URI::InvalidURIError, SocketError => e
	    data[:errors] = e.message
	    return data
	  end

	  begin
	    # NOTE(review): presumably the cache follows redirects and returns the
	    # final uri along with the response; confirm against whimsy/cache
	    uri, response, status = $cache.get(site.to_s)
	  rescue IOError => ioe
	    data[:errors] = ioe.message
	    return data
	  end
	  $stderr.puts "#{id} #{uri} #{status}"
	  # Bail and return if getting the site returns an error code
	  if response.respond_to?(:code) && response.code =~ /^[45]/
	    data[:errors] = "cache.get(#{site.to_s}) error code #{response.code}"
	    return data
	  end
	  doc = Nokogiri::HTML(response)
	  data[:uri] = uri.to_s

	  common = SiteStandards::COMMON_CHECKS
	  capture = SiteStandards::CHECK_CAPTURE

	  # FIRST: scan each link's a_href to see if we need to capture it
	  doc.css('a').each do |a|
	    # Normalize the text and href for our capture purposes
	    a_href = a['href'].to_s.strip
	    a_text = a.text.downcase.strip
	    $stderr.puts "#{a_text} #{a_href}" if $verbose

	    # Check the href urls for some patterns
	    if a_href =~ common['foundation'][capture]
	      img = a.at('img')
	      if img.nil?
	        data[:foundation] = squash(a.text)
	      elsif img['title']
	        # use the title (hover text) in preference to the source
	        data[:foundation] = squash(img['title'])
	      else
	        # an image without a src attribute falls back to the link target
	        src = img['src'].to_s.strip
	        data[:foundation] = resolve_link(uri, src.empty? ? a_href : src)
	      end
	    end

	    if a_href =~ common['events'][capture]
	      img = a.at('img')
	      # prefer the image source; use the link target when there is none
	      src = img ? img['src'].to_s.strip : ''
	      data[:events] = resolve_link(uri, src.empty? ? a_href : src)
	    end

	    # Check the a_text strings for other patterns
	    # Note this is an unusual case: both the text and the href must match
	    if (a_text =~ common['license'][SiteStandards::CHECK_TEXT]) &&
	        (a_href =~ common['license'][capture])
	      data[:license] = resolve_link(uri, a_href)
	    end

	    %w(thanks security sponsorship).each do |check|
	      next unless a_text =~ common[check][capture]
	      data[check.to_sym] = resolve_link(uri, a_href)
	    end
	  end

	  # SECOND: scan each text node to match and capture
	  doc.traverse do |node|
	    next unless node.is_a?(Nokogiri::XML::Text)
	    txt = squash(node.text)
	    # first match wins, but allow override if phrase looks good
	    if (txt =~ common['trademarks'][capture] && !data[:trademarks]) ||
	        txt =~ /are trademarks of [Tt]he Apache Software/
	      text, from_parent = getText(txt, node)
	      # drop previous text if it looks like Copyright sentence
	      data[:trademarks] = text.sub(/^.*?Copyright .+? Foundation[.]?/, '').strip
	      data[:tradeparent] = from_parent if from_parent
	    end
	    if txt =~ common['copyright'][capture]
	      text, from_parent = getText(txt, node)
	      # drop text around the Copyright (or the symbol)
	      data[:copyright] = text.sub(/^.*?((Copyright|©) .+? Foundation[.]?).*/, '\1').strip
	      data[:copyparent] = from_parent if from_parent
	    end
	    # Note we also check for incubator disclaimer (immaterial of tlp|podling)
	    if txt =~ SiteStandards::PODLING_CHECKS['disclaimer'][capture]
	      data[:disclaimer] = getText(txt, node, / is an effort undergoing/).first
	    end
	  end

	  # THIRD: see if an image has been uploaded
	  data[:image] = ASF::SiteImage.find(id)

	  data
	end

	#########################################################################
	# Main execution begins here
	results = {}
	podlings = {}
	$cache = Cache.new(dir: 'site-scan')
	$verbose = ARGV.delete '--verbose'

	# USAGE:
	# site-scan.rb https://whimsical.apache.org [whimsy] [whimsy-scan.json] - to scan one project
	# site-scan.rb [project-output.json] [podlings-output.json] [projname podlingname ...]
	# If additional projname|podlingname are provided, only scans those sites
	if ARGV.first =~ /^https?:\/\/\w/
	  # Scan a single URL provided by user; the name defaults to the first
	  # label of the host, capitalized
	  site = ARGV.shift
	  name = ARGV.shift || site[/\/(\w[^.]*)/, 1].capitalize
	  output_projects = ARGV.shift
	  results[name] = parse(name, site, name)
	else
	  # Gather output filenames (if any) and scan various projects
	  if ARGV.first =~ %r{[./]} # have we a file name?
	    output_projects = ARGV.shift
	    if ARGV.first =~ %r{[./]} # have we another file name?
	      output_podlings = ARGV.shift
	    else
	      output_podlings = nil
	    end
	  else
	    output_projects = nil
	  end

	  # Scan committees, including non-pmcs
	  ASF::Committee.load_committee_info
	  committees = (ASF::Committee.pmcs + ASF::Committee.nonpmcs).uniq
	  committees.sort_by { |committee| committee.name }.each do |committee|
	    next unless committee.site
	    # if more parameters specified, parse only those names
	    if ARGV.length > 0
	      next unless ARGV.include? committee.name
	    end
	    results[committee.name] = parse(committee.name, committee.site, committee.display_name)
	  end

	  # Scan podlings that have a website
	  ASF::Podling.list.each do |podling|
	    if podling.status == 'current' and podling.podlingStatus[:website]
	      # if more parameters specified, parse only those names
	      if ARGV.length > 0
	        next unless ARGV.include? podling.name
	      end
	      podlings[podling.name] = parse(podling.name, podling.podlingStatus[:website], podling.display_name)
	    end
	  end
	end

	# Output all results: each set goes to its file if one was named, else to stdout
	if output_projects
	  File.write(output_projects, JSON.pretty_generate(results))
	else
	  puts JSON.pretty_generate(results)
	end
	if output_podlings
	  File.write(output_podlings, JSON.pretty_generate(podlings))
	else
	  puts JSON.pretty_generate(podlings)
	end