| #!/usr/bin/env ruby |
| $LOAD_PATH.unshift File.realpath(File.expand_path('../../lib', __FILE__)) |
| |
| # |
| # Scans committee pages for compliance with requirements and recommendations: |
| # https://www.apache.org/foundation/marks/pmcs#navigation |
| # http://www.apache.org/events/README.txt |
| # |
| # Makes no value judgements. Simply extracts raw data for offline analysis. |
| # |
| |
| require 'whimsy/asf' |
| require 'net/http' |
| require 'nokogiri' |
| require 'json' |
| |
| require 'whimsy/cache' |
| |
# Normalise a piece of text for reporting: invalid byte sequences are
# scrubbed, every run of whitespace becomes a single space, and leading and
# trailing whitespace is removed.
def squash(text)
  valid = text.scrub
  single_spaced = valid.gsub(/[[:space:]]+/, ' ')
  single_spaced.strip
end
| |
| ######################################################################### |
| |
# Result of looking up the site image directory via ASF::SVN.find.
# parse only searches for an uploaded project image when this value is truthy.
IMAGE_DIR = ASF::SVN.find('asf/infrastructure/site/trunk/content/img')
| |
# Resolve +href+ against +base+.
#
# Returns the value of <tt>base + href</tt>; when that raises a StandardError
# (for instance because the href cannot be parsed as a URI) the raw +href+
# is returned instead, so a single malformed link cannot abort a scan.
def resolve_link(base, href)
  base + href
rescue StandardError
  href
end

# Scan one project site and collect the raw data found on its home page.
#
# id::   identifier used in the progress line written to stderr and when
#        looking for an uploaded image named "<id>.*" under IMAGE_DIR
# site:: URL of the site to scan
# name:: display name stored in the result
#
# Returns a Hash with the keys listed under "default data" below, plus
# :thanks, :tradeparent and :copyparent when the corresponding data is found.
# If the host name cannot be resolved the defaults are returned unchanged.
def parse(id, site, name)
  uri = URI.parse(site)

  # default data
  data = {
    display_name: name,
    uri: site,
    events: nil,
    foundation: nil,
    license: nil,
    sponsorship: nil,
    security: nil,
    trademarks: nil,
    copyright: nil,
    image: nil,
  }

  # check if site exists
  begin
    Socket.getaddrinfo(uri.host, uri.scheme)
  rescue SocketError
    return data
  end

  # from here on +uri+ is whatever the cache reports for the fetched page
  uri, response, status = $cache.get(site.to_s)
  $stderr.puts "#{id} #{uri} #{status}"
  doc = Nokogiri::HTML(response)
  data[:uri] = uri.to_s

  # scan each link
  doc.css('a').each do |a|

    # check the link targets
    a_href = a['href'].to_s.strip

    if a_href =~ %r{^https?://(www\.)?apache\.org/?$}
      img = a.at('img')
      src = img && img['src']
      data[:foundation] =
        if img && img['title']
          # use the title (hover text) in preference to the source
          squash(img['title'])
        elsif src
          resolve_link(uri, src.strip)
        else
          # no image, or an image without a src attribute: use the link text
          squash(a.text)
        end
    end

    if a_href.include? 'apache.org/events/'
      img = a.at('img')
      src = img && img['src']
      # prefer the image source; fall back to the link target when there is
      # no image or the image has no src attribute
      data[:events] = resolve_link(uri, src ? src.strip : a_href)
    end

    # check the link text
    a_text = a.text.downcase.strip
    $stderr.puts a_text if $verbose

    if a_text =~ /licenses?/ && a_href.include?('apache.org')
      data[:license] = resolve_link(uri, a_href)
    end

    data[:thanks] = resolve_link(uri, a_href) if a_text == 'thanks'

    data[:security] = resolve_link(uri, a_href) if a_text == 'security'

    if ['sponsorship', 'donate', 'sponsor apache','sponsoring apache'].include? a_text
      data[:sponsorship] = resolve_link(uri, a_href)
    end
  end

  # Now scan the page text
  doc.traverse do |node|
    next unless node.is_a?(Nokogiri::XML::Text)

    txt = squash(node.text)

    # allow override if phrase looks good
    if (txt =~ /\btrademarks\b/ && !data[:trademarks]) ||
       txt =~ /are trademarks of [Tt]he Apache Software/
      text, from_parent = getText(txt, node)
      # drop previous text if it looks like Copyright sentence
      data[:trademarks] = text.sub(/^.*?Copyright .+? Foundation[.]?/, '').strip
      data[:tradeparent] = from_parent if from_parent
    end

    if txt =~ /Copyright / || txt =~ /©/
      text, from_parent = getText(txt, node)
      # drop text around the Copyright (or the symbol)
      data[:copyright] = text.sub(/^.*?((Copyright|©) .+? Foundation[.]?).*/, '\1').strip
      data[:copyparent] = from_parent if from_parent
    end
  end

  # see if image has been uploaded
  if IMAGE_DIR
    data[:image] = Dir["#{IMAGE_DIR}/#{id}.*"].
      map { |path| File.basename(path) }.first
  end

  data
end
| |
# Return the text to analyse for +node+ plus a debug flag.
#
# When +txt+ already mentions the Apache Software Foundation (any case) it is
# taken to be complete and returned with a nil flag. Otherwise the squashed
# text of the enclosing element is returned with the flag set to true, which
# shows that the parent had to be fetched.
def getText(txt, node)
  return [txt, nil] if txt =~ /Apache Software Foundation/i

  holder = node.parent
  # e.g. whimsical: a link parent has no extra text, so go one level higher
  container = holder.name == 'a' ? holder.parent : holder
  # ensure <br> is treated as a separator when extracting the combined text
  container.css('br').each { |line_break| line_break.replace(' ') }
  [squash(container.text), true]
end
| |
# --verbose makes parse echo the text of every link to stderr
$verbose = ARGV.delete '--verbose'

results = {}
outfile = nil

# cache object that parse uses to fetch each site
$cache = Cache.new(dir: 'site-scan')

# One or two arguments, the first being a URL: parse just that site
single_site = (1..2).include?(ARGV.length) && ARGV.first =~ /^https?:\/\/\w/

if single_site
  site = ARGV.shift
  # the optional second argument is the name; otherwise derive one from the URL
  name = ARGV.shift || site[/\/(\w[^.]*)/, 1].capitalize
  results[name] = parse(name, site, name)
else
  # a leading argument containing '.' or '/' is taken to be the output file name
  outfile = ARGV.shift if ARGV.first =~ %r{[./]}

  # scan all committees, including non-pmcs
  ASF::Committee.load_committee_info
  committees = (ASF::Committee.pmcs + ASF::Committee.nonpmcs).uniq

  committees.sort_by(&:name).each do |committee|
    next unless committee.site
    # if parameters specified, parse only those names
    next unless ARGV.empty? || ARGV.include?(committee.name)

    # fetch, parse committee site
    results[committee.name] = parse(committee.name, committee.site, committee.display_name)
  end
end

# Output results: to the named file if one was given, otherwise to stdout
report = JSON.pretty_generate(results)
if outfile
  File.write(outfile, report)
else
  puts report
end