#!/usr/bin/env ruby
# Scans Apache project homepages and captures text|urls for common links
# Gathers data that can be used to check for policy compliance:
# https://www.apache.org/foundation/marks/pmcs#navigation
# http://www.apache.org/events/README.txt
# See Also: lib/whimsy/sitestandards.rb
#
# Makes no value judgements. Simply extracts raw data for offline analysis.
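# Output is a JSON hash of committee/podling name => captured check data (see parse below)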
$LOAD_PATH.unshift '/srv/whimsy/lib'
require 'net/http'
require 'nokogiri'
require 'json'
require 'whimsy/asf'
require 'whimsy/cache'
require 'whimsy/sitestandards'
require_relative 'asf-site-check'
$stdout.sync = true
# Normalize spaces in text runs
def squash(text)
  return text.scrub.gsub(/[[:space:]]+/, ' ').strip
end
# Get text from a node; use the parent node if the text does not appear to be complete
# This is used when scanning for text that may be split across an image
# or other related nodes commonly found on websites
def getText(txt, node, match=/Apache Software Foundation/i)
  parent = nil # flag showing whether the parent text had to be fetched (for debugging)
  if txt !~ match # have we got all the text?
    if node.parent.name == 'a' # e.g. whimsical; such parents don't have extra text
      newnode = node.parent.parent
    else
      newnode = node.parent
    end
    # ensure <br> is treated as a separator when extracting the combined text
    newnode.css('br').each { |br| br.replace(" ") }
    txt = squash(newnode.text)
    parent = true
  end
  return txt, parent
end
# Helper for multiple event links: keep the first value found
# TODO should we show them all?
def save_events(data, value)
  prev = data[:events]
  if prev and prev != value
    puts "Events: already have '#{prev}', not storing '#{value}'"
  else
    data[:events] = value
  end
end
# Extract link text, skipping invisible content (assumed to be in a span whose class ends with 'sr-only')
def get_link_text(anode)
  bits = []
  anode.traverse do |node|
    if node.name == 'text'
      bits << node.text unless node.parent.name == 'span' and node.parent.attribute('class')&.value&.end_with? 'sr-only'
    end
  end
  bits.join(' ')
end
# Parse an Apache project website and return text|urls that match our checks
# @return Hash keyed by check symbol: the text|url found for each check
# @see SiteStandards for definitions of what we should scan for (in general)
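# Also sets :display_name, :uri, :errors (on failure), :image, :resources,
# plus :tradeparent/:copyparent debug flags when the parent node had to be used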
def parse(id, site, name)
  data = {}
  # force https to avoid issues with the cache (sites should use https anyway)
  site.sub!(%r{^http:}, 'https:')
  SiteStandards::COMMON_CHECKS.each_key do |k|
    data[k.to_sym] = nil
  end
  data[:display_name] = name
  data[:uri] = site
  uri = URI.parse(site)
  begin
    Socket.getaddrinfo(uri.host, uri.scheme)
  rescue SocketError => se
    data[:errors] = se.message
    return data
  end
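  # $cache.get is expected to return the final (possibly redirected) URI,
  # the response body (or a response object on error), and a cache status string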
  begin
    uri, response, status = $cache.get(site.to_s)
  rescue IOError => ioe
    data[:errors] = ioe.message
    return data
  end
  puts "#{id} #{uri} #{status}"
  # Bail and return if getting the site returns an error code
  if response.respond_to? :code and response.code =~ /^[45]/
    data[:errors] = "cache.get(#{site}) error code #{response.code}"
    return data
  end
  doc = Nokogiri::HTML(response)
  if $saveparse
    file = File.join('/tmp', "site-scan_#{$$}.txt")
    File.write(file, doc.to_s)
    $stderr.puts "Wrote parsed input to #{file}"
  end
  data[:uri] = uri.to_s
  # FIRST: scan each link's a_href to see if we need to capture it
  # also capture script src for events
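  # (relative src/href values are resolved against the page URI via URI#+)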
  doc.traverse do |a|
    if a.name == 'script'
      a_src = a['src'].to_s.strip
      if a_src =~ SiteStandards::COMMON_CHECKS['events'][SiteStandards::CHECK_CAPTURE]
        save_events data, uri + a_src
      end
    end
    next unless a.name == 'a'
    # Normalize the text and href for our capture purposes
    a_href = a['href'].to_s.strip
    a_text = get_link_text(a) # Not down-cased yet
    $stderr.puts "#{a_text.inspect} #{a_href}" if $verbose
    # Check the href urls for some patterns
    if a_href =~ SiteStandards::COMMON_CHECKS['foundation'][SiteStandards::CHECK_CAPTURE]
      img = a.at('img')
      if img
        # use the title (hover text) in preference to the source
        data[:foundation] = img['title'] ? squash(img['title']) : uri + img['src'].strip
      else
        data[:foundation] = squash(a_text)
      end
    end
    if a_href =~ SiteStandards::COMMON_CHECKS['events'][SiteStandards::CHECK_CAPTURE]
      # Hack to ignore hidden links on main site
      save_events data, uri + a_href unless a['class'] == 'visible-home' and uri.path != '/'
    end
    # Check the a_text strings for other patterns
    a_text = a_text.downcase.strip # needs to be downcased here
    # Note: this is an unusual case - both the link text and the href must match
    if (a_text =~ SiteStandards::COMMON_CHECKS['license'][SiteStandards::CHECK_TEXT]) and
       (a_href =~ SiteStandards::COMMON_CHECKS['license'][SiteStandards::CHECK_CAPTURE])
      begin
        data[:license] = uri + a_href
      rescue StandardError
        data[:license] = a_href
      end
    end
    %w(thanks security sponsorship privacy).each do |check|
      if a_text =~ SiteStandards::COMMON_CHECKS[check][SiteStandards::CHECK_CAPTURE]
        begin
          data[check.to_sym] = uri + a_href
        rescue StandardError
          data[check.to_sym] = a_href
        end
      end
    end
  end
  # SECOND: scan each text node to match and capture
  doc.traverse do |node|
    next unless node.is_a?(Nokogiri::XML::Text)
    txt = squash(node.text)
    # allow a later match to override an earlier one if the phrase looks like the full trademark sentence
    if (txt =~ SiteStandards::COMMON_CHECKS['trademarks'][SiteStandards::CHECK_CAPTURE] and not data[:trademarks]) or
       txt =~ /are trademarks of [Tt]he Apache Software/
      t, p = getText(txt, node)
      # drop preceding text if it looks like a Copyright sentence
      data[:trademarks] = t.sub(/^.*?Copyright .+? Foundation[.]?/, '').strip
      data[:tradeparent] = p if p
    end
    if txt =~ SiteStandards::COMMON_CHECKS['copyright'][SiteStandards::CHECK_CAPTURE]
      t, p = getText(txt, node)
      # drop text around the Copyright (or ©) sentence
      data[:copyright] = t.sub(/^.*?((Copyright|©) .+? Foundation[.]?).*/, '\1').strip
      data[:copyparent] = p if p
    end
    # Note: we also check for the incubator disclaimer (regardless of whether the site is a TLP or a podling)
    if txt =~ SiteStandards::PODLING_CHECKS['disclaimer'][SiteStandards::CHECK_CAPTURE]
      t, _p = getText(txt, node, / is an effort undergoing/)
      data[:disclaimer] = t
    end
  end
  # THIRD: see if an image has been uploaded
  data[:image] = ASF::SiteImage.find(id)
  # Check for resource loading from non-ASF domains
  if $skipresourcecheck
    data[:resources] = "Not checked"
  else
    cmd = ['node', '/srv/whimsy/tools/scan-page.js', site]
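    # scan-page.js is expected to print, one per line, the URL of each resource the page loads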
    out, err, status = exec_with_timeout(cmd, 60)
    if status
      ext_urls = out.split("\n").reject {|x| ASFDOMAIN.asfhost? x}.tally
      resources = ext_urls.values.sum
      data[:resources] = "Found #{resources} external resources: #{ext_urls}"
    else
      data[:resources] = err
    end
  end
  # TODO: does not find js references such as:
  # ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
  return data
end
require 'timeout'
# the node script appears to stall sometimes, so apply a timeout
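# Returns stdout, stderr, and a boolean success status; on timeout or failure,
# stderr is replaced with a short reason ('Timeout' or 'Scanning failed')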
def exec_with_timeout(cmd, timeout)
  begin
    # stdout, stderr pipes
    rout, wout = IO.pipe
    rerr, werr = IO.pipe
    stdout = stderr = nil
    status = false
    pid = Process.spawn(*cmd, pgroup: true, out: wout, err: werr)
    Timeout.timeout(timeout) do
      Process.waitpid(pid)
      status = $?.success?
      # close write ends so we can read from them
      wout.close
      werr.close
      stdout = rout.readlines.join
      stderr = rerr.readlines.join
      unless status
        $stderr.puts "WARN: #{Time.now} failed scanning #{cmd} #{pid} #{stderr}"
        stderr = 'Scanning failed'
      end
    end
  rescue Timeout::Error
    # Try to determine why the kill does not tidy the chrome processes
    # Also whether a kill was actually issued!
    puts "WARN: timeout scanning #{cmd[-1]} #{pid}"
    $stderr.puts "WARN: #{Time.now} timeout scanning #{cmd[-1]} #{pid}"
    stderr = 'Timeout'
    ret = ''
    # Try to show process tree
    cmd = "ps -lfg #{$$}"
    begin
      $stderr.puts "WARN: #{Time.now} #{cmd}:"
      $stderr.puts `#{cmd}`
      reaper = Process.detach(pid) # ensure the process is reaped
      # kill -pid responds with EINVAL - invalid argument
      $stderr.puts "WARN: #{Time.now} about to kill -15 #{pid}"
      ret = Process.kill(-15, pid) # SIGTERM
      $stderr.puts "WARN: #{Time.now} sent kill -15 #{pid} ret=#{ret}"
      thrd = reaper.join 30 # allow some time for process to exit
      if thrd # original process has finished
        $stderr.puts "WARN: #{Time.now} process completed #{thrd.value}"
      else # not yet finished, try a stronger kill
        $stderr.puts "WARN: #{Time.now} about to kill -9 #{pid}"
        ret = Process.kill(-9, pid) # SIGKILL
        $stderr.puts "WARN: #{Time.now} sent kill -9 #{pid} ret=#{ret}"
        thrd = reaper.join 5 # allow some time for process to exit
        if thrd
          $stderr.puts "WARN: #{Time.now} process completed #{thrd.value}"
        else
          $stderr.puts "ERROR: #{Time.now} failed to kill -9 #{pid}"
        end
      end
    rescue StandardError => e
      $stderr.puts "WARN: #{Time.now} ret=#{ret} exception: #{e}"
    end
    $stderr.puts "WARN: #{Time.now} #{cmd}:"
    $stderr.puts `#{cmd}`
  ensure
    wout.close unless wout.closed?
    werr.close unless werr.closed?
    # dispose of the read ends of the pipes
    rout.close
    rerr.close
  end
  return stdout, stderr, status
end
#########################################################################
# Main execution begins here
results = {}
podlings = {}
$cache = Cache.new(dir: 'site-scan')
$verbose = ARGV.delete '--verbose'
$saveparse = ARGV.delete '--saveparse'
$skipresourcecheck = ARGV.delete '--noresource'
puts "Started: #{Time.now}" # must agree with site-scan monitor
# USAGE:
# site-scan.rb https://whimsical.apache.org [Whimsy] [whimsy-scan.json] - to scan one project
# site-scan.rb [project-output.json] [podlings-output.json] [projname podlingname ...]
# If additional projname|podlingname arguments are provided, only those sites are scanned
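# With no arguments, all committee and current podling sites are scanned;
# when no output filenames are given, results are printed to stdout as JSON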
if ARGV.first =~ /^https?:\/\/\w/
  # Scan a single URL provided by user
  site = ARGV.shift
  name = ARGV.shift || site[/\/(\w[^.]*)/, 1].capitalize
  output_projects = ARGV.shift
  results[name] = parse(name, site, name)
else
  # Gather output filenames (if any) and scan various projects
  if ARGV.first =~ %r{[./]} # have we a file name?
    output_projects = ARGV.shift
    if ARGV.first =~ %r{[./]} # have we another file name?
      output_podlings = ARGV.shift
    else
      output_podlings = nil
    end
  else
    output_projects = nil
  end
  # Scan committees, including non-pmcs
  ASF::Committee.load_committee_info
  committees = (ASF::Committee.pmcs + ASF::Committee.nonpmcs).uniq
  committees.sort_by {|committee| committee.name}.each do |committee|
    next unless committee.site
    # if more parameters specified, parse only those names
    if ARGV.length > 0
      next unless ARGV.include? committee.name
    end
    results[committee.name] = parse(committee.name, committee.site, committee.display_name)
  end
  # Scan podlings that have a website
  ASF::Podling.list.sort_by(&:name).each do |podling|
    if podling.status == 'current' and podling.podlingStatus[:website]
      # if more parameters specified, parse only those names
      if ARGV.length > 0
        next unless ARGV.include? podling.name
      end
      podlings[podling.name] = parse(podling.name, podling.podlingStatus[:website], podling.display_name)
    end
  end
end
# Output all results
if output_projects
  File.write(output_projects, JSON.pretty_generate(results))
else
  puts JSON.pretty_generate(results)
end
if output_podlings
  File.write(output_podlings, JSON.pretty_generate(podlings))
else
  puts JSON.pretty_generate(podlings)
end
puts "Ended: #{Time.now}" # must agree with site-scan monitor