blob: 41d80cc57e5e54af41108b207270c613ed3edc51 [file] [log] [blame]
#!/usr/bin/env ruby
# Defines partial standards for the Apache website checker
# TODO: better document each check, citing the specific policies
# Encapsulate (most) scans/validations done on website content
module SiteStandards
  extend self

  # Keys used inside each per-check settings hash below
  CHECK_TEXT = 'text' # (optional) Regex of <a ...>Text to scan for</a>, of a.text.downcase.strip
  CHECK_CAPTURE = 'capture' # a_href minimal regex to capture - for license, we capture the link if it points to apache.org somewhere
  CHECK_VALIDATE = 'validate' # a_href detailed regex to expect for compliance; it must point to one of our actual licenses to pass
  CHECK_TYPE = 'type' # true = validation checks href/url; false = checks text node
  CHECK_POLICY = 'policy' # URL to policy statement for this check
  CHECK_DOC = 'doc' # Explanation of what the check is looking for

  # Checks done only for TLPs (i.e. not podlings)
  TLP_CHECKS = {
    'uri' => { # Custom: merely saves uri of site
      CHECK_TEXT => nil,
      CHECK_CAPTURE => nil,
      CHECK_VALIDATE => %r{https?://[^.]+\.apache\.org},
      CHECK_TYPE => true,
      CHECK_POLICY => 'https://www.apache.org/foundation/marks/pmcs#websites',
      CHECK_DOC => 'The homepage for any ProjectName must be served from http://ProjectName.apache.org',
    },
  }

  # Checks done only for Incubator podlings
  PODLING_CHECKS = {
    'uri' => {
      CHECK_TEXT => nil,
      CHECK_CAPTURE => %r{https?://[^.]+\.incubator\.apache\.org},
      CHECK_VALIDATE => %r{https?://[^.]+\.incubator\.apache\.org},
      CHECK_TYPE => true,
      CHECK_POLICY => 'https://www.apache.org/foundation/marks/pmcs#websites',
      CHECK_DOC => 'The homepage for any ProjectName must be served from http://ProjectName(.incubator).apache.org',
    },
    'disclaimer' => { # textnode_check: txt =~ / Incubation is required of all newly accepted projects /
      CHECK_TEXT => %r{Incubation is required of all newly accepted projects},
      CHECK_CAPTURE => %r{Incubation is required of all newly accepted projects},
      CHECK_VALIDATE => %r{Apache \S+( \S+)?( \([Ii]ncubating\))? is an effort undergoing [Ii]ncubation at [Tt]he Apache Software Foundation \(ASF\),? sponsored by the (Apache )?\S+( PMC)?. Incubation is required of all newly accepted projects until a further review indicates that the infrastructure, communications, and decision making process have stabilized in a manner consistent with other successful ASF projects. While incubation status is not necessarily a reflection of the completeness or stability of the code, it does indicate that the project has yet to be fully endorsed by the ASF.},
      CHECK_TYPE => false,
      CHECK_POLICY => 'https://incubator.apache.org/guides/branding.html#disclaimers',
      CHECK_DOC => 'All Apache Incubator Podling sites must contain the incubating disclaimer.',
    },
  }

  # Checks done for all podlings|projects
  # NOTE: the dot in "apache\.org" is escaped in every validation regex so
  # that look-alike hosts (e.g. "apacheXorg") are not accepted as compliant.
  COMMON_CHECKS = {
    'foundation' => { # Custom: a_href =~ ... then custom checking for hover/title text
      CHECK_TEXT => %r{apache|asf|foundation}i,
      CHECK_CAPTURE => %r{^(https?:)?//(www\.)?apache\.org/?$},
      CHECK_VALIDATE => %r{apache|asf|foundation}i,
      CHECK_TYPE => false,
      CHECK_POLICY => 'https://www.apache.org/foundation/marks/pmcs#navigation',
      CHECK_DOC => 'All projects must feature some prominent link back to the main ASF homepage at http://www.apache.org/',
    },
    'events' => { # Custom: a_href.include? 'apache.org/events/' then custom check for img
      CHECK_TEXT => nil,
      CHECK_CAPTURE => %r{apache\.org\/events},
      CHECK_VALIDATE => %r{^https?://.*apache\.org/events/current-event},
      CHECK_TYPE => true,
      CHECK_POLICY => 'https://www.apache.org/events/README.txt',
      CHECK_DOC => 'Projects SHOULD include a link to any current ApacheCon event, as provided by VP, Conferences.',
    },
    'license' => { # link_check a_text =~ /^license$/ and a_href.include? 'apache.org'
      CHECK_TEXT => /^license$/,
      CHECK_CAPTURE => %r{apache\.org},
      CHECK_VALIDATE => %r{^https?://.*apache\.org/licenses/?$},
      CHECK_TYPE => true,
      CHECK_POLICY => 'https://www.apache.org/foundation/marks/pmcs#navigation',
      CHECK_DOC => '"License" should link to: http[s]://www.apache.org/licenses[/]',
    },
    'thanks' => { # link_check a_text =~ /\Athanks[!]?\z/
      CHECK_TEXT => /\Athanks[!]?\z/,
      CHECK_CAPTURE => /\Athanks[!]?\z/,
      CHECK_VALIDATE => %r{^https?://.*apache\.org/foundation/thanks},
      CHECK_TYPE => true,
      CHECK_POLICY => 'https://www.apache.org/foundation/marks/pmcs#navigation',
      CHECK_DOC => '"Thanks" should link to: http://www.apache.org/foundation/thanks.html',
    },
    'security' => { # link_check a_text == 'security'
      CHECK_TEXT => /security/,
      CHECK_CAPTURE => /security/,
      CHECK_VALIDATE => %r{^https?://.*apache\.org/.*[Ss]ecurity},
      CHECK_TYPE => true,
      CHECK_POLICY => 'https://www.apache.org/foundation/marks/pmcs#navigation',
      CHECK_DOC => '"Security" should link to either to a project-specific page [...], or to the main http://www.apache.org/security/ page.',
    },
    'sponsorship' => { # link_check ['sponsorship', 'donate', 'sponsor apache','sponsoring apache'].include? a_text
      CHECK_TEXT => %r{sponsorship|donate|sponsor\sapache|sponsoring\sapache|sponsor},
      CHECK_CAPTURE => %r{sponsorship|donate|sponsor\sapache|sponsoring\sapache|sponsor},
      CHECK_VALIDATE => %r{^https?://.*apache\.org/foundation/sponsorship},
      CHECK_TYPE => true,
      CHECK_POLICY => 'https://www.apache.org/foundation/marks/pmcs#navigation',
      CHECK_DOC => '"Sponsorship", "Sponsor Apache", or "Donate" should link to: http://www.apache.org/foundation/sponsorship.html',
    },
    'trademarks' => { # textnode_check: if (txt =~ /\btrademarks\b/ and not data[:trademarks]) or txt =~/are trademarks of [Tt]he Apache Software/
      CHECK_TEXT => %r{\btrademarks\b},
      CHECK_CAPTURE => %r{\btrademarks\b},
      CHECK_VALIDATE => %r{trademarks of [Tt]he Apache Software Foundation},
      CHECK_TYPE => false,
      CHECK_POLICY => 'https://www.apache.org/foundation/marks/pmcs#attributions',
      CHECK_DOC => 'All project or product homepages must feature a prominent trademark attribution of all applicable Apache trademarks.',
    },
    'copyright' => { # textnode_check: txt =~ /Copyright / or txt =~ /©/
      CHECK_TEXT => %r{((Copyright|©).*apache|apache.*(Copyright|©))}i,
      CHECK_CAPTURE => %r{(Copyright|©)}i,
      CHECK_VALIDATE => %r{((Copyright|©).*apache|apache.*(Copyright|©))}i,
      CHECK_TYPE => false,
      CHECK_POLICY => 'https://www.apache.org/legal/src-headers.html#headers',
      CHECK_DOC => 'All website content SHOULD include a copyright notice for the ASF.',
    },
    'image' => { # Custom: merely looks in IMAGE_DIR for #{id}.*
      CHECK_TEXT => nil,
      CHECK_CAPTURE => nil,
      CHECK_VALIDATE => %r{.},
      CHECK_TYPE => true,
      CHECK_POLICY => 'https://www.apache.org/img/',
      CHECK_DOC => 'Projects SHOULD include a 212px wide copy of their logo in https://www.apache.org/img/ to be included in ASF homepage.',
    },
  }

  # Status labels; also used as the keys of the per-check counts from analyze
  SITE_PASS = 'label-success'
  SITE_WARN = 'label-warning'
  SITE_FAIL = 'label-danger'

  # Determine the color of a given table cell
  # @param analysis overall analysis of the sites (as returned by analyze);
  #   only the third element is used: check name => list of project names
  #   that successfully matched the check
  # @param links hash of the links found for the project in question
  # @param col the column in question (i.e. the name of the check being reported on)
  # @param name the name of the project
  # @return SITE_FAIL if the project has nothing recorded for col;
  #   SITE_WARN if col is an analyzed check and the project is not in its
  #   success list; SITE_PASS otherwise
  def label(analysis, links, col, name)
    return SITE_FAIL unless links[col]

    successes = analysis[2]
    return SITE_WARN if successes.include?(col) && !successes[col].include?(name)

    SITE_PASS
  end

  # Get hash of checks to be done for tlp | podling
  # @param tlp true if project; podling otherwise
  # @return new hash: the type-specific checks merged with COMMON_CHECKS
  def get_checks(tlp = true)
    (tlp ? TLP_CHECKS : PODLING_CHECKS).merge(COMMON_CHECKS)
  end

  # Get filename of check data for tlp | podling
  # @param tlp true if project; podling otherwise
  def get_filename(tlp = true)
    tlp ? 'site-scan.json' : 'pods-scan.json'
  end

  # Get URL to default filename location on server
  # @param is_local true for the path relative to this source file;
  #   false for the public whimsy URL
  def get_url(is_local = true)
    is_local ? '../../../www/public/' : 'https://whimsy.apache.org/public/'
  end

  # Get check data for tlp | podling
  # Uses a local_copy if available; w.a.o/public otherwise
  # NOTE(review): relies on JSON, Net::HTTP, URI and Time#httpdate already
  # being loaded; no require for them is visible in this file - confirm callers do so
  # @param tlp true if project; podling otherwise
  # @return [hash of site data, crawl_time]
  def get_sites(tlp = true)
    filename = get_filename(tlp)
    # String#untaint was removed in Ruby 3.2 (and was a no-op since 2.7),
    # so the expanded path is used as is
    local_copy = File.expand_path("#{get_url(true)}#{filename}", __FILE__)
    if File.exist? local_copy
      crawl_time = File.mtime(local_copy).httpdate # show time in same format as last-mod
      sites = JSON.parse(File.read(local_copy))
    else
      response = Net::HTTP.get_response(URI("#{get_url(false)}#{filename}"))
      crawl_time = response['last-modified']
      sites = JSON.parse(response.body)
    end
    [sites, crawl_time]
  end

  # Analyze data returned from site-scan.rb by using checks[CHECK_VALIDATE] regex
  # If value =~ CHECK_VALIDATE, SITE_PASS
  # If value is present (presumably from CHECK_TEXT|CAPTURE), then SITE_WARN
  # If value not present, SITE_FAIL (i.e. site-scan.rb didn't find it)
  # @param sites hash of site-scan data collected (site name => hash of check name => value)
  # @param checks to apply to sites to determine status
  # @return [overall counts, description of statuses, success listings]
  def analyze(sites, checks)
    success = Hash.new { |h, k| h[k] = Hash.new(&h.default_proc) }
    counts = Hash.new { |h, k| h[k] = Hash.new(&h.default_proc) }
    checks.each do |nam, check_data|
      validate = check_data[CHECK_VALIDATE]
      passing = sites.select do |_, site|
        value = site[nam]
        # Object#=~ no longer exists in Ruby 3.2+; values that cannot be
        # pattern-matched (true, numbers, arrays...) simply do not validate
        value.respond_to?(:=~) && value =~ validate
      end
      success[nam] = passing.keys
      counts[nam][SITE_PASS] = success[nam].count
      counts[nam][SITE_WARN] = 0 # Placeholder so that the keys are ordered PASS, WARN, FAIL
      counts[nam][SITE_FAIL] = sites.count { |_, site| site[nam].nil? }
      counts[nam][SITE_WARN] = sites.size - counts[nam][SITE_PASS] - counts[nam][SITE_FAIL]
    end
    [
      counts, {
        SITE_PASS => '# Sites with links to primary ASF page',
        SITE_WARN => '# Sites with link, but not an expected ASF one',
        SITE_FAIL => '# Sites with no link for this topic'
      }, success
    ]
  end
end