blob: 7e96cbc8f58aede49341ee23a88a334d90720de9 [file] [log] [blame]
#!/usr/bin/env ruby
<<~HEREDOC
Pony poop: simple statistics for Apache Ponymail monthly archives
- Methods to pull down stats.lua JSON structures of monthly archive reports
- Methods to analyze local .json structures with chartable stats
See also: https://ponymail.incubator.apache.org/docs/api
See also: https://lists.apache.org/ngrams.html
HEREDOC
require 'json'
require 'csv'
require 'net/http'
require 'cgi'
require 'optparse'
# Base URL of the Ponymail stats API; callers append "<list>&domain=<domain>&d=<yyyy>-<mm>".
# e.g. board&domain=apache.org&d=2017-04 becomes board-apache-org-201704.json
PONYSTATS = 'https://lists.apache.org/api/stats.lua?list='.freeze
# TODO: Fixup CSV output format to be more flexible, and/or add charting automatically
# Header row of the CSV report; rows are written positionally, so the order
# here must match the order in which analyze() fills each result hash.
CSV_COLS = %w[
  Date TotalEmails TotalInteresting TotalThreads
  Missing Feedback Notice Report Resolution SVNAgenda SVNICLAs
  Person1 Emails1 Person2 Emails2 Person3 Emails3 Person4 Emails4 Person5 Emails5
].freeze
# Non-interesting email subjects from board, keyed by the result column they are counted under
BOARD_REGEX = {
  missing: /\AMissing\s\S+\sBoard/,
  feedback: /\ABoard\sfeedback\son\s20/,
  notice: /\A\[NOTICE\]/i,
  report: /\A\[REPORT\]/i,
  resolution: /\A\[RESOLUTION\]/i,
  svn_agenda: %r{\Aboard: r\d{4,7} - /foundation/board/},
  # The dot is escaped so only the literal file name iclas.txt matches
  svn_iclas: %r{\Aboard: r\d{4,7} - /foundation/officers/iclas\.txt}
}.freeze
# ## ### #### ##### ######
# Analysis functions, scanning stats.lua output JSON
# Determine max depth and total message count of a thread tree
# (only within current data).
#
# threads - a Hash that always includes a 'children' entry, which is either
#           a Hash (empty) or an Array of Hashes of the same shape.
#
# Returns two values: the depth of the deepest branch (a node without
# children has depth 1) and the total number of nodes in the tree.
def analyze_threads(threads)
  children = threads['children']
  # A non-Array 'children' (the empty-Hash form) marks a leaf
  return 1, 1 unless children.is_a?(Array)

  deepest_child = 0
  total = 1
  children.each do |child|
    depth, count = analyze_threads(child)
    deepest_child = depth if depth > deepest_child
    total += count
  end
  # This node adds one level on top of its deepest child
  return deepest_child + 1, total
end
# Analyze a local .json for interesting vs. not interesting board@ subjects
#
# fname         - path of a stats.lua JSON dump to read
# results       - Array; one Hash of per-file statistics is appended to it
#                 (keys are inserted in the column order used for CSV output)
# subject_regex - Hash of column key => Regexp of "not interesting" subjects
# errors        - Array; human-readable error text is appended on failure
#
# Returns the Array of subjects that matched none of the regexes, or nil when
# the file could not be parsed or analyzed (details are then in errors).
def analyze(fname, results, subject_regex, errors)
  f = File.basename(fname)
  begin
    jzon = JSON.parse(File.read(fname))
  rescue StandardError => e
    errors << "Bogosity! parsing #{f} raised #{e.message[0..255]}"
    errors << "\t#{e.backtrace.join("\n\t")}"
    return nil
  end
  begin
    row = {}
    results << row
    emails = jzon['emails']
    row[:date] = f.chomp('.json')
    row[:email] = emails.size
    row[:interesting] = row[:email]
    row[:threads] = jzon['no_threads']
    # One column per pattern; every match is subtracted from the interesting count
    subject_regex.each do |type, regex|
      row[type] = emails.count { |email| email['subject'] =~ regex }
      row[:interesting] -= row[type]
    end
    subjects = emails
               .reject { |email| subject_regex.any? { |_, regex| email['subject'] =~ regex } }
               .map { |email| email['subject'] }
    # Thread depth statistics are not collected yet:
    # max = 0
    # total = 0
    # jzon['thread_struct'].each do |thread|
    #   m, t = analyze_threads(thread)
    #   max = m if m > max
    #   total += t
    # end
    # puts "max/total #{max}/#{total}"
    # row[:maxdepth] = max
    # row[:avgdepth] = total / jzon['no_threads']
    # First five participants whose name is not an address (ignore SVN commit mails)
    people = jzon['participants'].lazy.reject { |person| person['name'] =~ /@/ }.first(5)
    people.each_with_index do |person, index|
      row["Person#{index + 1}"] = person['name']
      row["Emails#{index + 1}"] = person['count']
    end
    subjects
  rescue StandardError => e
    errors << "Bogosity! analyzing #{f} raised #{e.message[0..255]}"
    errors << "\t#{e.backtrace.join("\n\t")}"
    # Explicit nil: otherwise the errors array itself would be returned as "subjects"
    nil
  end
end
# Analyze a set of local .json files downloaded from lists.a.o
#
# dir           - directory holding the "<list>*.json" dumps; all output goes here too
# list          - list name used as the file name prefix
# subject_regex - Hash of column key => Regexp handed through to analyze()
#
# Side effects: writes one "<dump>.txt" subject listing per analyzed file,
# plus "output-<list>.csv" and "output-<list>.json" summaries.
#
# Returns the Array of per-file result hashes (with a trailing hash of
# error messages when any file failed).
def run_analyze(dir, list, subject_regex)
  results = []
  errors = []
  output = File.join(dir.to_s, "output-#{list}")
  # Sorted so rows come out in file name order on every Ruby version
  Dir[File.join(dir.to_s, "#{list}*.json")].sort.each do |fname|
    subjects = analyze(fname, results, subject_regex, errors)
    next unless subjects

    # A message without a subject is listed as an empty line instead of crashing the sort
    subjects = subjects.map(&:to_s)
    # Word boundary keeps subjects such as "Feature: ..." from counting as replies
    responses = subjects.count { |subj| subj =~ /\bRe:/i }
    File.open("#{fname.chomp('.json')}.txt", 'w') do |f|
      f.puts "COUNTS - Replies:#{responses}, New Messages:#{subjects.size - responses}"
      subjects.sort.each { |s| f.puts s.delete("\n") }
    end
  end
  CSV.open("#{output}.csv", 'w', write_headers: true, headers: CSV_COLS) do |csv|
    results.each { |r| csv << r.values }
  end
  # Errors go only into the JSON summary, as one extra trailing entry
  unless errors.empty?
    error_row = {}
    errors.each_with_index { |item, index| error_row["error#{index}"] = item }
    results << error_row
  end
  File.open("#{output}.json", 'w') { |f| f.puts JSON.pretty_generate(results) }
  results
end
# ## ### #### ##### ######
# Download functions: grab monthly stats.lua data as .jsons
# Grab monthly data from lists.a.o - for private lists
#
# dir    - directory the "<list>-apache-org-<yyyy><mm>.json" files are written to
# list   - list name to request
# years  - collection of year strings
# months - collection of two-digit month strings
# cookie - value sent as the "ponymail" cookie with every request
#
# One request is made per year/month pair. Failures (non-200 status,
# unparseable body) are reported on stdout and that month is skipped, so a
# single bad response neither aborts the run nor leaves an empty file behind.
def get_private_from_archive(dir, list, years, months, cookie)
  cookieval = "ponymail=#{cookie}"
  years.each do |y|
    months.each do |m|
      # NOTE(review): the public download path requests "<subdomain>.apache.org";
      # confirm that "apache-org" (hyphen) is the domain value the server expects here.
      uri = URI("#{PONYSTATS}#{list}&domain=apache-org&d=#{y}-#{m}")
      http = Net::HTTP.new(uri.host, uri.port)
      http.use_ssl = true
      request = Net::HTTP::Get.new(uri.request_uri)
      request['Cookie'] = cookieval
      r = http.request(request)
      unless r.code =~ /200/
        puts "Double Bogus! #{r.code} for #{uri.request_uri}"
        next
      end
      # Parse before opening the output file so a bad body cannot truncate or create it
      begin
        jzon = JSON.parse(r.body)
      rescue JSON::ParserError => e
        puts "Bogosity: Parser error on #{r.code} for #{uri.request_uri}: #{e.message[0..255]}"
        next
      end
      File.open(File.join(dir.to_s, "#{list}-apache-org-#{y}#{m}.json"), 'w') do |f|
        begin
          f.puts JSON.pretty_generate(jzon)
        rescue JSON::GeneratorError
          puts "Bogosity: Generator error on #{r.code} for #{uri.request_uri}"
          f.puts jzon
        end
      end
    end
  end
end
# ## ### #### ##### ######
# Grab monthly data from lists.a.o - only for public lists
# fetch uri, following redirects: tools/site-scan.rb
#
# uri   - String URL to GET
# limit - maximum number of requests to make while following redirects
#
# Returns three values: the URI finally requested, the request object and
# the response object.
# Raises ArgumentError when the redirect limit is used up.
def fetch(uri, limit = 10)
  # Checked before any network access so a redirect cycle cannot recurse forever
  raise ArgumentError, 'too many HTTP redirects' if limit <= 0

  uri = URI.parse(uri)
  request = Net::HTTP::Get.new(uri.request_uri)
  response = Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == 'https') do |http|
    http.request(request)
  end
  location = response['location']
  if response.code =~ /\A3\d\d/ && location
    # Location may be relative; resolve it against the URL just requested
    return fetch(URI.join(uri.to_s, location).to_s, limit - 1)
  end
  # A 3xx without a Location header is handed back to the caller as-is
  return uri, request, response
end
# Grab one month of data from lists.a.o - for public lists
#
# Requests the stats for <list>@<subdomain>.apache.org for the given year and
# month via fetch, parses the body as JSON, and writes it pretty-printed to
# "<dir>/<list>-<subdomain>-apache-org-<year>-<month>.json".
def get_public_from_archive(dir, list, subdomain, year, month)
  stats_url = "#{PONYSTATS}#{list}&domain=#{subdomain}.apache.org&d=#{year}-#{month}"
  # Only the response is needed; the final URI and request are ignored
  _uri, _request, response = fetch(stats_url)
  pmails = JSON.parse(response.body)
  target = File.join(dir.to_s, "#{list}-#{subdomain}-apache-org-#{year}-#{month}.json")
  File.open(target, 'w') { |f| f.puts JSON.pretty_generate(pmails) }
end
# Grab data for every year/month combination of a public list, one
# get_public_from_archive call per pair (years outermost, months innermost).
def get_all_public(dir, list, subdomain, years, months)
  pull_month = ->(year, month) { get_public_from_archive(dir, list, subdomain, year, month) }
  years.each do |year|
    months.each { |month| pull_month.call(year, month) }
  end
end
# ## ### #### ##### ######
# Check command line options (consumed from ARGV)
# TODO: this assumes you correctly use -c and -s
#
# Returns a Hash with any of :dir, :list, :cookie, :subdomain, :pull set,
# depending on the options given.
# Raises ArgumentError when -d names something that is not a directory.
# Prints the parse error and exits with status 1 on an invalid option.
def optparse
  options = {}
  OptionParser.new do |opts|
    opts.on('-h') { puts opts; exit }
    opts.on(:REQUIRED, '-dDIRECTORY', '--directory DIRECTORY', 'Local directory to dump/find .json files (required)') do |d|
      raise ArgumentError, "-d #{d} is not a valid directory" unless File.directory?(d)

      options[:dir] = d
    end
    opts.on(:REQUIRED, '-lLISTNAME', '--list LISTNAME', 'Root listname to download stats archive from (required; board or trademarks or...)') do |l|
      # Accept "board@" as well as "board"
      options[:list] = l.chomp('@')
    end
    opts.on('-cCOOKIE', '--cookie COOKIE', 'For private lists, your ponymail logged-in cookie value') do |c|
      options[:cookie] = c
    end
    # The long form must differ from --list; otherwise it overrides the list name option
    opts.on('-sSUBDOMAIN', '--subdomain SUBDOMAIN', 'Root @ subdomain .apache.org (only if project list; hadoop or community or...) to download stats archive from') do |s|
      # NOTE(review): chomp only strips a literal trailing "@." - confirm that is the intent
      options[:subdomain] = s.chomp('@.')
    end
    opts.on('-p', '--pull', 'Pull down stats into -d dir (otherwise, analyzes existing stats in dir)') do
      options[:pull] = true
    end
    begin
      opts.parse!
    rescue OptionParser::ParseError => e
      $stderr.puts e
      $stderr.puts "try -h for valid options, or see code"
      exit 1
    end
  end
  options
end
# ## ### #### ##### ######
# Main method for command line use
# With -p: pull monthly stats for a private list into the -d directory
# (needs -c COOKIE); otherwise analyze the .json files already in -d.
if __FILE__ == $PROGRAM_NAME
  months = %w( 01 02 03 04 05 06 07 08 09 10 11 12 )
  years = %w( 2010 2011 2012 2013 2014 2015 2016 2017 )
  options = optparse
  options[:list] ||= 'board'
  # Without -d every path would silently resolve against the filesystem root
  raise ArgumentError, "Must have a -d DIRECTORY to read/write .json files" unless options[:dir]
  if options[:pull]
    # TODO make months/years settable
    raise ArgumentError, "Must have a -c COOKIE to -p pull private archives" unless options[:cookie]
    puts "BEGIN: Pulling down JSON to #{options[:dir]} of list: #{options[:list]} @ #{options[:subdomain]} "
    get_private_from_archive options[:dir], options[:list], years, months, options[:cookie]
  else
    puts "BEGIN: Analyzing local JSONs in #{options[:dir]} of list: #{options[:list]}"
    run_analyze options[:dir], options[:list], BOARD_REGEX
  end
  puts "END: Thanks for running ponypoop - see results in #{options[:dir]}"
end