blob: 8be7e184a616c28b6f4da7095e205f00ea858e1c [file] [log] [blame]
#!/usr/bin/env ruby
# Pony poop: utilities for analyzing data from Apache Ponymail APIs
# - Analyze stats.lua JSON output for subject/author analysis
# - Analyze mbox.lua mbox files for author/list/lines written analysis
#
# See also: https://ponymail.incubator.apache.org/docs/api
# See also: https://lists.apache.org/ngrams.html
require 'json'
require 'csv'
require 'net/http'
require 'cgi'
require 'optparse'
require_relative 'ponyapi'
# TODO: Fixup CSV output format to be more flexible, and/or add charting automatically
# Header row written at the top of the CSV summary output.
CSV_COLS = %w[
  Date TotalEmails TotalInteresting TotalThreads
  Missing Feedback Notice Report Resolution SVNAgenda SVNICLAs
  Person1 Emails1 Person2 Emails2 Person3 Emails3 Person4 Emails4 Person5 Emails5
]
# Non-interesting email subjects from board, keyed by the result entry that counts them
# TODO add features for other lists
BOARD_REGEX = {
  missing: /\AMissing\s\S+\sBoard/,
  feedback: /\ABoard\sfeedback\son\s20/,
  notice: /\A\[NOTICE\]/i,
  report: /\A\[REPORT\]/i,
  resolution: /\A\[RESOLUTION\]/i,
  svn_agenda: %r{\Aboard: r\d{4,8} - /foundation/board/},
  # The dot is escaped so only the literal file name iclas.txt matches.
  svn_iclas: %r{\Aboard: r\d{4,8} - /foundation/officers/iclas\.txt}
}
# Month numbers, as strings, requested for each year.
MONTHS = %w[1 2 3 4 5 6 7 8 9 10 11 12]
# Default years to pull: 2010 through the current year, as strings.
# Computed at load time so the default keeps matching the "2010 thru now"
# promise in the -y help text; never shorter than the historical 2010..2020.
YEARS_DEFAULT = (2010..[2020, Time.now.year].max).map(&:to_s)
# ## ### #### ##### ######
# Analysis functions, scanning stats.lua output JSON
# Determine max depth and total node count of one thread tree
# (only within current data).
#
# threads - a hash that always includes a 'children' entry, which is either
#           a hash (empty) or an array of hashes shaped like threads itself
#
# Returns [max, total]:
#   max   - number of levels on the longest path from this node down to a
#           leaf (1 when the node has no children)
#   total - number of nodes in the tree, including this one
def analyze_threads(threads)
  children = threads['children']
  # Anything other than an array (i.e. the empty hash) means no replies.
  return [1, 1] unless children.is_a?(Array)

  deepest_child = 0
  total = 1
  children.each do |thread|
    depth, count = analyze_threads(thread)
    # Track the deepest subtree; previously the child's depth was discarded,
    # so max could never exceed 2.
    deepest_child = depth if depth > deepest_child
    total += count
  end
  max = deepest_child + 1
  # Debug trace, emitted once per node that has an array of children.
  p "Hsh: #{threads.class} - #{threads.size} at #{threads['epoch']} #{max}/#{total}"
  [max, total]
end
# Analyze a local .json for interesting vs. not interesting board@ subjects
#
# fname         - path of a stats JSON file; its basename minus '.json'
#                 becomes the :date of the result row
# results       - array; one summary hash is appended per file that was
#                 analyzed successfully (left untouched on failure, so no
#                 half-filled rows end up in the output)
# subject_regex - hash of result key => Regexp matching non-interesting subjects
# errors        - array; a message and a backtrace string are appended when
#                 reading, parsing or analyzing the file raises
#
# Returns the array of subjects that matched none of subject_regex, or nil
# when the file could not be read, parsed or analyzed.
def analyze_stats(fname, results, subject_regex, errors)
  f = File.basename(fname)
  begin
    jzon = JSON.parse(File.read(fname))
  rescue StandardError => e
    errors << "Bogosity! parsing #{f} raised #{e.message[0..255]}"
    errors << "\t#{e.backtrace.join("\n\t")}"
    return nil
  end
  begin
    emails = jzon['emails']
    # Key insertion order matters: callers write the row's values positionally.
    row = {
      date: f.chomp('.json'),
      email: emails.size,
      interesting: emails.size,
      threads: jzon['no_threads']
    }
    subject_regex.each do |type, pattern|
      row[type] = emails.count { |email| email['subject'] =~ pattern }
      row[:interesting] -= row[type]
    end
    subjects = emails
               .reject { |email| subject_regex.any? { |_type, pattern| email['subject'] =~ pattern } }
               .map { |email| email['subject'] }
    # Record the first five participants, in the order given, whose name has no '@'.
    named = jzon['participants'].reject { |participant| participant['name'] =~ /@/ } # Ignore SVN commit mails
    named.first(5).each.with_index(1) do |participant, rank|
      row["Person#{rank}"] = participant['name']
      row["Emails#{rank}"] = participant['count']
    end
    results << row
    subjects
  rescue StandardError => e
    errors << "Bogosity! analyzing #{f} raised #{e.message[0..255]}"
    errors << "\t#{e.backtrace.join("\n\t")}"
    # Explicit nil: the value of `errors << ...` must not leak out as a subject list.
    nil
  end
end
# Analyze a set of local .json files downloaded from lists.a.o
#
# dir           - directory searched for "<list>*.json" inputs; every output
#                 file is written there as well
# list          - list name, used as the input filename prefix and in the
#                 output-<list> file names
# subject_regex - hash handed through unchanged to analyze_stats
#
# For each input that analyze_stats returns subjects for, a sibling .txt is
# written holding the reply/new-message counts followed by the sorted subjects.
# Afterwards output-<list>.csv and output-<list>.json are written.
# NOTE: CSV rows are each result's values in hash insertion order, placed
# positionally under the CSV_COLS headers.
#
# Returns the results array; when errors were collected it ends with one
# extra hash of "errorN" => message entries.
def run_analyze_stats(dir, list, subject_regex)
  results = []
  errors = []
  base = dir.to_s
  output = File.join(base, "output-#{list}")

  Dir.glob(File.join(base, "#{list}*.json")).each do |json_path|
    subjects = analyze_stats(json_path, results, subject_regex, errors)
    next unless subjects

    replies = subjects.count { |subject| subject =~ /Re:/i }
    File.open("#{json_path.chomp('.json')}.txt", 'w') do |txt|
      txt.puts "COUNTS - Replies:#{replies}, New Messages:#{subjects.size - replies}"
      subjects.sort.each { |subject| txt.puts subject.delete("\n") }
    end
  end

  CSV.open("#{output}.csv", 'w', write_headers: true, headers: CSV_COLS) do |csv|
    results.each { |row| csv << row.values }
  end

  # Errors ride along in the JSON output (and return value) as a final entry.
  unless errors.empty?
    results << errors.each_with_index.map { |message, index| ["error#{index}", message] }.to_h
  end

  File.write("#{output}.json", "#{JSON.pretty_generate(results)}\n")
  results
end
# ## ### #### ##### ######
# Check options and call needed methods
# TODO: Simplify and allow both:
# - Downloading either stats or mbox
# - Analyzing either stats or mbox
#
# Parses (and consumes) ARGV. Returns a hash that always has :dir, :year and
# :list, plus :cookie, :subdomain, :pull and :mbox when those were given.
# Any parse or validation error is printed to $stderr and exits with status 1.
def optparse
  options = {}
  OptionParser.new do |parser|
    parser.on('-h') do
      puts parser
      exit
    end
    parser.on('-dDIRECTORY', '--directory DIRECTORY', 'Local directory to dump/find .json files (default: .)') do |directory|
      raise ArgumentError, "-d #{directory} is not a valid directory" unless File.directory?(directory)

      options[:dir] = directory
    end
    parser.on(:REQUIRED, '-lLISTNAME', '--list LISTNAME', 'Root listname to download stats archive from (required; board or trademarks or...)') do |listname|
      options[:list] = listname.chomp('@')
    end
    parser.on('-cCOOKIE', '--cookie COOKIE', 'For private lists REQUIRED, your ponymail logged-in cookie value') do |cookie|
      options[:cookie] = cookie
    end
    parser.on('-sSUBDOMAIN', '--subdomain SUBDOMAIN', 'Root @ subdomain .apache.org (only if project list; hadoop or community or...) to download stats archive from') do |subdomain|
      options[:subdomain] = subdomain.chomp('@.')
    end
    parser.on('-p', '--pull', 'Pull down stats JSON files into -d dir (otherwise, default analyzes existing stats JSON in dir)') do
      options[:pull] = true
    end
    parser.on('-m', '--mbox', 'Pull down mbox files into -d dir') do
      options[:mbox] = true
    end
    parser.on('-yYEAR', '--year YEAR', 'Only pull down single year, instead of 2010 thru now') do |year|
      options[:year] = [year]
    end

    begin
      parser.parse!
      # Fill in defaults only after parsing, then insist on a list name.
      options[:dir] ||= '.'
      options[:year] ||= YEARS_DEFAULT
      raise ArgumentError, 'You must supply an -l LISTNAME to operate on' unless options[:list]
    rescue StandardError => e
      $stderr.puts "#{e.message}; try -h for valid options, or see code"
      exit 1
    end
  end
  options
end
# ## ### #### ##### ######
# Main method for command line use
# Runs exactly one action: --pull takes precedence over --mbox; with neither
# flag, previously downloaded stats JSON in the directory is analyzed.
if __FILE__ == $PROGRAM_NAME
  options = optparse
  dir, list, subdomain, years, cookie = options.values_at(:dir, :list, :subdomain, :year, :cookie)
  if options[:pull]
    puts "BEGIN: Pulling down stats JSONs in #{dir} of list: #{list}@#{subdomain} for years: #{years}"
    PonyAPI.get_pony_stats_many(dir, list, subdomain, years, MONTHS, cookie)
  elsif options[:mbox]
    puts "BEGIN: Pulling down mboxes in #{dir} of list: #{list}@#{subdomain} for years: #{years}"
    PonyAPI.get_pony_mbox_many(dir, list, subdomain, years, MONTHS, cookie)
  else
    puts "BEGIN: Analyzing previously downloaded stats #{File.join(dir, '*-stats.json')} of list: #{list}"
    # Only the board list has subject patterns defined.
    subject_regex = list == 'board' ? BOARD_REGEX : {}
    run_analyze_stats(dir, list, subject_regex)
  end
  puts "END: Thanks for running ponypoop - see results in #{dir}"
end