tools/ponypoop.rb - whimsy - Git at Google

 #!/usr/bin/env ruby
 # Pony poop: utilities for analyzing data from Apache Ponymail APIs
 # - Analyze stats.lua JSON output for subject/author analysis
 # - Analyze mbox.lua mbox files for author/list/lines written analysis
 #
 # See also: https://ponymail.incubator.apache.org/docs/api
 # See also: https://lists.apache.org/ngrams.html
 require 'json'
 require 'csv'
 require 'net/http'
 require 'cgi'
 require 'optparse'
 require_relative 'ponyapi'

 # TODO: Fixup CSV output format to be more flexible, and/or add charting automatically
 CSV_COLS = %w( Date TotalEmails TotalInteresting TotalThreads Missing Feedback Notice Report Resolution SVNAgenda SVNICLAs Person1 Emails1 Person2 Emails2 Person3 Emails3 Person4 Emails4 Person5 Emails5 )
 BOARD_REGEX = { # Non-interesting email subjects from board # TODO add features for other lists
   missing: /\AMissing\s\S+\sBoard/,
   feedback: /\ABoard\sfeedback\son\s20/,
   notice: /\A\[NOTICE\]/i,
   report: /\A\[REPORT\]/i,
   resolution: /\A\[RESOLUTION\]/i,
   svn_agenda: %r{\Aboard: r\d{4,8} - /foundation/board/},
   svn_iclas: %r{\Aboard: r\d{4,8} - /foundation/officers/iclas.txt}
 }
 MONTHS = %w( 1 2 3 4 5 6 7 8 9 10 11 12 )
 YEARS_DEFAULT = %w( 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020)

 # ## ### #### ##### ######
 # Analysis functions, scanning stats.lua output JSON
 # Determine max, total thread depth (only within current data)
 def analyze_threads(threads)
   # We are a hash that always includes a 'children' entry
   #  which is either a hash (empty) or an array of hashes
   # TODO This code doesn't actually get correct depths
   max = 1
   total = 1
   if threads['children'].is_a?(Array) then
     max += 1
     threads['children'].each do |thread|
       m, t = analyze_threads(thread)
       total += t
     end
     p "Hsh: #{threads.class} - #{threads.size} at #{threads['epoch']} #{max}/#{total}"
   end

   return max, total
 end

 # Analyze a local .json for interesting vs. not interesting board@ subjects
 def analyze_stats(fname, results, subject_regex, errors)
   begin
     f = File.basename(fname)
     begin
       jzon = JSON.parse(File.read(fname))
     rescue Exception => e
       errors << "Bogosity! parsing #{f} raised #{e.message[0..255]}"
       errors << "\t#{e.backtrace.join("\n\t")}"
       return
     end
     begin
       results << {}
       subjects = []
       results.last[:date] = f.chomp('.json')
       results.last[:email] = jzon['emails'].size
       results.last[:interesting] = results.last[:email]
       results.last[:threads] = jzon['no_threads']
       subject_regex.each do |t, s|
         results.last[t] = jzon['emails'].select{ |email| email['subject'] =~ s }.size
         results.last[:interesting] -= results.last[t] if subject_regex.keys.include? t
       end
       # TODO: there's a more rubyish way to combine these loops
       subject_regex.each do |t, s|
         jzon['emails'].reject!{ |email| email['subject'] =~ s }
       end
       jzon['emails'].each do |email|
         subjects << email['subject']
       end
       # max = 0
       # total = 0
       # jzon['thread_struct'].each do |thread|
       #   m, t = analyze_threads(thread)
       #   max = m if m > max
       #   total += t
       # end
       # puts "max/total #{max}/#{total}"
       # results.last[:maxdepth] = max
       # results.last[:avgdepth] = total / jzon['no_threads']
       ctr = 1
       jzon['participants'].each do |participant|
         unless participant['name'] =~ /@/ # Ignore SVN commit mails
           results.last["Person#{ctr}"] = participant['name']
           results.last["Emails#{ctr}"] = participant['count']
           ctr += 1
           break if ctr > 5
         end
       end
       return subjects
     rescue Exception => e
       errors << "Bogosity! analyzing #{f} raised #{e.message[0..255]}"
       errors << "\t#{e.backtrace.join("\n\t")}"
     end
   end
 end

 # Analyze a set of local .json files downloaded from lists.a.o
 def run_analyze_stats(dir, list, subject_regex)
   results = []
   errors = []
   subjects = []
   output = File.join("#{dir}", "output-#{list}")
   Dir[File.join("#{dir}", "#{list}*.json")].each do |fname|
     subjects = analyze_stats(fname, results, subject_regex, errors)
     if subjects
       responses = subjects.select {|subj| subj =~ /Re:/i }.size
       File.open("#{fname.chomp('.json')}.txt", "w") do |f|
         f.puts "COUNTS - Replies:#{responses}, New Messages:#{subjects.size - responses}"
         subjects.sort.each do |s|
           f.puts s.delete("\n")
         end
       end
     end
   end
   CSV.open("#{output}.csv",'w', :write_headers=> true, :headers => CSV_COLS) do |csv|
     results.each do |r|
       csv << r.values
     end
   end
   if errors.size > 0
     results << {}
     errors.each_with_index do |item, index|
       results.last["error#{index}"] = item
     end
   end
   File.open("#{output}.json", "w") do |f|
     f.puts JSON.pretty_generate(results)
   end

   results
 end


 # ## ### #### ##### ######
 # Check options and call needed methods
 # TODO: Simplify and allow both:
 # - Downloading either stats or mbox
 # - Analyzing either stats or mbox
 def optparse
   options = {}
   OptionParser.new do |opts|
     opts.on('-h') { puts opts; exit }

     opts.on('-dDIRECTORY', '--directory DIRECTORY', 'Local directory to dump/find .json files (default: .)') do |d|
       if File.directory?(d)
         options[:dir] = d
       else
         raise ArgumentError, "-d #{d} is not a valid directory"
       end
     end
     opts.on(:REQUIRED, '-lLISTNAME', '--list LISTNAME', 'Root listname to download stats archive from (required; board or trademarks or...)') do |l|
       options[:list] = l.chomp('@')
     end

     opts.on('-cCOOKIE', '--cookie COOKIE', 'For private lists REQUIRED, your ponymail logged-in cookie value') do |c|
       options[:cookie] = c
     end
     opts.on('-sSUBDOMAIN', '--subdomain SUBDOMAIN', 'Root @ subdomain .apache.org (only if project list; hadoop or community or...) to download stats archive from') do |s|
       options[:subdomain] = s.chomp('@.')
     end

     opts.on('-p', '--pull', 'Pull down stats JSON files into -d dir (otherwise, default analyzes existing stats JSON in dir)') do |p|
       options[:pull] = true
     end
     opts.on('-m', '--mbox', 'Pull down mbox files into -d dir') do |p|
       options[:mbox] = true
     end

     opts.on('-yYEAR', '--year YEAR', 'Only pull down single year, instead of 2010 thru now') do |y|
       options[:year] = [ y ]
     end

     begin
       opts.parse!
       options[:dir] = '.' if options[:dir].nil?
       options[:year] = YEARS_DEFAULT if options[:year].nil?
       raise ArgumentError, 'You must supply an -l LISTNAME to operate on' if options[:list].nil?
     rescue StandardError => e
       $stderr.puts "#{e.message}; try -h for valid options, or see code"
       exit 1
     end
   end

   return options
 end

 # ## ### #### ##### ######
 # Main method for command line use
 if __FILE__ == $PROGRAM_NAME
   options = optparse
   if options[:pull]
     puts "BEGIN: Pulling down stats JSONs in #{options[:dir]} of list: #{options[:list]}@#{options[:subdomain]} for years: #{options[:year]}"
     PonyAPI::get_pony_stats_many options[:dir], options[:list], options[:subdomain], options[:year], MONTHS, options[:cookie]
   elsif options[:mbox]
     puts "BEGIN: Pulling down mboxes in #{options[:dir]} of list: #{options[:list]}@#{options[:subdomain]} for years: #{options[:year]}"
     PonyAPI::get_pony_mbox_many options[:dir], options[:list], options[:subdomain], options[:year], MONTHS, options[:cookie]
   else
     puts "BEGIN: Analyzing previously downloaded stats #{File.join(options[:dir], '*-stats.json')} of list: #{options[:list]}"
     run_analyze_stats options[:dir], options[:list], 'board'.eql?(options[:list]) ? BOARD_REGEX : {}
   end
   puts "END: Thanks for running ponypoop - see results in #{options[:dir]}"
 end
	#!/usr/bin/env ruby
	# Pony poop: utilities for analyzing data from Apache Ponymail APIs
	# - Analyze stats.lua JSON output for subject/author analysis
	# - Analyze mbox.lua mbox files for author/list/lines written analysis
	#
	# See also: https://ponymail.incubator.apache.org/docs/api
	# See also: https://lists.apache.org/ngrams.html
	require 'json'
	require 'csv'
	require 'net/http'
	require 'cgi'
	require 'optparse'
	require_relative 'ponyapi'

	# TODO: Fixup CSV output format to be more flexible, and/or add charting automatically
	CSV_COLS = %w( Date TotalEmails TotalInteresting TotalThreads Missing Feedback Notice Report Resolution SVNAgenda SVNICLAs Person1 Emails1 Person2 Emails2 Person3 Emails3 Person4 Emails4 Person5 Emails5 )
	BOARD_REGEX = { # Non-interesting email subjects from board # TODO add features for other lists
	missing: /\AMissing\s\S+\sBoard/,
	feedback: /\ABoard\sfeedback\son\s20/,
	notice: /\A\[NOTICE\]/i,
	report: /\A\[REPORT\]/i,
	resolution: /\A\[RESOLUTION\]/i,
	svn_agenda: %r{\Aboard: r\d{4,8} - /foundation/board/},
	svn_iclas: %r{\Aboard: r\d{4,8} - /foundation/officers/iclas.txt}
	}
	MONTHS = %w( 1 2 3 4 5 6 7 8 9 10 11 12 )
	YEARS_DEFAULT = %w( 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020)

	# ## ### #### ##### ######
	# Analysis functions, scanning stats.lua output JSON
	# Determine max, total thread depth (only within current data)
	def analyze_threads(threads)
	# We are a hash that always includes a 'children' entry
	# which is either a hash (empty) or an array of hashes
	# TODO This code doesn't actually get correct depths
	max = 1
	total = 1
	if threads['children'].is_a?(Array) then
	max += 1
	threads['children'].each do \|thread\|
	m, t = analyze_threads(thread)
	total += t
	end
	p "Hsh: #{threads.class} - #{threads.size} at #{threads['epoch']} #{max}/#{total}"
	end

	return max, total
	end

	# Analyze a local .json for interesting vs. not interesting board@ subjects
	def analyze_stats(fname, results, subject_regex, errors)
	begin
	f = File.basename(fname)
	begin
	jzon = JSON.parse(File.read(fname))
	rescue Exception => e
	errors << "Bogosity! parsing #{f} raised #{e.message[0..255]}"
	errors << "\t#{e.backtrace.join("\n\t")}"
	return
	end
	begin
	results << {}
	subjects = []
	results.last[:date] = f.chomp('.json')
	results.last[:email] = jzon['emails'].size
	results.last[:interesting] = results.last[:email]
	results.last[:threads] = jzon['no_threads']
	subject_regex.each do \|t, s\|
	results.last[t] = jzon['emails'].select{ \|email\| email['subject'] =~ s }.size
	results.last[:interesting] -= results.last[t] if subject_regex.keys.include? t
	end
	# TODO: there's a more rubyish way to combine these loops
	subject_regex.each do \|t, s\|
	jzon['emails'].reject!{ \|email\| email['subject'] =~ s }
	end
	jzon['emails'].each do \|email\|
	subjects << email['subject']
	end
	# max = 0
	# total = 0
	# jzon['thread_struct'].each do \|thread\|
	# m, t = analyze_threads(thread)
	# max = m if m > max
	# total += t
	# end
	# puts "max/total #{max}/#{total}"
	# results.last[:maxdepth] = max
	# results.last[:avgdepth] = total / jzon['no_threads']
	ctr = 1
	jzon['participants'].each do \|participant\|
	unless participant['name'] =~ /@/ # Ignore SVN commit mails
	results.last["Person#{ctr}"] = participant['name']
	results.last["Emails#{ctr}"] = participant['count']
	ctr += 1
	break if ctr > 5
	end
	end
	return subjects
	rescue Exception => e
	errors << "Bogosity! analyzing #{f} raised #{e.message[0..255]}"
	errors << "\t#{e.backtrace.join("\n\t")}"
	end
	end
	end

	# Analyze a set of local .json files downloaded from lists.a.o
	def run_analyze_stats(dir, list, subject_regex)
	results = []
	errors = []
	subjects = []
	output = File.join("#{dir}", "output-#{list}")
	Dir[File.join("#{dir}", "#{list}*.json")].each do \|fname\|
	subjects = analyze_stats(fname, results, subject_regex, errors)
	if subjects
	responses = subjects.select {\|subj\| subj =~ /Re:/i }.size
	File.open("#{fname.chomp('.json')}.txt", "w") do \|f\|
	f.puts "COUNTS - Replies:#{responses}, New Messages:#{subjects.size - responses}"
	subjects.sort.each do \|s\|
	f.puts s.delete("\n")
	end
	end
	end
	end
	CSV.open("#{output}.csv",'w', :write_headers=> true, :headers => CSV_COLS) do \|csv\|
	results.each do \|r\|
	csv << r.values
	end
	end
	if errors.size > 0
	results << {}
	errors.each_with_index do \|item, index\|
	results.last["error#{index}"] = item
	end
	end
	File.open("#{output}.json", "w") do \|f\|
	f.puts JSON.pretty_generate(results)
	end

	results
	end


	# ## ### #### ##### ######
	# Check options and call needed methods
	# TODO: Simplify and allow both:
	# - Downloading either stats or mbox
	# - Analyzing either stats or mbox
	def optparse
	options = {}
	OptionParser.new do \|opts\|
	opts.on('-h') { puts opts; exit }

	opts.on('-dDIRECTORY', '--directory DIRECTORY', 'Local directory to dump/find .json files (default: .)') do \|d\|
	if File.directory?(d)
	options[:dir] = d
	else
	raise ArgumentError, "-d #{d} is not a valid directory"
	end
	end
	opts.on(:REQUIRED, '-lLISTNAME', '--list LISTNAME', 'Root listname to download stats archive from (required; board or trademarks or...)') do \|l\|
	options[:list] = l.chomp('@')
	end

	opts.on('-cCOOKIE', '--cookie COOKIE', 'For private lists REQUIRED, your ponymail logged-in cookie value') do \|c\|
	options[:cookie] = c
	end
	opts.on('-sSUBDOMAIN', '--subdomain SUBDOMAIN', 'Root @ subdomain .apache.org (only if project list; hadoop or community or...) to download stats archive from') do \|s\|
	options[:subdomain] = s.chomp('@.')
	end

	opts.on('-p', '--pull', 'Pull down stats JSON files into -d dir (otherwise, default analyzes existing stats JSON in dir)') do \|p\|
	options[:pull] = true
	end
	opts.on('-m', '--mbox', 'Pull down mbox files into -d dir') do \|p\|
	options[:mbox] = true
	end

	opts.on('-yYEAR', '--year YEAR', 'Only pull down single year, instead of 2010 thru now') do \|y\|
	options[:year] = [ y ]
	end

	begin
	opts.parse!
	options[:dir] = '.' if options[:dir].nil?
	options[:year] = YEARS_DEFAULT if options[:year].nil?
	raise ArgumentError, 'You must supply an -l LISTNAME to operate on' if options[:list].nil?
	rescue StandardError => e
	$stderr.puts "#{e.message}; try -h for valid options, or see code"
	exit 1
	end
	end

	return options
	end

	# ## ### #### ##### ######
	# Main method for command line use
	if __FILE__ == $PROGRAM_NAME
	options = optparse
	if options[:pull]
	puts "BEGIN: Pulling down stats JSONs in #{options[:dir]} of list: #{options[:list]}@#{options[:subdomain]} for years: #{options[:year]}"
	PonyAPI::get_pony_stats_many options[:dir], options[:list], options[:subdomain], options[:year], MONTHS, options[:cookie]
	elsif options[:mbox]
	puts "BEGIN: Pulling down mboxes in #{options[:dir]} of list: #{options[:list]}@#{options[:subdomain]} for years: #{options[:year]}"
	PonyAPI::get_pony_mbox_many options[:dir], options[:list], options[:subdomain], options[:year], MONTHS, options[:cookie]
	else
	puts "BEGIN: Analyzing previously downloaded stats #{File.join(options[:dir], '*-stats.json')} of list: #{options[:list]}"
	run_analyze_stats options[:dir], options[:list], 'board'.eql?(options[:list]) ? BOARD_REGEX : {}
	end
	puts "END: Thanks for running ponypoop - see results in #{options[:dir]}"
	end