tools/mboxhdr2csv.rb - whimsy - Git at Google

 #!/usr/bin/env ruby
 # Analyze mbox files (downloaded by PonyAPI) for general statistics into CSV
 # - Per list messages per month over time (PMOT)
 # - Count messages group by list -> graph months as time
 # - Per list contentlines per lists PMOT
 # - Per user statistics
 # Count lines of text content in mail body, roughly attempting to
 #   count just new content (not automated, not > replies)
 # Attempt to normalize/map email addresses to committer/member status

 $LOAD_PATH.unshift '/srv/whimsy/lib'
 require 'whimsy/asf'
 require 'mail'
 require 'csv'
 require 'stringio'
 require 'zlib'
 require 'json'
 require 'date'
 require 'optparse'

 # Various utility functions/data for mailing list analysis
 module MailUtils
   extend self
   # Status values stored under :committer by find_who_from
   MEMBER = 'member'
   # Note: COMMITTER is also used by get_mails_month as the string key for the status field
   COMMITTER = 'committer'
   COUNSEL = 'counsel'
   # Suffix removed from From: values before any matching or lookup (see find_who_from)
   INVALID = '.INVALID'
   # String keys of the per-message hashes built by get_mails_month
   DATE = 'date'
   FROM = 'from'
   WHO = 'who'
   AVAILID = 'id'
   SUBJECT = 'subject'
   # String keys of the per-month hash returned by get_mails_month
   # (TOOLS is also the per-message key holding the matched non-discussion type)
   TOOLS = 'tools'
   MAILS = 'mails'
   TOOLCOUNT = 'toolcount'
   MAILCOUNT = 'mailcount'

   # Subject regexes that are non-discussion oriented
   # Analysis: don't bother with content lines in these messages,
   #   because most of the content is tool-generated
   # Outer keys are compared against a message's List-Id value (see mbox2stats);
   # inner hashes map a type symbol to a subject regex and are tried in order,
   # the first match wins
   NONDISCUSSION_SUBJECTS = { # Note: none applicable to members@
     '<board.apache.org>' => {
       missing: /\AMissing\s((\S+\s){1,3})Board/, # whimsy/www/board/agenda/views/buttons/email.js.rb
       feedback: /\ABoard\sfeedback\son\s20/, # whimsy/www/board/agenda/views/actions/feedback.json.rb
       notice: /\A\[NOTICE\]/i,
       report: /\A\[REPORT\]/i,
       resolution: /\A\[RESOLUTION\]/i,
       svn_agenda: %r{\Aboard: r\d{4,8} - /foundation/board/},
       svn_iclas: %r{\Aboard: r\d{4,8} - /foundation/officers/iclas.txt}
     },
     '<operations.apache.org>' => {
       notice: /\A\[NOTICE\]/i,
       report: /\A\[REPORT\]/i,
       # NOTE(review): this requires the literal text "r/" after "svn commit: " - confirm that is intended
       svn_general: %r{\Asvn commit: r/},
       svn_bills: %r{\Abills: r\d{4,8} -}
     },
     '<trademarks.apache.org>' => {
       report: /\A\[REPORT\]/i,
       svn_general: %r{\Asvn commit: r/}
     },
     '<fundraising.apache.org>' => {
       report: /\A\[REPORT\]/i,
       svn_bills: %r{\Abills: r\d{4,8} -}
     }
   }

   # @see www/secretary/workbench/models/message.rb
   # @see https://github.com/mikel/mail/issues/39
   # Build a Mail::Address from addr; when Mail::Address rejects the value,
   # fall back to splitting a `Display Name <address>` string by hand.
   # @param addr String From: header value
   # @return Mail::Address
   # Re-raises the original parse error if the fallback patterns don't match
   def liberal_email_parser(addr)
     Mail::Address.new(addr)
   rescue
     # Quoted display name first, then an unquoted one
     parts = /^"([^"]*)" <(.*)>$/.match(addr) || /^([^"]*) <(.*)>$/.match(addr)
     raise unless parts
     fallback = Mail::Address.new
     fallback.address = parts[2]
     fallback.display_name = parts[1]
     fallback
   end

   # Annotate mailhash by adding :who and :committer (where known)
   # @param mdata Hash to evaluate and annotate
   # Side effect: adds :who, :committer, :id from ASF::Person.find_by_email
   # :committer = 'n' if not found; 'N' if error, 'counsel' for special case
   # A missing (nil) :from is reported as the error case (:committer = 'N')
   # rather than raising
   def find_who_from(mdata)
     raw = mdata[:from]
     # Single place that writes the three annotations, so every branch agrees
     annotate = lambda do |who, committer, id|
       mdata[:who] = who
       mdata[:committer] = committer
       mdata[:id] = id
     end
     # No From: value at all: nothing to match or look up
     return annotate.call(raw, 'N', 'unknown') if raw.nil?

     # Remove bogus INVALID before doing lookups; to_s allows a non-String
     # header value to be matched as text
     from = raw.to_s.sub(INVALID, '')
     # Micro-optimize unique names: these senders are recognised by pattern
     # alone, without a directory lookup
     case from
     when /Mark.Radcliffe/i
       annotate.call('Mark.Radcliffe', COUNSEL, 'markfradcliffe')
     when /mattmann/i
       annotate.call('Chris Mattmann', MEMBER, 'mattmann')
     when /jagielski/i
       annotate.call('Jim Jagielski', MEMBER, 'jim')
     when /delacretaz/i
       annotate.call('Bertrand Delacretaz', MEMBER, 'bdelacretaz')
     when /curcuru/i
       annotate.call('Shane Curcuru', MEMBER, 'curcuru')
     when /steitz/i
       annotate.call('Phil Steitz', MEMBER, 'psteitz')
     when /gardler/i # Effectively unique (see: Heidi)
       annotate.call('Ross Gardler', MEMBER, 'rgardler')
     when /Craig (L )?Russell/i # Optimize since Secretary sends a lot of mail
       annotate.call('Craig L Russell', MEMBER, 'clr')
     when /McGrail/i
       annotate.call('Kevin A. McGrail', MEMBER, 'kmcgrail')
     when /sallykhudairi@yahoo/i, /sk@haloworldwide.com/i
       annotate.call('Sally Khudairi', MEMBER, 'sk')
     else
       begin
         # TODO use Real Name (JIRA) to attempt to lookup some notifications
         tmp = liberal_email_parser(from)
         person = ASF::Person.find_by_email(tmp.address.dup)
         if person
           annotate.call(person.cn, person.asf_member? ? MEMBER : COMMITTER, person.id)
         else
           annotate.call("#{tmp.display_name} <#{tmp.address}>", 'n', 'unknown')
         end
       rescue
         # Parsing or lookup failed: keep the original value and flag with 'N'
         annotate.call(mdata[:from], 'N', 'unknown')
       end
     end
   end

   # Get {MAILS: [{date, who, subject, flag},...], TOOLS: [{...},...] } from the specified list for a month
   # May cache data in mailroot/yearmonth.json
   # Returns empty hash if can't find month; errors while reading messages are not rescued
   # @param mailroot directory holding one subdirectory of message files per month
   # @param yearmonth String name of the month subdirectory, compared as 'YYYYMM'
   # @param nondiscuss Hash of type => subject regex (or nil) marking tool-generated mail
   # @return Hash with string keys MAILS, TOOLS, TOOLCOUNT, MAILCOUNT
   def get_mails_month(mailroot:, yearmonth:, nondiscuss:)
     # Return cached calculated data if present
     cache_json = File.join(mailroot, "#{yearmonth}.json")
     if File.file?(cache_json)
       begin
         return JSON.parse(File.read(cache_json))
       rescue StandardError
         # No-op: fall through to attempt to re-create cache
       end
     end
     emails = {}
     files = Dir[File.join(mailroot, yearmonth, '*')]
     return emails if files.empty?
     emails[MAILS] = []
     emails[TOOLS] = []
     files.each do |path|
       next if path.end_with? '/index'
       # Read raw bytes; only the header lines are inspected below
       message = File.binread(path)
       data = {}
       data[DATE] = DateTime.parse(message[/^Date: (.*)/, 1]).iso8601
       data[FROM] = message[/^From: (.*)/, 1]
       # find_who_from expects and updates a hash, so pass a temporary one
       # and pick out the bits we want
       temp = {from: data[FROM]}
       MailUtils.find_who_from(temp)
       data[WHO], data[COMMITTER], data[AVAILID] = temp[:who], temp[:committer], temp[:id]

       data[SUBJECT] = message[/^Subject: (.*)/, 1]
       if nondiscuss
         # First matching pattern decides the tool type
         hit = nondiscuss.find { |_typ, rx| data[SUBJECT] =~ rx }
         data[TOOLS] = hit.first if hit
       end
       (data.key?(TOOLS) ? emails[TOOLS] : emails[MAILS]) << data
     end
     # Provide as sorted data for ease of use; counts are ordered most-frequent first
     emails[TOOLS].sort_by! { |mail| mail[DATE] }
     tool_counts = emails[TOOLS].each_with_object(Hash.new(0)) { |mail, tally| tally[mail[TOOLS]] += 1 }
     emails[TOOLCOUNT] = tool_counts.sort_by { |_k, v| -v }.to_h

     emails[MAILS].sort_by! { |mail| mail[DATE] }
     mail_counts = emails[MAILS].each_with_object(Hash.new(0)) do |mail, tally|
       tally["#{mail[WHO]} (#{mail[AVAILID]})"] += 1
     end
     emails[MAILCOUNT] = mail_counts.sort_by { |_k, v| -v }.to_h

     # If yearmonth is before current month, then write out yearmonth.json as cache
     if yearmonth < Date.today.strftime('%Y%m')
       begin
         File.open(cache_json, 'w') do |f|
           f.puts JSON.pretty_generate(emails)
         end
       rescue
         # No-op, just don't cache for now
       end
     end
     emails
   end
 end

 module MboxUtils
   extend self
   # Default file extension globbed for by scan_dir_mbox2stats
   MBOX_EXT = '.mbox'
   # Marker written as the first element of each generated JSON file and
   # checked by scan_dir_stats2csv to tell these files from other *.json
   VERSION = 'mboxhdr2json'
   # Matches http/https URLs; mbox2stats uses it to count lines containing links
   # NOTE(review): 'uri' is not required in this file; presumably loaded by a dependency - confirm
   URIRX = URI.regexp(['http', 'https'])

   # Read a ponyapi.rb mbox file and return mails (text content only)
   # @param f path to .mbox or .mbox.gz
   # @return [mail1, mail2, ...]
   # Both plain and gzip-compressed files are supported; the result strings
   # are tagged as binary (ASCII-8BIT)
   def read_mbox(f)
     mbox =
       if f.end_with? '.gz'
         # Inflate straight from the file; the block form closes the reader
         Zlib::GzipReader.open(f) { |gz| gz.read }
       else
         File.read(f)
       end
     mbox.force_encoding Encoding::ASCII_8BIT
     # Each message starts with an mbox "From " separator line
     messages = mbox.split(/^From .*/)
     messages.shift # Drop first item (not a message)
     messages
   end

   # Process an mbox file into mailhash of selected headers and lines of text
   # @param f path to .mbox or .mbox.gz
   # @return [mail1hash, mail2hash, ...], [ [parseerr, order], ...]
   # @return nil, [read, errors2...] if mbox file can't be read
   # mailhash contains :from, :subject, :listid, :date, :messageid,
   #   :inreplyto, :lines (count), plus :who and :committer
   # A message whose date cannot be parsed is still returned, just without
   # the :y/:m/:d/:w/:h/:z fields
   def mbox2stats(f)
     begin
       mails = read_mbox(f)
     rescue => read_err
       return nil, read_err
     end
     errs = []
     messages = []
     mails.each_with_index do |message, idx|
       # Preserve message order in case it's important; recorded before parsing
       # so that an error entry always names the message it belongs to
       mdata = { order: idx + 1 }
       begin
         # Enforce linefeeds; makes Mail happy; borks binary attachments (not used in this script)
         mail = Mail.read_from_string(message.gsub(/\r?\n/, "\r\n"))
         begin # HACK for cases where some values don't parse, try to get good enough values in rescue
           mdata[:from] = mail[:from].value
           mdata[:subject] = mail[:subject].value
           mdata[:listid] = mail[:List_Id].value
           mdata[:date] = mail.date.to_s
         rescue
           mdata[:from] = mail[:from]
           mdata[:subject] = mail[:subject]
           mdata[:listid] = mail[:List_Id]
           mdata[:date] = mail.date.to_s
           mdata[:parseerr] = mail.errors
         end
         mdata[:messageid] = mail.message_id
         mdata[:inreplyto] = mail.in_reply_to
         body = mail.multipart? ? mail.text_part : mail.body
         text_part = body.decoded.split(/\r?\n/)
         ctr = 0 # Count text lines of nonblank, nonreply content
         links = 0 # Count number of apparent hyperlinks
         text_part.each do |l|
           case l
           when /\A\s*>/
             # Don't count reply lines, even when indented
           when /\A\s*\z/
             # Don't count blank lines
           when /\AOn.*wrote:\z/
             # Don't count most common reply header
           when /\A-----Original Message-----/
             # Stop counting if it seems like a forwarded message
             break
             # TODO: figure out if we're in a .sig block, and stop counting
           else
             links += 1 if l =~ URIRX
             ctr += 1
           end
         end
         mdata[:lines] = ctr
         mdata[:links] = links
         # Annotate various other precomputable data
         MailUtils.find_who_from(mdata)
         begin
           d = DateTime.parse(mdata[:date])
           mdata[:y] = d.year
           mdata[:m] = d.month
           mdata[:d] = d.day
           mdata[:w] = d.wday
           mdata[:h] = d.hour
           mdata[:z] = d.zone
         rescue => date_err
           # Not critical: report and keep the message without date fields
           puts "DEBUG: #{date_err.message} parsing: #{mdata[:date]}"
         end
         regex = MailUtils::NONDISCUSSION_SUBJECTS[mdata[:listid]] # Use subject regex for this list (if any)
         if regex
           # First matching pattern decides the non-discussion type
           hit = regex.find { |_typ, rx| mdata[:subject] =~ rx }
           mdata[:nondiscuss] = hit.first if hit
         end
         # Push our hash
         messages << mdata
       rescue => err
         errs << [err, mdata[:order]]
       end
     end
     return messages, errs
   end

   # Scan dir tree for mboxes and output individual mailhash as JSONs
   # @param dir to scan (whole tree)
   # @param ext file extension to glob for
   # Side effect: writes out f.chomp(ext).json files
   # @note writes string VERSION for differentiating from other *.json
   # Files are processed in sorted path order; each JSON holds [VERSION, mails, errs]
   def scan_dir_mbox2stats(dir, ext = MBOX_EXT)
     # Paths are used exactly as returned by the glob
     Dir["#{dir}/**/*#{ext}"].sort.each do |f|
       mails, errs = mbox2stats(f)
       File.open("#{f.chomp(ext)}.json", "w") do |fout|
         fout.puts JSON.pretty_generate([VERSION, mails, errs])
       end
     end
   end

   # Scan dir tree for mailhash JSONs and output an overview CSV of all
   # @return [ error1, error2, ...] if any errors
   # Side effect: writes out dir/outname CSV file
   # @note reads string VERSION for differentiating from other *.json
   # @param dir directory tree to scan; the CSV is written to dir/outname
   # @param outname file name of the CSV to create
   # @param ext file extension to glob for
   # @raise ArgumentError if no JSON file carrying the VERSION marker is found
   # A file that cannot be parsed or written adds one entry to the returned
   # errors and does not stop the remaining files
   def scan_dir_stats2csv(dir, outname, ext = '.json')
     errors = []
     jzons = [] # [[filename, mails], ...] kept together so errors can name the file
     Dir["#{dir}/**/*#{ext}"].sort.each do |f|
       begin
         tmp = JSON.parse(File.read(f))
         # Only accept files tagged with our VERSION marker
         if tmp.is_a?(Array) && tmp[0].is_a?(String) && tmp[0].start_with?(VERSION)
           jzons << [f, tmp[1]]
         end
       rescue => e
         puts "ERROR: parse of #{f} raised #{e.message[0..255]}"
         errors << "#{e.message}\n\t#{e.backtrace.join("\n\t")}"
       end
     end
     raise ArgumentError, "#{__method__} called with no valid mbox json files in #{dir}" if jzons.empty?
     puts "#{__method__} processing #{jzons.length} mbox json files"
     headers = %w( year month day weekday hour zone listid who subject lines links committer messageid inreplyto )
     # Mailhash keys, in the same order as the CSV headers
     fields = %w( y m d w h z listid who subject lines links committer messageid inreplyto )
     # Block form closes the CSV even if something below raises
     CSV.open(File.join(dir.to_s, outname), "w", headers: headers, write_headers: true) do |csv|
       jzons.each do |f, mails|
         begin
           # The message list is nil when the mbox itself could not be read
           raise TypeError, "no message list found in #{f}" unless mails.is_a?(Array)
           mails.each { |m| csv << m.values_at(*fields) }
         rescue => e
           puts "ERROR: write of #{f} raised #{e.message[0..255]}"
           errors << "#{e.message}\n\t#{e.backtrace.join("\n\t")}"
         end
       end
     end
     errors
   end
 end

 # ## ### #### ##### ######
 # Check options and call needed methods
 DEFAULT_OUTPUT = 'mbox-analysis.csv'
 # Parse ARGV into an options hash.
 # @return Hash with :dir (default '.'), :output (default DEFAULT_OUTPUT)
 #   and :json (only present when -j/--json was given)
 # Prints the message and exits with status 1 on any option error
 def optparse
   options = {}
   parser = OptionParser.new
   parser.on('-h') { puts parser; exit }

   parser.on('-dDIRECTORY', '--directory DIRECTORY', 'Local directory to read existing mboxes and dump output in (default: .)') do |d|
     raise ArgumentError, "-d #{d} is not a valid directory" unless File.directory?(d)
     options[:dir] = d
   end
   parser.on('-oOUTPUT.CSV', '--output OUTPUT.CSV', "Filename to output rows into; default #{DEFAULT_OUTPUT}") do |o|
     options[:output] = o
   end
   parser.on('-j', '--json', "Process .mbox to .json (optional)") do |_flag|
     options[:json] = true
   end

   begin
     parser.parse!
     # Fill in defaults for anything not given on the command line
     options[:dir] ||= '.'
     options[:output] ||= DEFAULT_OUTPUT
   rescue StandardError => e
     $stderr.puts "#{e.message}; try -h for valid options, or see code"
     exit 1
   end

   options
 end

 # ## ### #### ##### ######
 # Main method for command line use
 if __FILE__ == $PROGRAM_NAME
   options = optparse
   dir = options[:dir]
   # Optional first pass: turn each mbox into a .json next to it
   if options[:json]
     puts "START: Parsing #{dir}/*#{MboxUtils::MBOX_EXT} into *.json"
     MboxUtils.scan_dir_mbox2stats(dir) # Side effect: writes out f.chomp(ext).json files
   end
   # Second pass: combine all .json files into one CSV and report any errors
   puts "START: Analyzing #{dir}/*.json into #{options[:output]}"
   errs = MboxUtils.scan_dir_stats2csv(dir, options[:output])
   errs.each { |e| puts "ERROR: #{e}" } if errs
   puts "END"
 end
	#!/usr/bin/env ruby
	# Analyze mbox files (downloaded by PonyAPI) for general statistics into CSV
	# - Per list messages per month over time (PMOT)
	# - Count messages group by list -> graph months as time
	# - Per list contentlines per lists PMOT
	# - Per user statistics
	# Count lines of text content in mail body, roughly attempting to
	# count just new content (not automated, not > replies)
	# Attempt to normalize/map email addresses to committer/member status

	$LOAD_PATH.unshift '/srv/whimsy/lib'
	require 'whimsy/asf'
	require 'mail'
	require 'csv'
	require 'stringio'
	require 'zlib'
	require 'json'
	require 'date'
	require 'optparse'

	# Various utility functions/data for mailing list analysis
	module MailUtils
	extend self
	MEMBER = 'member'
	COMMITTER = 'committer'
	COUNSEL = 'counsel'
	INVALID = '.INVALID'
	DATE = 'date'
	FROM = 'from'
	WHO = 'who'
	AVAILID = 'id'
	SUBJECT = 'subject'
	TOOLS = 'tools'
	MAILS = 'mails'
	TOOLCOUNT = 'toolcount'
	MAILCOUNT = 'mailcount'

	# Subject regexes that are non-discussion oriented
	# Analysis: don't bother with content lines in these messages,
	# because most of the content is tool-generated
	NONDISCUSSION_SUBJECTS = { # Note: none applicable to members@
	'<board.apache.org>' => {
	missing: /\AMissing\s((\S+\s){1,3})Board/, # whimsy/www/board/agenda/views/buttons/email.js.rb
	feedback: /\ABoard\sfeedback\son\s20/, # whimsy/www/board/agenda/views/actions/feedback.json.rb
	notice: /\A\[NOTICE\]/i,
	report: /\A\[REPORT\]/i,
	resolution: /\A\[RESOLUTION\]/i,
	svn_agenda: %r{\Aboard: r\d{4,8} - /foundation/board/},
	svn_iclas: %r{\Aboard: r\d{4,8} - /foundation/officers/iclas.txt}
	},
	'<operations.apache.org>' => {
	notice: /\A\[NOTICE\]/i,
	report: /\A\[REPORT\]/i,
	svn_general: %r{\Asvn commit: r/},
	svn_bills: %r{\Abills: r\d{4,8} -}
	},
	'<trademarks.apache.org>' => {
	report: /\A\[REPORT\]/i,
	svn_general: %r{\Asvn commit: r/}
	},
	'<fundraising.apache.org>' => {
	report: /\A\[REPORT\]/i,
	svn_bills: %r{\Abills: r\d{4,8} -}
	}
	}

	# @see www/secretary/workbench/models/message.rb
	# @see https://github.com/mikel/mail/issues/39
	# Build a Mail::Address from addr; when Mail::Address rejects the value,
	# fall back to splitting a `Display Name <address>` string by hand.
	# @param addr String From: header value
	# @return Mail::Address
	# Re-raises the original parse error if the fallback patterns don't match
	def liberal_email_parser(addr)
	  Mail::Address.new(addr)
	rescue
	  # Quoted display name first, then an unquoted one
	  parts = /^"([^"]*)" <(.*)>$/.match(addr) || /^([^"]*) <(.*)>$/.match(addr)
	  raise unless parts
	  fallback = Mail::Address.new
	  fallback.address = parts[2]
	  fallback.display_name = parts[1]
	  fallback
	end

	# Annotate mailhash by adding :who and :committer (where known)
	# @param mdata Hash to evaluate and annotate
	# Side effect: adds :who, :committer, :id from ASF::Person.find_by_email
	# :committer = 'n' if not found; 'N' if error, 'counsel' for special case
	# A missing (nil) :from is reported as the error case (:committer = 'N')
	# rather than raising
	def find_who_from(mdata)
	  raw = mdata[:from]
	  # Single place that writes the three annotations, so every branch agrees
	  annotate = lambda do |who, committer, id|
	    mdata[:who] = who
	    mdata[:committer] = committer
	    mdata[:id] = id
	  end
	  # No From: value at all: nothing to match or look up
	  return annotate.call(raw, 'N', 'unknown') if raw.nil?

	  # Remove bogus INVALID before doing lookups; to_s allows a non-String
	  # header value to be matched as text
	  from = raw.to_s.sub(INVALID, '')
	  # Micro-optimize unique names: these senders are recognised by pattern
	  # alone, without a directory lookup
	  case from
	  when /Mark.Radcliffe/i
	    annotate.call('Mark.Radcliffe', COUNSEL, 'markfradcliffe')
	  when /mattmann/i
	    annotate.call('Chris Mattmann', MEMBER, 'mattmann')
	  when /jagielski/i
	    annotate.call('Jim Jagielski', MEMBER, 'jim')
	  when /delacretaz/i
	    annotate.call('Bertrand Delacretaz', MEMBER, 'bdelacretaz')
	  when /curcuru/i
	    annotate.call('Shane Curcuru', MEMBER, 'curcuru')
	  when /steitz/i
	    annotate.call('Phil Steitz', MEMBER, 'psteitz')
	  when /gardler/i # Effectively unique (see: Heidi)
	    annotate.call('Ross Gardler', MEMBER, 'rgardler')
	  when /Craig (L )?Russell/i # Optimize since Secretary sends a lot of mail
	    annotate.call('Craig L Russell', MEMBER, 'clr')
	  when /McGrail/i
	    annotate.call('Kevin A. McGrail', MEMBER, 'kmcgrail')
	  when /sallykhudairi@yahoo/i, /sk@haloworldwide.com/i
	    annotate.call('Sally Khudairi', MEMBER, 'sk')
	  else
	    begin
	      # TODO use Real Name (JIRA) to attempt to lookup some notifications
	      tmp = liberal_email_parser(from)
	      person = ASF::Person.find_by_email(tmp.address.dup)
	      if person
	        annotate.call(person.cn, person.asf_member? ? MEMBER : COMMITTER, person.id)
	      else
	        annotate.call("#{tmp.display_name} <#{tmp.address}>", 'n', 'unknown')
	      end
	    rescue
	      # Parsing or lookup failed: keep the original value and flag with 'N'
	      annotate.call(mdata[:from], 'N', 'unknown')
	    end
	  end
	end

	# Get {MAILS: [{date, who, subject, flag},...], TOOLS: [{...},...] } from the specified list for a month
	# May cache data in mailroot/yearmonth.json
	# Returns empty hash if can't find month; errors while reading messages are not rescued
	# @param mailroot directory holding one subdirectory of message files per month
	# @param yearmonth String name of the month subdirectory, compared as 'YYYYMM'
	# @param nondiscuss Hash of type => subject regex (or nil) marking tool-generated mail
	# @return Hash with string keys MAILS, TOOLS, TOOLCOUNT, MAILCOUNT
	def get_mails_month(mailroot:, yearmonth:, nondiscuss:)
	  # Return cached calculated data if present
	  cache_json = File.join(mailroot, "#{yearmonth}.json")
	  if File.file?(cache_json)
	    begin
	      return JSON.parse(File.read(cache_json))
	    rescue StandardError
	      # No-op: fall through to attempt to re-create cache
	    end
	  end
	  emails = {}
	  files = Dir[File.join(mailroot, yearmonth, '*')]
	  return emails if files.empty?
	  emails[MAILS] = []
	  emails[TOOLS] = []
	  files.each do |path|
	    next if path.end_with? '/index'
	    # Read raw bytes; only the header lines are inspected below
	    message = File.binread(path)
	    data = {}
	    data[DATE] = DateTime.parse(message[/^Date: (.*)/, 1]).iso8601
	    data[FROM] = message[/^From: (.*)/, 1]
	    # find_who_from expects and updates a hash, so pass a temporary one
	    # and pick out the bits we want
	    temp = {from: data[FROM]}
	    MailUtils.find_who_from(temp)
	    data[WHO], data[COMMITTER], data[AVAILID] = temp[:who], temp[:committer], temp[:id]

	    data[SUBJECT] = message[/^Subject: (.*)/, 1]
	    if nondiscuss
	      # First matching pattern decides the tool type
	      hit = nondiscuss.find { |_typ, rx| data[SUBJECT] =~ rx }
	      data[TOOLS] = hit.first if hit
	    end
	    (data.key?(TOOLS) ? emails[TOOLS] : emails[MAILS]) << data
	  end
	  # Provide as sorted data for ease of use; counts are ordered most-frequent first
	  emails[TOOLS].sort_by! { |mail| mail[DATE] }
	  tool_counts = emails[TOOLS].each_with_object(Hash.new(0)) { |mail, tally| tally[mail[TOOLS]] += 1 }
	  emails[TOOLCOUNT] = tool_counts.sort_by { |_k, v| -v }.to_h

	  emails[MAILS].sort_by! { |mail| mail[DATE] }
	  mail_counts = emails[MAILS].each_with_object(Hash.new(0)) do |mail, tally|
	    tally["#{mail[WHO]} (#{mail[AVAILID]})"] += 1
	  end
	  emails[MAILCOUNT] = mail_counts.sort_by { |_k, v| -v }.to_h

	  # If yearmonth is before current month, then write out yearmonth.json as cache
	  if yearmonth < Date.today.strftime('%Y%m')
	    begin
	      File.open(cache_json, 'w') do |f|
	        f.puts JSON.pretty_generate(emails)
	      end
	    rescue
	      # No-op, just don't cache for now
	    end
	  end
	  emails
	end
	end

	module MboxUtils
	extend self
	MBOX_EXT = '.mbox'
	VERSION = 'mboxhdr2json'
	URIRX = URI.regexp(['http', 'https'])

	# Read a ponyapi.rb mbox file and return mails (text content only)
	# @param f path to .mbox or .mbox.gz
	# @return [mail1, mail2, ...]
	# Both plain and gzip-compressed files are supported; the result strings
	# are tagged as binary (ASCII-8BIT)
	def read_mbox(f)
	  mbox =
	    if f.end_with? '.gz'
	      # Inflate straight from the file; the block form closes the reader
	      Zlib::GzipReader.open(f) { |gz| gz.read }
	    else
	      File.read(f)
	    end
	  mbox.force_encoding Encoding::ASCII_8BIT
	  # Each message starts with an mbox "From " separator line
	  messages = mbox.split(/^From .*/)
	  messages.shift # Drop first item (not a message)
	  messages
	end

	# Process an mbox file into mailhash of selected headers and lines of text
	# @param f path to .mbox or .mbox.gz
	# @return [mail1hash, mail2hash, ...], [ [parseerr, order], ...]
	# @return nil, [read, errors2...] if mbox file can't be read
	# mailhash contains :from, :subject, :listid, :date, :messageid,
	# :inreplyto, :lines (count), plus :who and :committer
	# A message whose date cannot be parsed is still returned, just without
	# the :y/:m/:d/:w/:h/:z fields
	def mbox2stats(f)
	  begin
	    mails = read_mbox(f)
	  rescue => read_err
	    return nil, read_err
	  end
	  errs = []
	  messages = []
	  mails.each_with_index do |message, idx|
	    # Preserve message order in case it's important; recorded before parsing
	    # so that an error entry always names the message it belongs to
	    mdata = { order: idx + 1 }
	    begin
	      # Enforce linefeeds; makes Mail happy; borks binary attachments (not used in this script)
	      mail = Mail.read_from_string(message.gsub(/\r?\n/, "\r\n"))
	      begin # HACK for cases where some values don't parse, try to get good enough values in rescue
	        mdata[:from] = mail[:from].value
	        mdata[:subject] = mail[:subject].value
	        mdata[:listid] = mail[:List_Id].value
	        mdata[:date] = mail.date.to_s
	      rescue
	        mdata[:from] = mail[:from]
	        mdata[:subject] = mail[:subject]
	        mdata[:listid] = mail[:List_Id]
	        mdata[:date] = mail.date.to_s
	        mdata[:parseerr] = mail.errors
	      end
	      mdata[:messageid] = mail.message_id
	      mdata[:inreplyto] = mail.in_reply_to
	      body = mail.multipart? ? mail.text_part : mail.body
	      text_part = body.decoded.split(/\r?\n/)
	      ctr = 0 # Count text lines of nonblank, nonreply content
	      links = 0 # Count number of apparent hyperlinks
	      text_part.each do |l|
	        case l
	        when /\A\s*>/
	          # Don't count reply lines, even when indented
	        when /\A\s*\z/
	          # Don't count blank lines
	        when /\AOn.*wrote:\z/
	          # Don't count most common reply header
	        when /\A-----Original Message-----/
	          # Stop counting if it seems like a forwarded message
	          break
	          # TODO: figure out if we're in a .sig block, and stop counting
	        else
	          links += 1 if l =~ URIRX
	          ctr += 1
	        end
	      end
	      mdata[:lines] = ctr
	      mdata[:links] = links
	      # Annotate various other precomputable data
	      MailUtils.find_who_from(mdata)
	      begin
	        d = DateTime.parse(mdata[:date])
	        mdata[:y] = d.year
	        mdata[:m] = d.month
	        mdata[:d] = d.day
	        mdata[:w] = d.wday
	        mdata[:h] = d.hour
	        mdata[:z] = d.zone
	      rescue => date_err
	        # Not critical: report and keep the message without date fields
	        puts "DEBUG: #{date_err.message} parsing: #{mdata[:date]}"
	      end
	      regex = MailUtils::NONDISCUSSION_SUBJECTS[mdata[:listid]] # Use subject regex for this list (if any)
	      if regex
	        # First matching pattern decides the non-discussion type
	        hit = regex.find { |_typ, rx| mdata[:subject] =~ rx }
	        mdata[:nondiscuss] = hit.first if hit
	      end
	      # Push our hash
	      messages << mdata
	    rescue => err
	      errs << [err, mdata[:order]]
	    end
	  end
	  return messages, errs
	end

	# Scan dir tree for mboxes and output individual mailhash as JSONs
	# @param dir to scan (whole tree)
	# @param ext file extension to glob for
	# Side effect: writes out f.chomp(ext).json files
	# @note writes string VERSION for differentiating from other *.json
	# Files are processed in sorted path order; each JSON holds [VERSION, mails, errs]
	def scan_dir_mbox2stats(dir, ext = MBOX_EXT)
	  # Recursive glob over the whole tree; paths are used exactly as returned
	  Dir["#{dir}/**/*#{ext}"].sort.each do |f|
	    mails, errs = mbox2stats(f)
	    File.open("#{f.chomp(ext)}.json", "w") do |fout|
	      fout.puts JSON.pretty_generate([VERSION, mails, errs])
	    end
	  end
	end

	# Scan dir tree for mailhash JSONs and output an overview CSV of all
	# @return [ error1, error2, ...] if any errors
	# Side effect: writes out dir/outname CSV file
	# @note reads string VERSION for differentiating from other *.json
	# @param dir directory tree to scan; the CSV is written to dir/outname
	# @param outname file name of the CSV to create
	# @param ext file extension to glob for
	# @raise ArgumentError if no JSON file carrying the VERSION marker is found
	# A file that cannot be parsed or written adds one entry to the returned
	# errors and does not stop the remaining files
	def scan_dir_stats2csv(dir, outname, ext = '.json')
	  errors = []
	  jzons = [] # [[filename, mails], ...] kept together so errors can name the file
	  Dir["#{dir}/**/*#{ext}"].sort.each do |f|
	    begin
	      tmp = JSON.parse(File.read(f))
	      # Only accept files tagged with our VERSION marker
	      if tmp.is_a?(Array) && tmp[0].is_a?(String) && tmp[0].start_with?(VERSION)
	        jzons << [f, tmp[1]]
	      end
	    rescue => e
	      puts "ERROR: parse of #{f} raised #{e.message[0..255]}"
	      errors << "#{e.message}\n\t#{e.backtrace.join("\n\t")}"
	    end
	  end
	  raise ArgumentError, "#{__method__} called with no valid mbox json files in #{dir}" if jzons.empty?
	  puts "#{__method__} processing #{jzons.length} mbox json files"
	  headers = %w( year month day weekday hour zone listid who subject lines links committer messageid inreplyto )
	  # Mailhash keys, in the same order as the CSV headers
	  fields = %w( y m d w h z listid who subject lines links committer messageid inreplyto )
	  # Block form closes the CSV even if something below raises
	  CSV.open(File.join(dir.to_s, outname), "w", headers: headers, write_headers: true) do |csv|
	    jzons.each do |f, mails|
	      begin
	        # The message list is nil when the mbox itself could not be read
	        raise TypeError, "no message list found in #{f}" unless mails.is_a?(Array)
	        mails.each { |m| csv << m.values_at(*fields) }
	      rescue => e
	        puts "ERROR: write of #{f} raised #{e.message[0..255]}"
	        errors << "#{e.message}\n\t#{e.backtrace.join("\n\t")}"
	      end
	    end
	  end
	  errors
	end
	end

	# ## ### #### ##### ######
	# Check options and call needed methods
	DEFAULT_OUTPUT = 'mbox-analysis.csv'
	# Parse ARGV into an options hash.
	# @return Hash with :dir (default '.'), :output (default DEFAULT_OUTPUT)
	#   and :json (only present when -j/--json was given)
	# Prints the message and exits with status 1 on any option error
	def optparse
	  options = {}
	  parser = OptionParser.new
	  parser.on('-h') { puts parser; exit }

	  parser.on('-dDIRECTORY', '--directory DIRECTORY', 'Local directory to read existing mboxes and dump output in (default: .)') do |d|
	    raise ArgumentError, "-d #{d} is not a valid directory" unless File.directory?(d)
	    options[:dir] = d
	  end
	  parser.on('-oOUTPUT.CSV', '--output OUTPUT.CSV', "Filename to output rows into; default #{DEFAULT_OUTPUT}") do |o|
	    options[:output] = o
	  end
	  parser.on('-j', '--json', "Process .mbox to .json (optional)") do |_flag|
	    options[:json] = true
	  end

	  begin
	    parser.parse!
	    # Fill in defaults for anything not given on the command line
	    options[:dir] ||= '.'
	    options[:output] ||= DEFAULT_OUTPUT
	  rescue StandardError => e
	    $stderr.puts "#{e.message}; try -h for valid options, or see code"
	    exit 1
	  end

	  options
	end

	# ## ### #### ##### ######
	# Main method for command line use
	if __FILE__ == $PROGRAM_NAME
	  options = optparse
	  dir = options[:dir]
	  # Optional first pass: turn each mbox into a .json next to it
	  if options[:json]
	    puts "START: Parsing #{dir}/*#{MboxUtils::MBOX_EXT} into *.json"
	    MboxUtils.scan_dir_mbox2stats(dir) # Side effect: writes out f.chomp(ext).json files
	  end
	  # Second pass: combine all .json files into one CSV and report any errors
	  puts "START: Analyzing #{dir}/*.json into #{options[:output]}"
	  errs = MboxUtils.scan_dir_stats2csv(dir, options[:output])
	  errs.each { |e| puts "ERROR: #{e}" } if errs
	  puts "END"
	end