#!/usr/bin/env ruby
# Analyze mbox files (downloaded by PonyAPI) for general statistics into CSV
# - Per list messages per month over time (PMOT)
# - Count messages group by list -> graph months as time
# - Per list contentlines per lists PMOT
# - Per user statistics
# Count lines of text content in mail body, roughly attempting to 
#   count just new content (not automated, not > replies)
# Attempt to normalize/map email addresses to committer/member status

$LOAD_PATH.unshift '/srv/whimsy/lib'
require 'whimsy/asf'
require 'mail'
require 'csv'
require 'stringio'
require 'zlib'
require 'json'
require 'date'
require 'optparse'

# Various utility functions/data for mailing list analysis
module MailUtils
  extend self
  # Values stored under :committer by find_who_from
  MEMBER = 'member'
  COMMITTER = 'committer'
  COUNSEL = 'counsel'
  # Suffix that is stripped from From: values before any lookup
  INVALID = '.INVALID'

  # Per List-Id tables of subject patterns marking non-discussion mail
  # Analysis: content lines in such messages are mostly tool-generated,
  #   so they are not worth counting as discussion
  # Note: none applicable to members@
  NONDISCUSSION_SUBJECTS = {
    '<board.apache.org>' => {
      missing: /\AMissing\s((\S+\s){1,3})Board/, # whimsy/www/board/agenda/views/buttons/email.js.rb
      feedback: /\ABoard\sfeedback\son\s20/, # whimsy/www/board/agenda/views/actions/feedback.json.rb
      notice: /\A\[NOTICE\]/i,
      report: /\A\[REPORT\]/i,
      resolution: /\A\[RESOLUTION\]/i,
      svn_agenda: %r{\Aboard: r\d{4,8} - /foundation/board/},
      svn_iclas: %r{\Aboard: r\d{4,8} - /foundation/officers/iclas.txt}
    },
    '<operations.apache.org>' => {
      notice: /\A\[NOTICE\]/i,
      report: /\A\[REPORT\]/i,
      svn_general: %r{\Asvn commit: r/},
      svn_bills: %r{\Abills: r\d{4,8} -}
    },
    '<trademarks.apache.org>' => {
      report: /\A\[REPORT\]/i,
      svn_general: %r{\Asvn commit: r/}
    },
    '<fundraising.apache.org>' => {
      report: /\A\[REPORT\]/i,
      svn_bills: %r{\Abills: r\d{4,8} -}
    }
  }

  # Shortcut table of [pattern, :who value, :committer value] for senders
  # whose From: value is distinctive enough to skip the directory lookup.
  # Order matters: the first matching pattern wins.
  KNOWN_SENDERS = [
    [/Mark.Radcliffe/i, 'Mark.Radcliffe', COUNSEL],
    [/mattmann/i, 'Chris Mattmann', MEMBER],
    [/jagielski/i, 'Jim Jagielski', MEMBER],
    [/delacretaz/i, 'Bertrand Delacretaz', MEMBER],
    [/curcuru/i, 'Shane Curcuru', MEMBER],
    [/steitz/i, 'Phil Steitz', MEMBER],
    [/gardler/i, 'Ross Gardler', MEMBER], # Effectively unique (see: Heidi)
    [/Craig (L )?Russell/i, 'Craig L Russell', MEMBER], # Secretary sends a lot of mail
    [/McGrail/i, 'Kevin A. McGrail', MEMBER],
    [/sallykhudairi@yahoo/i, 'Sally Khudairi', MEMBER],
    [/sk@haloworldwide.com/i, 'Sally Khudairi', MEMBER]
  ]
  private_constant :KNOWN_SENDERS

  # Parse an address string, falling back to a simple "Name <addr>" split
  # when Mail::Address rejects it
  # @see www/secretary/workbench/models/message.rb
  # @see https://github.com/mikel/mail/issues/39
  # @param addr String holding a From:-style address
  # @return Mail::Address with address (and display_name when available)
  # Re-raises the original parse error if the fallback patterns don't match
  def liberal_email_parser(addr)
    Mail::Address.new(addr)
  rescue
    case addr
    when /^"([^"]*)" <(.*)>$/, /^([^"]*) <(.*)>$/
      # Quoted form is tried first, then the unquoted display name form
      name = Regexp.last_match(1)
      email = Regexp.last_match(2)
      fallback = Mail::Address.new
      fallback.address = email
      fallback.display_name = name
      fallback
    else
      raise
    end
  end

  # Annotate mailhash by adding :who and :committer (where known)
  # @param mdata Hash to evaluate and annotate; reads mdata[:from]
  # Side effect: adds :who and :committer, either from the shortcut table
  #   or from ASF::Person.find_by_email
  # :committer = 'n' if not found; 'N' if error, 'counsel' for special case
  def find_who_from(mdata)
    # Remove bogus INVALID before doing lookups
    sender = mdata[:from].sub(INVALID, '')
    _pattern, name, status = KNOWN_SENDERS.find { |rx, _who, _role| rx =~ sender }
    unless name
      begin
        # TODO use Real Name (JIRA) to attempt to lookup some notifications
        parsed = liberal_email_parser(sender)
        person = ASF::Person.find_by_email(parsed.address.dup)
        if person
          name = person.cn
          status = person.asf_member? ? MEMBER : COMMITTER
        else
          name = "#{parsed.display_name} <#{parsed.address}>"
          status = 'n'
        end
      rescue
        name = mdata[:from] # Use original value here
        status = 'N'
      end
    end
    mdata[:who] = name
    mdata[:committer] = status
  end
end

# Utilities for reading mbox archives, reducing each message to a mailhash
# of selected headers and statistics, and writing those out as JSON and CSV
module MboxUtils
  extend self
  MBOX_EXT = '.mbox'
  # Marker written as the first element of every JSON file produced here
  VERSION = 'mboxhdr2json'
  URIRX = URI.regexp(['http', 'https'])
  # CSV column titles, and the mailhash JSON key feeding each column (same order)
  CSV_HEADERS = %w( year month day weekday hour zone listid who subject lines links committer messageid inreplyto )
  CSV_FIELDS = %w( y m d w h z listid who subject lines links committer messageid inreplyto )

  # Read a ponyapi.rb mbox file and return mails (text content only)
  # @param f path to .mbox or .mbox.gz
  # @return [mail1, mail2, ...] raw message strings, without their "From " lines
  def read_mbox(f)
    mbox = if f.end_with? '.gz'
      # Block form closes the reader (and the underlying file) for us
      Zlib::GzipReader.open(f) { |gz| gz.read }
    else
      File.read(f)
    end
    mbox.force_encoding Encoding::ASCII_8BIT
    messages = mbox.split(/^From .*/)
    messages.shift # Drop first item (not a message)
    return messages
  end

  # Process an mbox file into mailhash of selected headers and lines of text
  # @param f path to .mbox or .mbox.gz
  # @return [mail1hash, mail2hash, ...], [ [parseerr, order], ...]
  # @return nil, error if mbox file can't be read
  # mailhash contains :order, :from, :subject, :listid, :date, :messageid,
  #   :inreplyto, :lines (count), :links (count), plus :who and :committer;
  #   :y :m :d :w :h :z when the date parses; :nondiscuss when the subject
  #   matches a non-discussion pattern for its list
  def mbox2stats(f)
    begin
      mails = read_mbox(f)
    rescue => e
      return nil, e
    end
    errs = []
    messages = []
    mails.each_with_index do |message, idx|
      # Preserve message order in case it's important; recorded before parsing
      # so that messages Mail cannot parse are still reported with a position
      mdata = { order: idx + 1 }
      begin
        # Enforce linefeeds; makes Mail happy; borks binary attachments (not used in this script)
        mail = Mail.read_from_string(message.gsub(/\r?\n/, "\r\n"))
        begin # HACK for cases where some values don't parse, try to get good enough values in rescue
          mdata[:from] = mail[:from].value
          mdata[:subject] = mail[:subject].value
          mdata[:listid] = mail[:List_Id].value
          mdata[:date] = mail.date.to_s
        rescue
          mdata[:from] = mail[:from]
          mdata[:subject] = mail[:subject]
          mdata[:listid] = mail[:List_Id]
          mdata[:date] = mail.date.to_s
          mdata[:parseerr] = mail.errors
        end
        mdata[:messageid] = mail.message_id
        mdata[:inreplyto] = mail.in_reply_to
        part = mail.multipart? ? mail.text_part : mail.body
        mdata[:lines], mdata[:links] = count_content_lines(part.decoded.split(/\r?\n/))
        # Annotate various other precomputable data
        MailUtils.find_who_from(mdata)
        annotate_date(mdata)
        annotate_nondiscussion(mdata)
        # Push our hash
        messages << mdata
      rescue => e
        errs << [e, mdata[:order]]
      end
    end
    return messages, errs
  end

  # Scan dir tree for mboxes and output individual mailhash as JSONs
  # @param dir to scan (whole tree)
  # @param ext file extension to glob for
  # Side effect: writes out f.chomp(ext).json files
  # @note writes string VERSION for differentiating from other *.json
  # @note no longer calls untaint: it was removed in Ruby 3.2
  def scan_dir_mbox2stats(dir, ext = MBOX_EXT)
    Dir["#{dir}/**/*#{ext}"].sort.each do |f|
      mails, errs = mbox2stats(f)
      File.open("#{f.chomp(ext)}.json", "w") do |fout|
        fout.puts JSON.pretty_generate([VERSION, mails, errs])
      end
    end
  end

  # Scan dir tree for mailhash JSONs and output an overview CSV of all
  # @param dir to scan (whole tree); the CSV is written into this directory
  # @param outname file name of the CSV to write
  # @param ext file extension to glob for
  # @return [ error1, error2, ...] one string per file that failed to parse
  #   or to be written; empty when there were no errors
  # @raise ArgumentError when no JSON file carrying VERSION is found
  # Side effect: writes out dir/outname CSV file
  # @note reads string VERSION for differentiating from other *.json
  def scan_dir_stats2csv(dir, outname, ext = '.json')
    errors = []
    jzons = [] # [filename, mailhashes] pairs; filename is kept for error reports
    Dir["#{dir}/**/*#{ext}"].sort.each do |f|
      begin
        tmp = JSON.parse(File.read(f))
        if tmp.is_a?(Array) && tmp[0].kind_of?(String) && tmp[0].start_with?(VERSION)
          jzons << [f, tmp[1]]
        end
      rescue => e
        puts "ERROR: parse of #{f} raised #{e.message[0..255]}"
        errors << "#{e.message}\n\t#{e.backtrace.join("\n\t")}"
      end
    end
    raise ArgumentError, "#{__method__} called with no valid mbox json files in #{dir}" if jzons.length == 0
    puts "#{__method__} processing #{jzons.length} mbox json files"
    csvfile = File.join(dir.to_s, outname)
    # Block form guarantees the file is closed even if a row write raises
    CSV.open(csvfile, "w", headers: CSV_HEADERS, write_headers: true) do |csv|
      jzons.each do |f, mails|
        begin
          # mails is nil when the source mbox could not be read; that (and any
          # other malformed content) is reported per file, not fatal to the run
          mails.each do |m|
            csv << CSV_FIELDS.map { |key| m[key] }
          end
        rescue => e
          puts "ERROR: write of #{f} raised #{e.message[0..255]}"
          errors << "#{e.message}\n\t#{e.backtrace.join("\n\t")}"
        end
      end
    end
    return errors
  end

  private

  # Count lines of apparent new content, and how many of them contain a link
  # @param lines Array of body text lines
  # @return [content line count, lines containing an http(s) URI]
  def count_content_lines(lines)
    ctr = 0 # Count text lines of nonblank, nonreply content
    links = 0 # Count number of apparent hyperlinks
    lines.each do |l|
      case l
      when /\A\s*>/
        # Don't count reply lines, even when indented
      when /\A\s*\z/
        # Don't count blank lines
      when /\AOn.*wrote:\z/
        # Don't count most common reply header
      when /\A-----Original Message-----/
        # Stop counting if it seems like a forwarded message
        break
        # TODO: figure out if we're in a .sig block, and stop counting
      else
        links += 1 if l =~ URIRX
        ctr += 1
      end
    end
    [ctr, links]
  end

  # Add :y :m :d :w :h :z to mdata from mdata[:date]
  # Not critical: an unparseable date is only logged, the mailhash is kept
  def annotate_date(mdata)
    d = DateTime.parse(mdata[:date])
    mdata[:y] = d.year
    mdata[:m] = d.month
    mdata[:d] = d.day
    mdata[:w] = d.wday
    mdata[:h] = d.hour
    mdata[:z] = d.zone
  rescue => e
    puts "DEBUG: #{e.message} parsing: #{mdata[:date]}"
  end

  # Add :nondiscuss (name of the first matching pattern) to mdata when its
  # subject matches a non-discussion pattern registered for its list
  def annotate_nondiscussion(mdata)
    patterns = MailUtils::NONDISCUSSION_SUBJECTS[mdata[:listid]] # Use subject regex for this list (if any)
    return unless patterns
    typ, _rx = patterns.find { |_name, rx| mdata[:subject] =~ rx }
    mdata[:nondiscuss] = typ if typ
  end
end

# ## ### #### ##### ######
# Check options and call needed methods
DEFAULT_OUTPUT = 'mbox-analysis.csv'
# Parse command line switches from ARGV
# @return Hash with :dir and :output always set (defaulted when not given),
#   and :json => true only when -j/--json was passed
# Side effect: prints a message to stderr and exits 1 on any option error,
#   including a -d value that is not a directory; -h prints help and exits
def optparse
  options = {}
  parser = OptionParser.new
  parser.on('-h') do
    puts parser
    exit
  end
  parser.on('-dDIRECTORY', '--directory DIRECTORY', 'Local directory to read existing mboxes and dump output in (default: .)') do |d|
    raise ArgumentError, "-d #{d} is not a valid directory" unless File.directory?(d)
    options[:dir] = d
  end
  parser.on('-oOUTPUT.CSV', '--output OUTPUT.CSV', "Filename to output rows into; default #{DEFAULT_OUTPUT}") do |o|
    options[:output] = o
  end
  parser.on('-j', '--json', "Process .mbox to .json (optional)") do
    options[:json] = true
  end

  begin
    parser.parse!
    # Fill in defaults for anything not given on the command line
    options[:dir] ||= '.'
    options[:output] ||= DEFAULT_OUTPUT
  rescue StandardError => e
    $stderr.puts "#{e.message}; try -h for valid options, or see code"
    exit 1
  end

  options
end

# ## ### #### ##### ######
# Main method for command line use
if __FILE__ == $PROGRAM_NAME
  options = optparse
  dir = options[:dir]
  # Optional first stage: (re)generate the per-mbox JSON files
  if options[:json]
    puts "START: Parsing #{dir}/*#{MboxUtils::MBOX_EXT} into *.json"
    MboxUtils.scan_dir_mbox2stats(dir) # Side effect: writes out f.chomp(ext).json files
  end
  # Second stage: fold every JSON file found under dir into one CSV
  puts "START: Analyzing #{dir}/*.json into #{options[:output]}"
  errs = MboxUtils.scan_dir_stats2csv(dir, options[:output])
  errs.each { |e| puts "ERROR: #{e}" } if errs
  puts "END"
end
