#!/usr/bin/env ruby

#         DRAFT DRAFT DRAFT
#         DRAFT DRAFT DRAFT
#         DRAFT DRAFT DRAFT
#         DRAFT DRAFT DRAFT
#         DRAFT DRAFT DRAFT

#
# ICLA PDF parsing support
#
# Try to extract user text from ICLA PDFs.

# The Gem is not 100% accurate in creating a text version of the page.
# Also it's tricky to extract the text accurately.

# So we try other methods first:
# - if there is a form, return its fields
# - if there are FreeText Annotations, return them in page order
# - use show_text_with_positioning as that seems to be used for PDF updates
# - where the PDF only uses show_text, the Gem is better at combining the data, so use page.text

require 'pdf-reader'

# TODO perhaps always extract all the data types then choose the best
# Should turn hash values into arrays?
module ICLAParser
  # Receiver handed to page.walk to gather the text drawn on each page.
  # ICLAs are often completed by drawing text over the page instead of
  # filling in form fields, so this is where such user data ends up.
  #
  # Two collections are built while pages are walked:
  # - texts: one entry per show_text_with_positioning call, unless filtered out
  # - lines: one entry per text object, made of all its show_text strings
  class Receiver
    # Short pieces of the agreement's own wording; these are never user data
    SKIP = [
      'Individual Contributor',
      'License Agreement',
      '("Agreement") V2.0',
      'as "Not a Contribution."',
      'inaccurate in any respect.',
      'for your records.',
      '1. Definitions.',
      'Contributions and such derivative works.',
      'litigation is filed.',
      'Contributions.'
    ]

    # fontdict - Hash of font label => font object; the font for a string is
    #            looked up using the label from the latest set_text_font_and_size
    def initialize(fontdict)
      @fontdict = fontdict
      @tfs = nil # arguments of the latest set_text_font_and_size (label first)
      @texts = [] # collected from show_text_with_positioning
      @lines = [] # collected from show_text, one per text object
    end

    # A text object may contain several show_text calls that belong together,
    # so start a fresh buffer for them
    def begin_text_object
      @textobj = []
    end

    # The text object is complete: store its pieces as a single line
    def end_text_object
      @lines.push(@textobj.join(''))
    end

    # Remember the current font label and size for later string conversion
    def set_text_font_and_size(*args)
      @tfs = args
    end

    # Convert the string using the current font and add it to the current text object
    def show_text(string)
      @textobj << ICLAParser.string_to_utf8(string, @fontdict[@tfs.first])
    end

    # args are Strings (in the current font encoding) interspersed with numeric
    # spacing adjustments. Only the strings are wanted: the positioning is assumed
    # not to overlay characters, so it is ignored.
    def show_text_with_positioning(*args)
      font = @fontdict[@tfs.first]
      pieces = args.flatten.select { |arg| arg.is_a?(String) }
      val = pieces.map { |piece| ICLAParser.string_to_utf8(piece, font) }.join('').strip
      # Some PDFs draw the agreement wording this way too, so drop long strings
      # (unlikely to be user data) as well as the known short boilerplate.
      return if val.empty? || val.length > 50 || SKIP.include?(val)

      @texts << val
    end

    # Strings gathered from show_text_with_positioning, in the order seen
    def get_text
      @texts
    end

    # Lines gathered from show_text, one per completed text object
    def get_lines
      @lines
    end
  end

  # Convert a string in a font's own encoding to UTF-8.
  #
  # string - the encoded string as found in the page content
  # font   - object providing unpack(string) => glyph codes and
  #          to_utf8(glyph_code) => text for one glyph
  #
  # Returns the converted glyphs joined into a single String.
  def self.string_to_utf8(string, font)
    converted = font.unpack(string).map do |code|
      text = font.to_utf8(code)
      # One pdf (yev) has spurious \t\r<sp>?<nbsp> translated from 36 => [9, 13, 32, 194, 160];
      # treat that as a plain space
      spurious = code == 36 && text =~ /^\t\r /
      spurious ? ' ' : text
    end
    converted.join('')
  end

  # Standard form field names for other code to use, keyed by the label
  # found in the PDF once it has been lower-cased and had its spaces removed
  NAME2FIELD = {
    'fullname'             => :FullName,
    'publicname'           => :PublicName,
    'mailingaddress'       => :MailingAddress,
    'mailingaddress2'      => :MailingAddress2,
    'postaladdress'        => :MailingAddress,
    'country'              => :Country,
    'telephone'            => :Telephone,
    'e-mail'               => :EMail,
    'preferredapacheid(s)' => :ApacheID,
    'notifyproject'        => :Project,
    'date'                 => :Date,
    'signature'            => :Signature,
  }

  # Canonicalise a field name found in the PDF.
  # Spaces and letter case are ignored for the lookup; a name that has no
  # entry in NAME2FIELD is returned exactly as it was given.
  def self.canon_field_name(pdfname)
    lookup_key = pdfname.delete(' ').downcase
    NAME2FIELD.fetch(lookup_key) { pdfname }
  end

  # Parse an ICLA PDF and return whatever user-entered data can be recovered.
  #
  # Sources are tried in this order:
  # 1. annotations: form fields (stored under their canonical field name) and
  #    FreeText annotations (stored in data[:text], grouped into lines by position)
  # 2. if that found nothing useful: strings drawn with show_text_with_positioning
  #    (stored in data[:text])
  # 3. failing that: the text of the first page, split into "label: value" entries
  #
  # path - the PDF to read; passed as is to PDF::Reader.new
  #
  # Returns a Hash. data[:_meta] holds the document metadata plus :dataSource, which
  # records the sources that produced data ('Form', 'FreeText', 'Text', 'Page').
  # Processing errors are not raised: they are reported in data[:error] together with
  # the backtrace of the failure. SystemExit and signals (e.g. Interrupt) propagate.
  def self.parse(path)
    data = {}
    metadata = {}
    data[:_meta] = metadata
    metadata[:dataSource] = {} # have we found anything
    freetext = {} # gather the free text details
    debug = {} # raw dumps; only reported if the data[:debug] line below is enabled
    begin
      reader = PDF::Reader.new(path)
      %w(pdf_version info metadata page_count).each do |i|
        metadata[i] = reader.public_send(i)
      end
      reader.objects.each do |_ref, v|
        # not every object can be indexed by a Symbol, hence the rescues
        type = v[:Type] rescue nil
        subtype = v[:Subtype] rescue nil
        next unless type == :Annot

        if subtype == :FreeText # These are not directly associated with forms
          rect = v[:Rect]
          contents = v[:Contents]
          # rect can be a reference. If so, it seems there may be multiple copies with different IDs but same Rect coords and contents
          rect = reader.objects.deref(rect) if rect.is_a?(PDF::Reader::Reference)
          if rect.is_a?(Array)
            if contents && !contents.empty? && contents != "\x14" # ignore "\x14" == ASCII DC4
              # Entries may be duplicated, so use a hash to store them
              id = rect.inspect + contents # if the rect and contents match, then they overwrite each other
              freetext[id] = { Contents: contents.strip, x: rect[0], y: rect[1] }
              metadata[:dataSource]['FreeText'] = true
            end
          else
            # report on stderr so the message cannot be mistaken for part of the result
            $stderr.puts "warn: #{contents} Rect is #{rect.class} in #{path}"
          end
        else
          key = v[:T]
          val = v[:V]
          if key && val
            debug["#{key}"] = v.inspect
            # This is a hack; should really find the font def and use that
            # NOTE(review): assumes val is a String; any other kind of value would raise here
            # and abort the whole parse - confirm whether such fields can occur
            if val.bytes[0..1] == [254, 255] # UTF-16 byte order mark
              val = val.encode('utf-8', 'utf-16').strip
            else
              begin
                val = val.encode('utf-8').strip
              rescue Encoding::UndefinedConversionError
                val = val.encode('utf-8', 'iso-8859-1').strip
              end
            end
            val.gsub!("\x7F", '') # Not sure where these originate
            data[canon_field_name(key)] = val unless val.empty?
            metadata[:dataSource]['Form'] = true
          end
        end
      end # objects
      unless freetext.empty?
        data[:text] = []
        # Sort by Y descending (down the page) and X ascending (across)
        # split into separate chunks if the difference in Y is more than a few points
        how_close = 3
        freetext.values. # no need for ids any more
          sort_by { |e| -e[:y] }. # sort by Y desc
          slice_when { |i, j| (i[:y] - j[:y]) > how_close }. # gather nearby Y values in case there are multiple entries on a line
          each do |row|
            data[:text] << row.
              sort_by { |e| e[:x] }. # sort by X ascending
              map { |e| e[:Contents] }.join(", ")
          end
      end
      # No annotations found or not useful
      if metadata[:dataSource].empty? || (Array(data[:text]).size <= 1 && data.size < 3)
        page1 = nil # cache for page 1
        fontdict = {}
        # Try looking for text sections instead
        receiver = Receiver.new(fontdict)
        reader.pages.each do |page|
          # extract the fonts (needed for conversion to utf-8)
          # NOTE(review): fonts are cached by label across pages; this assumes a label
          # refers to the same font on every page - confirm
          page.fonts.each do |label, font|
            fontdict[label] ||= PDF::Reader::Font.new(page.objects, page.objects.deref(font))
          end
          page.walk(receiver)
          page1 ||= page.text
        end
        # pick up the collected strings
        text = receiver.get_text
        debug[:lines] = receiver.get_lines # do we still need these?
        if text.length > 3
          metadata[:dataSource]['Text'] = true
          data[:text] = text
        else
          page1.each_line.slice_before(/^\s+Full name:/).each_with_index do |lump, i|
            next unless i == 1 # the lump that starts with Full name

            metadata[:dataSource]['Page'] = true
            # drop the postamble
            form = lump.slice_before(/^\S/).first
            # split into headers
            form.slice_before(/^\s+.+:/).each do |group|
              # trim leading and trailing blanks and underscores and drop empty strings
              # NOTE(review): the lines come from each_line and so still end with their
              # newline; a blank line is therefore not empty here - confirm this is intended
              line = group.map { |l| l.sub(/^[ _]+/, '').sub(/[ _]+$/, '') }.reject(&:empty?).join(',')
              if line =~ /^\s*(?:\(optional\) )?(.+):\s+(.*)/
                data[canon_field_name($1)] = $2
              else
                data[:unmatched] ||= []
                data[:unmatched] << line
              end
            end
          end
        end
      end
    rescue SystemExit, SignalException
      raise # a deliberate exit or an interrupt is not a parsing problem
    rescue Exception => e
      # Best effort: any other failure is reported to the caller rather than raised.
      # Use the exception's own backtrace so the report shows where the failure happened.
      data[:error] = "Error processing #{path} => #{e.inspect} #{e.backtrace}"
    end
#    data[:debug] = debug
    # TODO attempt to classify data[:text] items?
    data
  end
end

# When run as a script: pretty-print the result of parsing the PDF
# named by the first command-line argument
if $PROGRAM_NAME == __FILE__
  require 'pp'
  parsed = ICLAParser.parse(ARGV[0])
  pp parsed
end
