#!/usr/bin/env ruby
# DRAFT DRAFT DRAFT
#
# ICLA PDF parsing support
#
# Try to extract user-entered text from ICLA PDFs.
# The pdf-reader gem does not produce a fully accurate text version of the page,
# and extracting the text reliably is tricky, so other methods are tried first:
# - if there is a form, return its fields
# - if there are FreeText Annotations, return them in page order
# - otherwise use show_text_with_positioning, as that seems to be used for PDF updates
# - where the PDF only uses show_text, the gem is better at combining the data, so use page.text
require 'pdf-reader'
# TODO perhaps always extract all the data types then choose the best
# Should turn hash values into arrays?
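#
# Illustrative result shape only (keys depend on the PDF; the sample values
# below are hypothetical):
#   {:_meta=>{:dataSource=>{"Form"=>true}, "page_count"=>1, ...},
#    :FullName=>"Jane Doe", :EMail=>"jane@example.org", :Country=>"Exampleland"}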
module ICLAParser
  # Process page to extract text with positioning elements
  # These are often used instead of providing form fields
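  #
  # Note: this relies on PDF::Reader's receiver API, where Page#walk dispatches
  # content-stream operators to receiver methods of the same name; only the
  # callbacks defined below (begin_text_object, show_text, etc.) are handled.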
  class Receiver
    SKIP = [
      # Short elements that are not user data
      'Individual Contributor',
      'License Agreement',
      '("Agreement") V2.0',
      "as \"Not a Contribution.\"",
      "inaccurate in any respect.",
      "for your records.",
      "1. Definitions.",
      "Contributions and such derivative works.",
      "litigation is filed.",
      "Contributions."
    ]

    def initialize(fontdict)
      @texts = [] # show_text_with_positioning
      @lines = [] # show_text
      @tfs = nil  # text font and size
      @fontdict = fontdict
    end
    # Some PDFs use show_text() multiple times in a line
    def begin_text_object
      @textobj = []
    end

    def end_text_object
      @lines << @textobj.join('')
    end

    def set_text_font_and_size(*args)
      @tfs = args
    end

    def show_text(string)
      font = @fontdict[@tfs.first]
      utf8 = ICLAParser.string_to_utf8(string, font)
      @textobj << utf8
    end
    def show_text_with_positioning(*args)
      font = @fontdict[@tfs.first]
      # args are Strings (in the current font encoding) interspersed with integer
      # spacing adjustments; only want the strings
      # We assume the positioning does not overlay characters so can be ignored
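      # Illustrative only (exact values vary by PDF and font encoding):
      #   args might be [["J", 120, "ane ", 80, "Doe"]], which joins to "Jane Doe"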
      chars = []
      args.flatten.each do |arg|
        if arg.is_a?(String)
          char = ICLAParser.string_to_utf8(arg, font)
          chars << char
        end
      end
      val = chars.join("").strip
      len = val.length
      # some PDFs have the individual text in this format so skip long lines which are unlikely to be user data
      # Could perhaps have full list of expected text lines instead.
      unless len == 0 or len > 50 or SKIP.include? val
        @texts << val
      end
    end

    def get_text
      @texts
    end

    def get_lines
      @lines
    end
  end
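
  # Convert a raw content-stream string to UTF-8 using the supplied PDF::Reader::Font.
  # Illustrative only: with a simple single-byte encoding, font.unpack("Joe")
  # returns [74, 111, 101] and font.to_utf8 maps each code to "J", "o", "e".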
  def self.string_to_utf8(string, font)
    chars = []
    glyphs = font.unpack(string)
    glyphs.each do |glyph_code|
      char = font.to_utf8(glyph_code)
      # One pdf (yev) has spurious \t\r<sp>?<nbsp> translated from 36 => [9, 13, 32, 194, 160]
      if glyph_code == 36 and char =~ /^\t\r /
        char = ' '
      end
      chars << char
    end
    chars.join('')
  end

  # Standard form field names for other code to use
  NAME2FIELD = {
    'fullname' => :FullName,
    'publicname' => :PublicName,
    'mailingaddress' => :MailingAddress,
    'mailingaddress2' => :MailingAddress2,
    'postaladdress' => :MailingAddress,
    'country' => :Country,
    'telephone' => :Telephone,
    'e-mail' => :EMail,
    'preferredapacheid(s)' => :ApacheID,
    'notifyproject' => :Project,
    'date' => :Date,
    'signature' => :Signature,
  }
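
  # canon_field_name (below) maps e.g. 'Full name' => :FullName and 'E-mail' => :EMail;
  # unrecognised names (e.g. the hypothetical 'Fax') are returned unchanged.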
  # canonicalise the names found in the PDF
  def self.canon_field_name(pdfname)
    NAME2FIELD[pdfname.gsub(' ','').downcase] || pdfname
  end

  # parse the PDF
  def self.parse(path)
    data = {}
    metadata = {}
    data[:_meta] = metadata
    metadata[:dataSource] = {} # have we found anything
    freetext = {} # gather the free text details
    debug = {}
    begin
      reader = PDF::Reader.new(path)
      %w(pdf_version info metadata page_count).each do |i|
        metadata[i] = reader.public_send(i)
      end
      reader.objects.each do |k,v|
        type = v[:Type] rescue nil
        subtype = v[:Subtype] rescue nil
        if type == :Annot
          if subtype == :FreeText # These are not directly associated with forms
            rect = v[:Rect]
            # rect can be a reference. If so, it seems there may be multiple copies
            # with different IDs but same Rect coords and contents
            if rect.is_a?(PDF::Reader::Reference)
              rect = reader.objects.deref(rect)
            end
            if rect.is_a?(Array)
              contents = v[:Contents]
              if contents and contents.length > 0 and contents != "\x14" # ignore "\x14" == ASCII DC4
                # Entries may be duplicated, so use a hash to store them
                id = rect.inspect + contents # if the rect and contents match, then they overwrite each other
                freetext[id] = {Contents: contents.strip, x: rect[0], y: rect[1]}
                metadata[:dataSource]['FreeText'] = true
              end
            else
              puts "warn: #{v[:Contents]} Rect is #{rect.class} in #{path}"
            end
          else
            key = v[:T]
            if key
              val = v[:V]
              # This is a hack; should really find the font def and use that
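              # Illustrative only: a form value stored as UTF-16BE with a BOM,
              # e.g. "\xFE\xFF\x00J\x00o\x00e", decodes to "Joe" below; otherwise
              # the value is assumed to be UTF-8 (or ISO-8859-1 as a fallback).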
              if val
                debug["#{key}"] = v.inspect
                if val.bytes[0..1] == [254,255]
                  val = val.encode('utf-8','utf-16').strip
                else
                  begin
                    val = val.encode('utf-8').strip
                  rescue Encoding::UndefinedConversionError
                    val = val.encode('utf-8','iso-8859-1').strip
                  end
                end
                val.gsub!("\x7F",'') # Not sure where these originate
                if val.length > 0
                  data[canon_field_name(key)] = val
                end
                metadata[:dataSource]['Form'] = true
              end
            end
          end
        else
        end
      end # objects

      if freetext.size > 0
        data[:text] = []
        # Sort by Y descending (down the page) and X ascending (across);
        # split into separate chunks if the difference in Y is more than a few points
        how_close = 3
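        # Illustrative only: entries at y=700 and y=699 land in the same chunk
        # (joined with ", " in X order), while one at y=650 starts a new chunk
        # because the Y gap exceeds how_close.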
        freetext.values. # no need for ids any more
          sort_by{|e| -e[:y] }. # sort by Y desc
          slice_when{|i,j| (i[:y]-j[:y]) > how_close}. # gather nearby Y values in case there are multiple entries on a line
          each do |k|
            data[:text] << k.
              sort_by{|l| l[:x]}. # sort by X ascending
              map{|v| v[:Contents]}.join(", ")
          end
      end

      if metadata[:dataSource].size == 0 or ((data[:text].size rescue 0) <= 1 and data.size < 3) # No annotations found, or not useful
        page1 = nil # cache for page 1 text
        fontdict = Hash.new
        # Try looking for text sections instead
        receiver = Receiver.new(fontdict)
        reader.pages.each do |page|
          # extract the fonts (needed for conversion to utf-8)
          page.fonts.each do |label, font|
            fontdict[label] ||= PDF::Reader::Font.new(page.objects, page.objects.deref(font))
          end
          page.walk(receiver)
          page1 ||= page.text
        end
        # pick up the collected strings
        text = receiver.get_text()
        # p text
        lines = receiver.get_lines() # do we still need these?
        debug[:lines] = lines
        if text.length > 3
          metadata[:dataSource]['Text'] = true
          data[:text] = text
        else
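          # Fall back to scraping the plain page text. Illustrative layout only
          # (the values shown are hypothetical); the gem renders the form roughly as:
          #     Full name: Jane Doe
          #     Mailing Address: 1 Example Street
          # and the regexps below pick out the label before ':' and the value after it.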
          page1.each_line.slice_before(/^\s+Full name:/).each_with_index do |lump, i|
            if i == 1 # starts with Full name
              metadata[:dataSource]['Page'] = true
              # drop the postamble
              form = lump.slice_before(/^\S/).first
              # split into headers
              form.slice_before(/^\s+.+:/).each do |lines|
                # trim leading and trailing blanks and underscores and drop blank lines
                line = lines.map{|l| l.sub(/^[ _]+/,'').sub(/[ _]+$/,'')}.select{|l| l.length > 0}.join(',')
                case line
                when /^\s*(?:\(optional\) )?(.+):\s+(.*)/
                  data[canon_field_name($1)] = $2
                else
                  data[:unmatched] ||= []
                  data[:unmatched] << line
                end
              end
            end
          end
        end
      end
    rescue Exception => e
      data[:error] = "Error processing #{path} => #{e.inspect} #{caller}"
    end
    # data[:debug] = debug
    # TODO attempt to classify data[:text] items?
    data
  end
end

if __FILE__ == $0
  require 'pp'
  pp ICLAParser.parse(ARGV.first)
end