#!/usr/bin/env ruby
# DRAFT DRAFT DRAFT
#
# ICLA PDF parsing support
#
# Try to extract user-entered text from ICLA PDFs.
# The pdf-reader gem does not produce a fully accurate text version of the page,
# and extracting the text reliably is tricky, so other methods are tried first:
# - if there is a form, return its fields
# - if there are FreeText Annotations, return them in page order
# - otherwise use show_text_with_positioning, as that seems to be used for PDF updates
# - where the PDF only uses show_text, the gem is better at combining the data, so use page.text
require 'pdf-reader'
# TODO perhaps always extract all the data types then choose the best
# Should turn hash values into arrays?
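#
# Illustrative result shape only (keys depend on the PDF; the sample values
# below are hypothetical):
#   {:_meta=>{:dataSource=>{"Form"=>true}, "page_count"=>1, ...},
#    :FullName=>"Jane Doe", :EMail=>"jane@example.org", :Country=>"Exampleland"}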
module ICLAParser
  # Process page to extract text with positioning elements
  # These are often used instead of providing form fields
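  #
  # Note: this relies on PDF::Reader's receiver API, where Page#walk dispatches
  # content-stream operators to receiver methods of the same name; only the
  # callbacks defined below (begin_text_object, show_text, etc.) are handled.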
  class Receiver
    SKIP = [
      # Short elements that are not user data
      'Individual Contributor',
      'License Agreement',
      '("Agreement") V2.0',
      "as \"Not a Contribution.\"",
      "inaccurate in any respect.",
      "for your records.",
      "1. Definitions.",
      "Contributions and such derivative works.",
      "litigation is filed.",
      "Contributions."
    ]

    def initialize(fontdict)
      @texts = [] # show_text_with_positioning
      @lines = [] # show_text
      @tfs = nil  # text font and size
      @fontdict = fontdict
    end
    # Some PDFs use show_text() multiple times in a line
    def begin_text_object
      @textobj = []
    end

    def end_text_object
      @lines << @textobj.join('')
    end

    def set_text_font_and_size(*args)
      @tfs = args
    end

    def show_text(string)
      font = @fontdict[@tfs.first]
      utf8 = ICLAParser.string_to_utf8(string, font)
      @textobj << utf8
    end
    def show_text_with_positioning(*args)
      font = @fontdict[@tfs.first]
      # args are Strings (in the current font encoding) interspersed with integer
      # spacing adjustments; only want the strings
      # We assume the positioning does not overlay characters so can be ignored
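      # Illustrative only (exact values vary by PDF and font encoding):
      #   args might be [["J", 120, "ane ", 80, "Doe"]], which joins to "Jane Doe"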
      chars = []
      args.flatten.each do |arg|
        if arg.is_a?(String)
          char = ICLAParser.string_to_utf8(arg, font)
          chars << char
        end
      end
      val = chars.join("").strip
      len = val.length
      # some PDFs have the individual text in this format so skip long lines which are unlikely to be user data
      # Could perhaps have full list of expected text lines instead.
      unless len == 0 or len > 50 or SKIP.include? val
        @texts << val
      end
    end

    def get_text
      @texts
    end

    def get_lines
      @lines
    end
  end
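
  # Convert a raw content-stream string to UTF-8 using the supplied PDF::Reader::Font.
  # Illustrative only: with a simple single-byte encoding, font.unpack("Joe")
  # returns [74, 111, 101] and font.to_utf8 maps each code to "J", "o", "e".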
  def self.string_to_utf8(string, font)
    chars = []
    glyphs = font.unpack(string)
    glyphs.each do |glyph_code|
      char = font.to_utf8(glyph_code)
      # One pdf (yev) has spurious \t\r<sp>?<nbsp> translated from 36 => [9, 13, 32, 194, 160]
      if glyph_code == 36 and char =~ /^\t\r /
        char = ' '
      end
      chars << char
    end
    chars.join('')
  end

  # Standard form field names for other code to use
  NAME2FIELD = {
    'fullname' => :FullName,
    'publicname' => :PublicName,
    'mailingaddress' => :MailingAddress,
    'mailingaddress2' => :MailingAddress2,
    'postaladdress' => :MailingAddress,
    'country' => :Country,
    'telephone' => :Telephone,
    'e-mail' => :EMail,
    'preferredapacheid(s)' => :ApacheID,
    'notifyproject' => :Project,
    'date' => :Date,
    'signature' => :Signature,
  }
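
  # canon_field_name (below) maps e.g. 'Full name' => :FullName and 'E-mail' => :EMail;
  # unrecognised names (e.g. the hypothetical 'Fax') are returned unchanged.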
  # canonicalise the names found in the PDF
  def self.canon_field_name(pdfname)
    NAME2FIELD[pdfname.gsub(' ','').downcase] || pdfname
  end

  # parse the PDF
  def self.parse(path)
    data = {}
    metadata = {}
    data[:_meta] = metadata
    metadata[:dataSource] = {} # have we found anything
    freetext = {} # gather the free text details
    debug = {}
    begin
      reader = PDF::Reader.new(path)
      %w(pdf_version info metadata page_count).each do |i|
        metadata[i] = reader.public_send(i)
      end
      reader.objects.each do |k,v|
        type = v[:Type] rescue nil
        subtype = v[:Subtype] rescue nil
        if type == :Annot
          if subtype == :FreeText # These are not directly associated with forms
            rect = v[:Rect]
            # rect can be a reference. If so, it seems there may be multiple copies
            # with different IDs but same Rect coords and contents
            if rect.is_a?(PDF::Reader::Reference)
              rect = reader.objects.deref(rect)
            end
            if rect.is_a?(Array)
              contents = v[:Contents]
              if contents and contents.length > 0 and contents != "\x14" # ignore "\x14" == ASCII DC4
                # Entries may be duplicated, so use a hash to store them
                id = rect.inspect + contents # if the rect and contents match, then they overwrite each other
                freetext[id] = {Contents: contents.strip, x: rect[0], y: rect[1]}
                metadata[:dataSource]['FreeText'] = true
              end
            else
              puts "warn: #{v[:Contents]} Rect is #{rect.class} in #{path}"
            end
          else
            key = v[:T]
            if key
              val = v[:V]
              # This is a hack; should really find the font def and use that
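              # Illustrative only: a form value stored as UTF-16BE with a BOM,
              # e.g. "\xFE\xFF\x00J\x00o\x00e", decodes to "Joe" below; otherwise
              # the value is assumed to be UTF-8 (or ISO-8859-1 as a fallback).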
              if val
                debug["#{key}"] = v.inspect
                if val.bytes[0..1] == [254,255]
                  val = val.encode('utf-8','utf-16').strip
                else
                  begin
                    val = val.encode('utf-8').strip
                  rescue Encoding::UndefinedConversionError
                    val = val.encode('utf-8','iso-8859-1').strip
                  end
                end
                val.gsub!("\x7F",'') # Not sure where these originate
                if val.length > 0
                  data[canon_field_name(key)] = val
                end
                metadata[:dataSource]['Form'] = true
              end
            end
          end
        else
        end
      end # objects

      if freetext.size > 0
        data[:text] = []
        # Sort by Y descending (down the page) and X ascending (across);
        # split into separate chunks if the difference in Y is more than a few points
        how_close = 3
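        # Illustrative only: entries at y=700 and y=699 land in the same chunk
        # (joined with ", " in X order), while one at y=650 starts a new chunk
        # because the Y gap exceeds how_close.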
        freetext.values. # no need for ids any more
          sort_by{|e| -e[:y] }. # sort by Y desc
          slice_when{|i,j| (i[:y]-j[:y]) > how_close}. # gather nearby Y values in case there are multiple entries on a line
          each do |k|
            data[:text] << k.
              sort_by{|l| l[:x]}. # sort by X ascending
              map{|v| v[:Contents]}.join(", ")
          end
      end

      if metadata[:dataSource].size == 0 or ((data[:text].size rescue 0) <= 1 and data.size < 3) # No annotations found, or not useful
        page1 = nil # cache for page 1 text
        fontdict = Hash.new
        # Try looking for text sections instead
        receiver = Receiver.new(fontdict)
        reader.pages.each do |page|
          # extract the fonts (needed for conversion to utf-8)
          page.fonts.each do |label, font|
            fontdict[label] ||= PDF::Reader::Font.new(page.objects, page.objects.deref(font))
          end
          page.walk(receiver)
          page1 ||= page.text
        end
        # pick up the collected strings
        text = receiver.get_text()
        # p text
        lines = receiver.get_lines() # do we still need these?
        debug[:lines] = lines
        if text.length > 3
          metadata[:dataSource]['Text'] = true
          data[:text] = text
        else
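          # Fall back to scraping the plain page text. Illustrative layout only
          # (the values shown are hypothetical); the gem renders the form roughly as:
          #     Full name: Jane Doe
          #     Mailing Address: 1 Example Street
          # and the regexps below pick out the label before ':' and the value after it.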
          page1.each_line.slice_before(/^\s+Full name:/).each_with_index do |lump, i|
            if i == 1 # starts with Full name
              metadata[:dataSource]['Page'] = true
              # drop the postamble
              form = lump.slice_before(/^\S/).first
              # split into headers
              form.slice_before(/^\s+.+:/).each do |lines|
                # trim leading and trailing blanks and underscores and drop blank lines
                line = lines.map{|l| l.sub(/^[ _]+/,'').sub(/[ _]+$/,'')}.select{|l| l.length > 0}.join(',')
                case line
                when /^\s*(?:\(optional\) )?(.+):\s+(.*)/
                  data[canon_field_name($1)] = $2
                else
                  data[:unmatched] ||= []
                  data[:unmatched] << line
                end
              end
            end
          end
        end
      end
    rescue Exception => e
      data[:error] = "Error processing #{path} => #{e.inspect} #{caller}"
    end
    # data[:debug] = debug
    # TODO attempt to classify data[:text] items?
    data
  end
end

if __FILE__ == $0
  require 'pp'
  pp ICLAParser.parse(ARGV.first)
end