| #!/usr/bin/env ruby |
| |
| # tocsplit.rb processes an agenda/minutes file and extracts the Incubator ToCs, |
| # as some reports were created with more than one copy of the ToC |
| |
| require 'digest' |
| |
# First command-line argument (required): the agenda/minutes file to process.
file = ARGV.shift
raise "missing file" unless file

# Second command-line argument (optional): path prefix for the generated
# files; the output number and ".tmp" are appended to it by nextf.
prefix = ARGV.shift
TMP = prefix.nil? ? '/tmp/tocsplit' : prefix

# Output file counter. It starts at 100 and is incremented before each use,
# so the three-digit numbers make the generated files sort in order.
$outn = 100

# Currently open output file; nil until nextf has been called.
$out = nil
| |
# Switch output to the next numbered file.
#
# Bumps the global counter $outn, closes the current output file (if one is
# open) and opens "<TMP><$outn>.tmp" for writing, storing the handle in $out.
#
# @return [File] the newly opened output file (also available as $out)
def nextf
  $outn = $outn + 1
  $out&.close # nothing to close on the very first call
  path = "#{TMP}#{$outn}.tmp"
  $out = File.new(path, 'w')
end
| |
# Read the whole input and cut it at each attachment header matched below.
# The pattern is a zero-width lookahead, so the matched header stays at the
# start of the piece that follows it instead of being consumed by the split.
attachments = File.read(file).split(/(?=^-----+\r?\nAttachment A)/)

nextf # first output file: everything that precedes the Incubator report
attachments.each do |section|
  # Ordinary sections are copied straight through to the current output file.
  unless section.match?(/Report from the Apache Incubator Project/)
    $out.print section
    next
  end

  # Incubator report: cut it at every "Table of C..." heading. The pattern
  # tolerates arbitrary whitespace and stops at "C" - one heading is badly
  # mangled. The first piece is the text in front of the first ToC.
  parts = section.split(/(?=^-------+\s+Table\s+of\s+C)/)
  puts "Found #{parts.length - 1} ToC sections"

  # Hashes are only reported when the report holds more than one ToC
  # (presumably so the duplicate copies can be compared).
  several_tocs = parts.length > 2

  parts.each.with_index(1) do |part, position|
    nextf # every piece goes into a file of its own
    $out.print part
    next unless position > 1 && several_tocs # position 1 is the leading text

    exact = Digest::SHA256.hexdigest(part).slice(0, 16)
    # Same digest with all whitespace removed, so that copies differing only
    # in spacing or line endings still produce the same value.
    squashed = Digest::SHA256.hexdigest(part.gsub(/\s+/,'')).slice(0, 16)
    puts "ToC length: #{part.length} hash: #{exact} squashed: #{squashed}"
  end

  nextf # fresh file for whatever follows the Incubator report
end

$out.close if $out