blob: c14d8ecdb31f1a7c15dc845099752543a2df8fb4 [file] [log] [blame]
#!/usr/bin/perl -- # -*- Perl -*-
# this needs some cleanup...
my $PSTOTEXT = "pstotext";
my $pdf = shift @ARGV;
my $index = "";
my $inindex = 0;
open (F, "$PSTOTEXT $pdf |");
while (<F>) {
if (/^<\/index/) {
$index .= $_;
$inindex = 0;
}
$inindex = 1 if /^<index/;
if ($inindex) {
$index .= $_ if /^\s*</;
}
}
my $cindex = "";
while ($index =~ /^(.*?)((<phrase role=\"pageno\">.*?<\/phrase>\s*)+)/s) {
$cindex .= $1;
$_ = $2;
$index = $'; # '
my @pages = m/<phrase role=\"pageno\">.*?<\/phrase>\s*/sg;
# Expand ranges
if ($#pages >= 0) {
my @mpages = ();
foreach my $page (@pages) {
my $pageno = &pageno($page);
if ($pageno =~ /^([0-9]+)[^0-9]([0-9]+)$/) { # funky -
for (my $count = $1; $count <= $2; $count++) {
push (@mpages, "<phrase role=\"$pageno\">$count</phrase>");
}
} else {
push (@mpages, $page);
}
}
@pages = sort rangesort @mpages;
}
# Remove duplicates...
if ($#pages > 0) {
my @mpages = ();
my $current = "";
foreach my $page (@pages) {
my $pageno = &pageno($page);
if ($pageno ne $current) {
push (@mpages, $page);
$current = $pageno;
}
}
@pages = @mpages;
}
# Collapse ranges...
if ($#pages > 1) {
my @cpages = ();
while (@pages) {
my $count = 0;
my $len = &rangelen($count, @pages);
if ($len <= 2) {
my $page = shift @pages;
push (@cpages, $page);
} else {
my $fpage = shift @pages;
my $lpage = "";
while ($len > 1) {
$lpage = shift @pages;
$len--;
}
my $fpno = &pageno($fpage);
my $lpno = &pageno($lpage);
$fpage =~ s/>$fpno</>${fpno}-$lpno</s;
push (@cpages, $fpage);
}
}
@pages = @cpages;
}
my $page = shift @pages;
$page =~ s/\s*$//s;
$cindex .= $page;
while (@pages) {
$page = shift @pages;
$page =~ s/\s*$//s;
$cindex .= ", $page";
}
}
$cindex .= $index;
print "$cindex\n";
sub pageno {
my $page = shift;
$page =~ s/^<phrase.*?>//;
$page =~ s/^<link.*?>//;
return $1 if $page =~ /^([^<>]+)/;
return "?";
}
sub rangesort {
my $apno = &pageno($a);
my $bpno = &pageno($b);
# Make sure roman pages come before arabic ones, otherwise sort them in order
return -1 if ($apno !~ /^\d+/ && $bpno =~ /^\d+/);
return 1 if ($apno =~ /^\d+/ && $bpno !~ /^\d+/);
return $apno <=> $bpno;
}
sub rangelen {
my $count = shift;
my @pages = @_;
my $len = 1;
my $inrange = 1;
my $current = &pageno($pages[$count]);
while ($count < $#pages && $inrange) {
$count++;
my $next = &pageno($pages[$count]);
if ($current + 1 eq $next) {
$current = $next;
$inrange = 1;
$len++;
} else {
$inrange = 0;
}
}
return $len;
}