blob: c5f0ea4eefb777eba61f6305fbe8e6dc138a59e6 [file] [log] [blame]
#!/usr/bin/perl
#
# Uses a fuzzy hash to find closely-related mails, which might only
# differ by an ID string somewhere or similar. Consider it "Razor Lite" ;)
#
# usage: find-dups corpusdir/* > possible-dups.list
# or: find-dups corpusdir1 [dir2 dir3 ...] > possible-dups.list
$DUMP_WITH_KEY = 0; # TODO: use getopt
$COMMANDS = 1;
my $hashes2 = { };
my $hashes3 = { };
my $hashes4 = { };
use File::Find;
use Digest::SHA qw(sha1_base64);
my @files = ();
foreach my $file (@ARGV) {
if (-d $file) {
find (\&wanted, $file);
sub wanted {
(-f $_) and push (@files, $File::Find::name);
}
} else {
push (@files, $file);
}
}
foreach my $file (@files) {
open (STDIN, "<$file") or warn "$file cannot be opened";
my @hash = do_one();
close STDIN;
#print "$hash\t$file\n";
my $hash2 = $hash[0].$hash[1];
my $hash3 = $hash[0].$hash[1].$hash[2];
my $hash4 = $hash[0].$hash[1].$hash[2].$hash[3];
$hashes4->{$hash4} .= " ".$file;
$hashes3->{$hash3} .= " ".$file;
$hashes2->{$hash2} .= " ".$file;
}
check_collisions ($hashes4);
check_collisions ($hashes3);
check_collisions ($hashes2);
exit;
sub check_collisions {
my ($db, $hash, $file) = @_;
foreach $k (sort keys %{$db}) {
$_ = $db->{$k};
next unless (/\S \S/);
s/^ //g;
if ($DUMP_WITH_KEY) {
print "$_ [$k]\n"; # to print the key
} elsif ($COMMANDS) {
my $count = 0;
while (m/ /g) { $count++; }
/^(\S+) (.*)$/;
print "echo \"$1 : $count dups\"; rm -f $2\n";
} else {
print "$_\n";
}
delete $hashes4->{$k};
delete $hashes3->{$k};
delete $hashes2->{$k};
}
}
sub do_one {
while (<STDIN>) { /^$/ and last; }
my $str = join ('', <STDIN>);
# strip HTML tags, email addresses, queries
# Add more strippings here if you like.
$str =~ s/<[^>]+?>/ /igs;
$str =~ s/"[^\"\s]+\?[^\"\s]+\"/ /igs;
$str =~ s/\S+\?\S+/ /igs;
$str =~ s/\S+\@\S+/ /igs;
$str =~ s/TRCK:\S+//;
$str =~ s/^[a-z0-9]{6,}[-_a-z0-9]{12,}[a-z0-9]{6,}\s*\z//is;
$str =~ s/^\s*\S{24,}\s*\z//is;
#print $str;
my @data = split (/\n/, $str);
my $lpb = ($#data+1) / 4;
#warn "JMD $#lines $lpb";
my @blks = ();
push (@blks, join ('', splice (@data, 0, $lpb)));
push (@blks, join ('', splice (@data, 0, $lpb)));
push (@blks, join ('', splice (@data, 0, $lpb)));
push (@blks, join ('', splice (@data, 0, $lpb)));
my @ret = ();
foreach my $blk (@blks) {
#warn "JMD $blk";
#my $digest = sprintf ("%05d", unpack ("%16C*", $blk));
my $digest = sha1_base64($blk);
push (@ret, $digest);
}
@ret;
}