| #!/usr/bin/perl |
| # |
| # Uses a fuzzy hash to find closely-related mails, which might only |
| # differ by an ID string somewhere or similar. Consider it "Razor Lite" ;) |
| # |
| # usage: find-dups corpusdir/* > possible-dups.list |
| # or: find-dups corpusdir1 [dir2 dir3 ...] > possible-dups.list |
| |
| $DUMP_WITH_KEY = 0; # TODO: use getopt |
| $COMMANDS = 1; |
| |
| my $hashes2 = { }; |
| my $hashes3 = { }; |
| my $hashes4 = { }; |
| |
| use File::Find; |
| use Digest::SHA qw(sha1_base64); |
| |
| my @files = (); |
| foreach my $file (@ARGV) { |
| if (-d $file) { |
| find (\&wanted, $file); |
| sub wanted { |
| (-f $_) and push (@files, $File::Find::name); |
| } |
| } else { |
| push (@files, $file); |
| } |
| } |
| |
| foreach my $file (@files) { |
| open (STDIN, "<$file") or warn "$file cannot be opened"; |
| my @hash = do_one(); |
| close STDIN; |
| #print "$hash\t$file\n"; |
| |
| my $hash2 = $hash[0].$hash[1]; |
| my $hash3 = $hash[0].$hash[1].$hash[2]; |
| my $hash4 = $hash[0].$hash[1].$hash[2].$hash[3]; |
| |
| $hashes4->{$hash4} .= " ".$file; |
| $hashes3->{$hash3} .= " ".$file; |
| $hashes2->{$hash2} .= " ".$file; |
| } |
| |
| check_collisions ($hashes4); |
| check_collisions ($hashes3); |
| check_collisions ($hashes2); |
| exit; |
| |
| sub check_collisions { |
| my ($db, $hash, $file) = @_; |
| |
| foreach $k (sort keys %{$db}) { |
| $_ = $db->{$k}; |
| next unless (/\S \S/); |
| s/^ //g; |
| |
| if ($DUMP_WITH_KEY) { |
| print "$_ [$k]\n"; # to print the key |
| } elsif ($COMMANDS) { |
| my $count = 0; |
| while (m/ /g) { $count++; } |
| /^(\S+) (.*)$/; |
| print "echo \"$1 : $count dups\"; rm -f $2\n"; |
| } else { |
| print "$_\n"; |
| } |
| |
| delete $hashes4->{$k}; |
| delete $hashes3->{$k}; |
| delete $hashes2->{$k}; |
| } |
| } |
| |
| sub do_one { |
| while (<STDIN>) { /^$/ and last; } |
| |
| my $str = join ('', <STDIN>); |
| |
| # strip HTML tags, email addresses, queries |
| # Add more strippings here if you like. |
| $str =~ s/<[^>]+?>/ /igs; |
| $str =~ s/"[^\"\s]+\?[^\"\s]+\"/ /igs; |
| $str =~ s/\S+\?\S+/ /igs; |
| $str =~ s/\S+\@\S+/ /igs; |
| |
| $str =~ s/TRCK:\S+//; |
| |
| $str =~ s/^[a-z0-9]{6,}[-_a-z0-9]{12,}[a-z0-9]{6,}\s*\z//is; |
| $str =~ s/^\s*\S{24,}\s*\z//is; |
| |
| #print $str; |
| |
| my @data = split (/\n/, $str); |
| my $lpb = ($#data+1) / 4; |
| |
| #warn "JMD $#lines $lpb"; |
| |
| my @blks = (); |
| push (@blks, join ('', splice (@data, 0, $lpb))); |
| push (@blks, join ('', splice (@data, 0, $lpb))); |
| push (@blks, join ('', splice (@data, 0, $lpb))); |
| push (@blks, join ('', splice (@data, 0, $lpb))); |
| |
| my @ret = (); |
| foreach my $blk (@blks) { |
| #warn "JMD $blk"; |
| #my $digest = sprintf ("%05d", unpack ("%16C*", $blk)); |
| my $digest = sha1_base64($blk); |
| push (@ret, $digest); |
| } |
| @ret; |
| } |
| |