masses/corpora/fuzzy-hash-maildir - spamassassin - Git at Google

 #!/usr/bin/perl
 #
 # Uses a fuzzy hash to find closely-related mails, which might only
 # differ by an ID string somewhere or similar.  Consider it "Razor Lite" ;)
 #
 # usage: find-dups corpusdir/* > possible-dups.list
 # or:    find-dups corpusdir1 [dir2 dir3 ...] > possible-dups.list

 $DUMP_WITH_KEY = 0;	# TODO: use getopt
 $COMMANDS = 1;

 my $hashes2 = { };
 my $hashes3 = { };
 my $hashes4 = { };

 use File::Find;
 use Digest::SHA qw(sha1_base64);

 my @files = ();
 foreach my $file (@ARGV) {
   if (-d $file) {
     find (\&wanted, $file);
     sub wanted {
       (-f $_) and push (@files, $File::Find::name);
     }
   } else {
     push (@files, $file);
   }
 }

 foreach my $file (@files) {
   open (STDIN, "<$file") or warn "$file cannot be opened";
   my @hash = do_one();
   close STDIN;
   #print "$hash\t$file\n";

   my $hash2 = $hash[0].$hash[1];
   my $hash3 = $hash[0].$hash[1].$hash[2];
   my $hash4 = $hash[0].$hash[1].$hash[2].$hash[3];

   $hashes4->{$hash4} .= " ".$file;
   $hashes3->{$hash3} .= " ".$file;
   $hashes2->{$hash2} .= " ".$file;
 }

 check_collisions ($hashes4);
 check_collisions ($hashes3);
 check_collisions ($hashes2);
 exit;

 sub check_collisions {
   my ($db, $hash, $file) = @_;

   foreach $k (sort keys %{$db}) {
     $_ = $db->{$k};
     next unless (/\S \S/);
     s/^ //g;

     if ($DUMP_WITH_KEY) {
       print "$_  [$k]\n";	# to print the key
     } elsif ($COMMANDS) {
       my $count = 0;
       while (m/ /g) { $count++; }
       /^(\S+) (.*)$/;
       print "echo \"$1 : $count dups\"; rm -f $2\n";
     } else {
       print "$_\n";
     }

     delete $hashes4->{$k};
     delete $hashes3->{$k};
     delete $hashes2->{$k};
   }
 }

 sub do_one {
   while (<STDIN>) { /^$/ and last; }

   my $str = join ('', <STDIN>);

   # strip HTML tags, email addresses, queries
   # Add more strippings here if you like.
   $str =~ s/<[^>]+?>/ /igs;
   $str =~ s/"[^\"\s]+\?[^\"\s]+\"/ /igs;
   $str =~ s/\S+\?\S+/ /igs;
   $str =~ s/\S+\@\S+/ /igs;

   $str =~ s/TRCK:\S+//;

   $str =~ s/^[a-z0-9]{6,}[-_a-z0-9]{12,}[a-z0-9]{6,}\s*\z//is;
   $str =~ s/^\s*\S{24,}\s*\z//is;

 #print $str;

   my @data = split (/\n/, $str);
   my $lpb = ($#data+1) / 4;

 #warn "JMD $#lines $lpb";

   my @blks = ();
   push (@blks, join ('', splice (@data, 0, $lpb)));
   push (@blks, join ('', splice (@data, 0, $lpb)));
   push (@blks, join ('', splice (@data, 0, $lpb)));
   push (@blks, join ('', splice (@data, 0, $lpb)));

   my @ret = ();
   foreach my $blk (@blks) {
     #warn "JMD $blk";
     #my $digest = sprintf ("%05d", unpack ("%16C*", $blk));
     my $digest = sha1_base64($blk);
     push (@ret, $digest);
   }
   @ret;
 }
	#!/usr/bin/perl
	#
	# Uses a fuzzy hash to find closely-related mails, which might only
	# differ by an ID string somewhere or similar. Consider it "Razor Lite" ;)
	#
	# usage: find-dups corpusdir/* > possible-dups.list
	# or: find-dups corpusdir1 [dir2 dir3 ...] > possible-dups.list

	$DUMP_WITH_KEY = 0; # TODO: use getopt
	$COMMANDS = 1;

	my $hashes2 = { };
	my $hashes3 = { };
	my $hashes4 = { };

	use File::Find;
	use Digest::SHA qw(sha1_base64);

	my @files = ();
	foreach my $file (@ARGV) {
	if (-d $file) {
	find (\&wanted, $file);
	sub wanted {
	(-f $_) and push (@files, $File::Find::name);
	}
	} else {
	push (@files, $file);
	}
	}

	foreach my $file (@files) {
	open (STDIN, "<$file") or warn "$file cannot be opened";
	my @hash = do_one();
	close STDIN;
	#print "$hash\t$file\n";

	my $hash2 = $hash[0].$hash[1];
	my $hash3 = $hash[0].$hash[1].$hash[2];
	my $hash4 = $hash[0].$hash[1].$hash[2].$hash[3];

	$hashes4->{$hash4} .= " ".$file;
	$hashes3->{$hash3} .= " ".$file;
	$hashes2->{$hash2} .= " ".$file;
	}

	check_collisions ($hashes4);
	check_collisions ($hashes3);
	check_collisions ($hashes2);
	exit;

	sub check_collisions {
	my ($db, $hash, $file) = @_;

	foreach $k (sort keys %{$db}) {
	$_ = $db->{$k};
	next unless (/\S \S/);
	s/^ //g;

	if ($DUMP_WITH_KEY) {
	print "$_ [$k]\n"; # to print the key
	} elsif ($COMMANDS) {
	my $count = 0;
	while (m/ /g) { $count++; }
	/^(\S+) (.*)$/;
	print "echo \"$1 : $count dups\"; rm -f $2\n";
	} else {
	print "$_\n";
	}

	delete $hashes4->{$k};
	delete $hashes3->{$k};
	delete $hashes2->{$k};
	}
	}

	sub do_one {
	while (<STDIN>) { /^$/ and last; }

	my $str = join ('', <STDIN>);

	# strip HTML tags, email addresses, queries
	# Add more strippings here if you like.
	$str =~ s/<[^>]+?>/ /igs;
	$str =~ s/"[^\"\s]+\?[^\"\s]+\"/ /igs;
	$str =~ s/\S+\?\S+/ /igs;
	$str =~ s/\S+\@\S+/ /igs;

	$str =~ s/TRCK:\S+//;

	$str =~ s/^[a-z0-9]{6,}[-_a-z0-9]{12,}[a-z0-9]{6,}\s*\z//is;
	$str =~ s/^\s\S{24,}\s\z//is;

	#print $str;

	my @data = split (/\n/, $str);
	my $lpb = ($#data+1) / 4;

	#warn "JMD $#lines $lpb";

	my @blks = ();
	push (@blks, join ('', splice (@data, 0, $lpb)));
	push (@blks, join ('', splice (@data, 0, $lpb)));
	push (@blks, join ('', splice (@data, 0, $lpb)));
	push (@blks, join ('', splice (@data, 0, $lpb)));

	my @ret = ();
	foreach my $blk (@blks) {
	#warn "JMD $blk";
	#my $digest = sprintf ("%05d", unpack ("%16C*", $blk));
	my $digest = sha1_base64($blk);
	push (@ret, $digest);
	}
	@ret;
	}