masses/corpora/uniq-maildirs - spamassassin - Git at Google

 #!/usr/bin/perl
 #
 # uniq-maildirs [-m grepfile] mdir1 mdir2 ...
 #
 # Run through all maildirs specified on the command line, searching
 # recursively (and through subdirs) reading each file for a Message-I[dD]
 # header.  Files that share the same Message-Id will be listed on stdout
 # in sh command format: "rm -f FILENAME" so the output can be run
 # as a script.
 #
 # If the -m arg is used, the file named will be parsed for Message-IDs
 # in UNIX "grep" format: e.g. "filename:Message-Id: blah"

 use File::Find;

 # Autoflush the selected output handle (STDOUT) after every write.
 $| = 1;

 # Optional "-m grepfile": preload the Message-Id table from grep-style
 # lines ("filename:Message-Id: <id>") so that files scanned below are
 # reported as duplicates of the files named there.
 if (defined $ARGV[0] && $ARGV[0] eq '-m') {
   shift @ARGV;
   my $msgidsfile = shift @ARGV;
   defined $msgidsfile
     or die "usage: uniq-maildirs [-m grepfile] mdir1 mdir2 ...\n";

   # %file (Message-Id => first file seen with it) and $count are package
   # globals because wanted() reads and updates them.
   %file = ();
   $count = 0;
   open (my $in, '<', $msgidsfile)
     or die "cannot read msgids from $msgidsfile: $!";
   while (my $line = <$in>) {
     # The filename part is matched lazily rather than as [^:]+ because
     # maildir names in cur/ carry a ":2,FLAGS" suffix and so contain ':'.
     $line =~ /^(.+?):Message-I.: (\S+)/ or next;
     my $f = $1;
     my $m = $2; $m =~ s/^<//; $m =~ s/>$//;   # strip enclosing <>

     $file{$m} = $f;
     $count++;

     progress ($count, "m");
   }
   close $in;
   warn "\nfound $count message-ids.\n";
 }

 # Walk every directory named on the command line.  no_chdir => 1 stops
 # find() from changing directory, so $File::Find::name can be opened as-is.
 $count = 0;
 $dups = 0;
 foreach my $dir (@ARGV) {
   File::Find::find ( { wanted => \&wanted, no_chdir => 1 }, $dir);
 }
 warn "\nscanned $count mails, $dups dups.\n";

 # File::Find callback, called once per entry found under the directories
 # being scanned.  Reads the headers of each plain file looking for its
 # Message-Id.  The first file seen with a given id is remembered in %file;
 # any later file with the same id is reported on stdout as a "# DUP:"
 # comment followed by an "rm -f" command.  Updates the globals $count
 # (files scanned) and $dups (duplicates found).
 sub wanted {
   my $path = $File::Find::name;

   # find() also reports directories and other non-file entries; they are
   # not mails, so neither count nor open them.
   return unless -f $path;
   $count++;

   my $fh;
   unless (open ($fh, '<', $path)) {
     warn "cannot read $path: $!";
     return;
   }

   while (my $line = <$fh>) {
     $line =~ /^$/ and last;               # blank line ends the headers
     $line =~ /^Message-I[dD]: (\S+)/ or next;

     my $m = $1;
     $m =~ s/^<//; $m =~ s/>$//;           # strip enclosing <>
     if (exists $file{$m} && $file{$m} ne $path) {
       # Quote the path for sh unless it consists only of characters the
       # shell leaves alone, so ordinary names print exactly as before.
       my $arg = $path;
       if ($arg =~ m{[^A-Za-z0-9_.,:=+/-]}) {
         $arg =~ s/'/'\\''/g;
         $arg = "'$arg'";
       }
       print "\n# DUP: $path dup of $file{$m}\n";
       print "rm -f $arg\n";
       $dups++;
       progress ($count, "*");
     } else {
       # First sighting, or the same path reached again through
       # overlapping directory arguments: record it, never delete it.
       $file{$m} = $path;
       progress ($count, ".");
     }
     last;
   }
   close $fh;
 }

 # Write a progress mark to STDERR: the given symbol once for every 500
 # items counted, plus a newline after every 70 marks (35000 items) so the
 # marks wrap into rows.
 sub progress {
   my ($tally, $mark) = @_;
   my $step = 500;          # items per progress mark
   my $wrap = $step * 70;   # items per row of marks

   print STDERR $mark unless $tally % $step;
   print STDERR "\n"  unless $tally % $wrap;
 }
	#!/usr/bin/perl
	#
	# uniq-maildirs [-m grepfile] mdir1 mdir2 ...
	#
	# Run through all maildirs specified on the command line, searching
	# recursively (and through subdirs) reading each file for a Message-I[dD]
	# header. Files that share the same Message-Id will be listed on stdout
	# in sh command format: "rm -f FILENAME" so the output can be run
	# as a script.
	#
	# If the -m arg is used, the file named will be parsed for Message-IDs
	# in UNIX "grep" format: e.g. "filename:Message-Id: blah"

	use File::Find;

	# Autoflush the selected output handle (STDOUT) after every write.
	$| = 1;

	# Optional "-m grepfile": preload the Message-Id table from grep-style
	# lines ("filename:Message-Id: <id>") so that files scanned below are
	# reported as duplicates of the files named there.
	if (defined $ARGV[0] && $ARGV[0] eq '-m') {
	  shift @ARGV;
	  my $msgidsfile = shift @ARGV;
	  defined $msgidsfile
	    or die "usage: uniq-maildirs [-m grepfile] mdir1 mdir2 ...\n";

	  # %file (Message-Id => first file seen with it) and $count are package
	  # globals because wanted() reads and updates them.
	  %file = ();
	  $count = 0;
	  open (my $in, '<', $msgidsfile)
	    or die "cannot read msgids from $msgidsfile: $!";
	  while (my $line = <$in>) {
	    # The filename part is matched lazily rather than as [^:]+ because
	    # maildir names in cur/ carry a ":2,FLAGS" suffix and so contain ':'.
	    $line =~ /^(.+?):Message-I.: (\S+)/ or next;
	    my $f = $1;
	    my $m = $2; $m =~ s/^<//; $m =~ s/>$//;   # strip enclosing <>

	    $file{$m} = $f;
	    $count++;

	    progress ($count, "m");
	  }
	  close $in;
	  warn "\nfound $count message-ids.\n";
	}

	# Walk every directory named on the command line.  no_chdir => 1 stops
	# find() from changing directory, so $File::Find::name can be opened as-is.
	$count = 0;
	$dups = 0;
	foreach my $dir (@ARGV) {
	  File::Find::find ( { wanted => \&wanted, no_chdir => 1 }, $dir);
	}
	warn "\nscanned $count mails, $dups dups.\n";

	# File::Find callback, called once per entry found under the directories
	# being scanned.  Reads the headers of each plain file looking for its
	# Message-Id.  The first file seen with a given id is remembered in %file;
	# any later file with the same id is reported on stdout as a "# DUP:"
	# comment followed by an "rm -f" command.  Updates the globals $count
	# (files scanned) and $dups (duplicates found).
	sub wanted {
	  my $path = $File::Find::name;

	  # find() also reports directories and other non-file entries; they are
	  # not mails, so neither count nor open them.
	  return unless -f $path;
	  $count++;

	  my $fh;
	  unless (open ($fh, '<', $path)) {
	    warn "cannot read $path: $!";
	    return;
	  }

	  while (my $line = <$fh>) {
	    $line =~ /^$/ and last;               # blank line ends the headers
	    $line =~ /^Message-I[dD]: (\S+)/ or next;

	    my $m = $1;
	    $m =~ s/^<//; $m =~ s/>$//;           # strip enclosing <>
	    if (exists $file{$m} && $file{$m} ne $path) {
	      # Quote the path for sh unless it consists only of characters the
	      # shell leaves alone, so ordinary names print exactly as before.
	      my $arg = $path;
	      if ($arg =~ m{[^A-Za-z0-9_.,:=+/-]}) {
	        $arg =~ s/'/'\\''/g;
	        $arg = "'$arg'";
	      }
	      print "\n# DUP: $path dup of $file{$m}\n";
	      print "rm -f $arg\n";
	      $dups++;
	      progress ($count, "*");
	    } else {
	      # First sighting, or the same path reached again through
	      # overlapping directory arguments: record it, never delete it.
	      $file{$m} = $path;
	      progress ($count, ".");
	    }
	    last;
	  }
	  close $fh;
	}

	# Write a progress mark to STDERR: the given symbol once for every 500
	# items counted, plus a newline after every 70 marks (35000 items) so the
	# marks wrap into rows.
	sub progress {
	  my ($tally, $mark) = @_;
	  my $step = 500;          # items per progress mark
	  my $wrap = $step * 70;   # items per row of marks

	  print STDERR $mark unless $tally % $step;
	  print STDERR "\n"  unless $tally % $wrap;
	}