blob: cccf4b4a17f8aa3c438a944e21cd72ba956b1ca8 [file] [log] [blame]
#! /usr/bin/perl -w
use strict;
# <@LICENSE>
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to you under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>
# Written by Theo Van Dinter <felicity@apache.org>
# Please feel free to mail with any questions. :)
# This goes with the run-masses script to take the ham/spam directories
# and spit out the appropriate spam:mbox:path statements for mass-check.
# The directory structure is assumed to look something like this:
#
# $CORPUS               (this script, run-masses, etc)
# |-- ham               (dir with mbox files for ham)
# |   |-- hamtrap       (dirs split into YYYY/MM/DD)
# |   `-- personal      (dirs split into YYYY/MM/DD)
# `-- spam              (empty)
#     |-- personal      (dirs split into YYYY/MM/DD)
#     `-- spamtrap      (dirs split into YYYY/MM/DD)
# If you don't have {ham,spam}trap mail broken out, set this to 0.
my $include_traps = 1;
# Which dirs have mbox files?  Each entry must exist as a directory inside
# the corpus directory; the script dies otherwise.
my @dirs = ( 'ham' );
# How many days should we limit to searching for the dir areas?
# ie: assuming we have years of messages in YYYY/MM/DD directories, only look
# at the most recent X so that mass-check will go faster in the scan stage.
# The limit is applied separately to each scanned area (ham/personal,
# spam/spamtrap, ...).  To disable the limit, set this to undef
# ("my $RECENT;") rather than commenting the line out: the variable is
# referenced further down and "use strict" requires it to stay declared.
my $RECENT = 120;
# Base directory of the corpus.  Defaults to the current directory; an
# optional first command-line argument overrides it.  The trailing "/" is
# kept because the value is prepended verbatim to every emitted path.
my $actualdir = "./";
if (@ARGV) {
    $actualdir = shift(@ARGV) . "/";
    # Every later directory test uses a path relative to the corpus dir, so
    # carrying on after a failed chdir would silently scan the wrong tree.
    chdir $actualdir or die "Can't chdir to $actualdir: $!\n";
}
# Date-split areas to scan, in order: for each message type the "personal"
# tree, followed by the matching "<type>trap" tree when traps are enabled.
my @do_dirs = map {
    my $type = $_;
    ( "$type/personal", ( $include_traps ? "$type/${type}trap" : () ) );
} qw(ham spam);
# mbox laden areas: emit one "<type>:mbox:<path>" line per configured mbox
# directory, where <type> is the leading path component (e.g. "ham").
# @dirs is drained by this loop; it is reused further down to collect the
# per-day directory targets.
while (my $dir = shift @dirs) {
    die "$dir isn't a directory!\n" unless -d $dir;

    # Take the type from a checked list assignment instead of reading $1
    # after an unchecked match: a path starting with "/" does not match and
    # would otherwise produce a target line with an empty type.
    my ($type) = $dir =~ m@^([^/]+)@
        or die "$dir has no leading directory component to use as a type!\n";
    print "$type:mbox:$actualdir$dir\n";
}
# Return the all-digit entry names found in directory $path ("." and ".."
# never match, so they are excluded).  Returns an empty list when $path
# cannot be opened, so missing or unreadable areas are skipped silently.
sub _numeric_entries {
    my ($path) = @_;
    opendir(my $dh, $path) or return;
    my @entries = grep { /^\d+$/ } readdir($dh);
    closedir($dh);
    return @entries;
}

# Ok, now figure out the most recent X days of each ham/spam area ...
foreach my $pdir ( @do_dirs ) {
    # The message type ("ham" or "spam") is the leading path component.
    my ($type) = $pdir =~ m@^([^/]+)@ or next;

    # Walk <area>/YYYY/MM/DD.  Each hit is stored as [ year, month, day, line ]
    # so the date parts can be compared numerically below.
    my @found;
    foreach my $year ( _numeric_entries($pdir) ) {
        foreach my $month ( _numeric_entries("$pdir/$year") ) {
            my $mpath = "$pdir/$year/$month";
            foreach my $day ( grep { -d "$mpath/$_" } _numeric_entries($mpath) ) {
                push @found,
                    [ $year, $month, $day, "$type:dir:$actualdir$mpath/$day" ];
            }
        }
    }

    # Newest first.  Compare year/month/day as numbers: a plain string sort
    # puts "9" after "10" and picks the wrong days when the directory names
    # are not zero-padded.  The final string comparison keeps the order
    # deterministic for names that are numerically equal ("1" vs "01").
    my @newest_first = sort {
           $b->[0] <=> $a->[0]
        || $b->[1] <=> $a->[1]
        || $b->[2] <=> $a->[2]
        || $b->[3] cmp $a->[3]
    } @found;

    # Keep only the $RECENT newest days of this area (no limit when undef).
    splice @newest_first, $RECENT
        if (defined $RECENT && @newest_first > $RECENT);

    push(@dirs, map { $_->[3] } @newest_first);
}

# One target per line; the trailing empty element makes join() end the
# output with a newline whenever there is at least one target.
print join("\n", @dirs, "");
exit;