| #! /usr/bin/perl -w |
| use strict; |
| |
| # <@LICENSE> |
| # Licensed to the Apache Software Foundation (ASF) under one or more |
| # contributor license agreements. See the NOTICE file distributed with |
| # this work for additional information regarding copyright ownership. |
| # The ASF licenses this file to you under the Apache License, Version 2.0 |
| # (the "License"); you may not use this file except in compliance with |
| # the License. You may obtain a copy of the License at: |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| # </@LICENSE> |
| |
| # Written by Theo Van Dinter <felicity@apache.org> |
| # Please feel free to mail with any questions. :) |
| |
# This goes with the run-masses script to take the ham/spam directories
# and spit out the appropriate {ham,spam}:mbox:path and {ham,spam}:dir:path
# statements for mass-check.
# The directory structure is assumed to look something like this:
#
# $CORPUS                (this script, run-masses, etc)
# |-- ham                (dir with mbox files for ham)
# |   |-- hamtrap        (dirs split into YYYY/MM/DD)
# |   `-- personal       (dirs split into YYYY/MM/DD)
# `-- spam               (empty)
#     |-- personal       (dirs split into YYYY/MM/DD)
#     `-- spamtrap       (dirs split into YYYY/MM/DD)
| |
| |
# if you don't have {ham,spam}trap mail broken out, set this to 0.
my $include_traps = 1;

# which dirs have mbox files?
my @dirs = ( 'ham' );

# how many of the most recent YYYY/MM/DD directories should we keep for each
# dir area?
# ie: assuming we have years of messages in YYYY/MM/DD directories, only look
# at the most recent X so that mass-check will go faster in the scan stage.
# set this to undef if you don't want to limit.  (don't comment the line out:
# the variable is still referenced further down, so it has to stay declared
# or the script will not compile under "use strict".)
my $RECENT = 120;
| |
| |
# Change into the corpus directory $dir and return the prefix ("$dir/") that
# is prepended to every path the script prints.  Dies if the directory cannot
# be entered: carrying on would scan whatever the current directory happens
# to be while labelling the results with the requested path.
sub _enter_corpus_dir {
    my ($dir) = @_;
    my $prefix = "$dir/";
    chdir($prefix) or die "Can't chdir to $prefix: $!\n";
    return $prefix;
}

# The corpus directory may be given as the first argument; without one we
# work relative to the current directory.
my $actualdir = @ARGV ? _enter_corpus_dir(shift @ARGV) : "./";
| |
# Directory areas to scan for each message type: always "personal", plus the
# matching "<type>trap" area when trap mail is kept separately.
my @do_dirs = map {
    my $kind = $_;
    ( "$kind/personal", ( $include_traps ? "$kind/${kind}trap" : () ) );
} qw(ham spam);
| |
# mbox laden areas: print one "<type>:mbox:<path>" line per entry.  Entries
# are shifted off @dirs as they are handled, so the array is drained here.
while (my $mbox_dir = shift @dirs) {
    die "$mbox_dir isn't a directory!\n" unless -d $mbox_dir;

    # the leading path component is the message type
    $mbox_dir =~ m@^([^/]+)@;
    print "${1}:mbox:${actualdir}${mbox_dir}\n";
}
| |
# Ok, now figure out the most recent X days of ham and spam ...

# Return the names of the all-digit subdirectories directly under $path.
# A missing or unreadable $path simply yields an empty list.
sub _numeric_subdirs {
    my ($path) = @_;
    opendir(my $dh, $path) or return;
    my @names = grep { /^\d+$/ && -d "$path/$_" } readdir($dh);
    closedir($dh);
    return @names;
}

# Return one [year, month, day] triple (directory names, as found on disk)
# for every YYYY/MM/DD directory below $root.
sub _dated_dirs {
    my ($root) = @_;
    my @found;
    foreach my $year (_numeric_subdirs($root)) {
        foreach my $month (_numeric_subdirs("$root/$year")) {
            foreach my $day (_numeric_subdirs("$root/$year/$month")) {
                push(@found, [ $year, $month, $day ]);
            }
        }
    }
    return @found;
}

# Order [year, month, day] triples newest first.  The components are compared
# as numbers so that names without zero padding (2004/9/5 vs 2004/10/1) still
# sort by date; the string comparison only breaks ties between names that are
# numerically equal (e.g. "1" and "01").
sub _newest_first {
    return sort {
           $b->[0] <=> $a->[0]
        || $b->[1] <=> $a->[1]
        || $b->[2] <=> $a->[2]
        || join('/', @$b) cmp join('/', @$a)
    } @_;
}

foreach my $pdir ( @do_dirs ) {
    # the leading path component ("ham" or "spam") is the message type
    my ($type) = $pdir =~ m@^([^/]+)@;

    my @days = _newest_first(_dated_dirs($pdir));

    # keep only the $RECENT newest directories of this area (no limit if undef)
    splice @days, $RECENT if (defined $RECENT && @days > $RECENT);

    push(@dirs, map { "$type:dir:$actualdir$pdir/" . join('/', @$_) } @days);
}
| |
# Emit every collected entry on its own line, then finish.
print "$_\n" foreach @dirs;
exit 0;