| #!/usr/bin/perl -w -T |
| # <@LICENSE> |
| # Licensed to the Apache Software Foundation (ASF) under one or more |
| # contributor license agreements. See the NOTICE file distributed with |
| # this work for additional information regarding copyright ownership. |
| # The ASF licenses this file to you under the Apache License, Version 2.0 |
| # (the "License"); you may not use this file except in compliance with |
| # the License. You may obtain a copy of the License at: |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| # </@LICENSE> |
| |
| use strict; |
| use warnings; |
| # use bytes; |
| |
| use Errno qw(EBADF); |
| use Getopt::Long; |
| use Pod::Usage; |
| use File::Spec; |
| use POSIX qw(locale_h setsid sigprocmask _exit); |
| |
# Use the "C" locale for the LC_TIME category.
POSIX::setlocale( LC_TIME, 'C' );

# Package-level state shared between the main program and the callbacks
# defined further down in this file.
our (
  $spamtest,
  %opt,
  $isspam,
  $forget,
  $messagecount,
  $learnedcount,
  $messagelimit,
  $progress,
  $total_messages,
  $init_results,
  $start_time,
  $synconly,
  $learnprob,
  @targets,
  $bayes_override_path,
);

# The three placeholders below are substituted at 'make' time.
my $PREFIX          = '@@PREFIX@@';
my $DEF_RULES_DIR   = '@@DEF_RULES_DIR@@';
my $LOCAL_RULES_DIR = '@@LOCAL_RULES_DIR@@';
| |
| use lib '@@INSTALLSITELIB@@'; # substituted at 'make' time |
| |
BEGIN {    # see comments in "spamassassin.raw" for doco
  # Directory part of the path this script was invoked with, falling back
  # to the current directory when $0 carries no directory component.
  my ( $volume, $directories ) = File::Spec->splitpath($0);
  my $script_dir =
    ( $volume ? File::Spec->catpath( $volume, $directories, '' ) : $directories )
    || File::Spec->curdir;

  # Only go looking for a library next to the script when one is sitting
  # there, or when the configured install location does not have one.
  if ( ( -e ( $script_dir . '/lib/Mail/SpamAssassin.pm' ) )
    || !( -e '@@INSTALLSITELIB@@/Mail/SpamAssassin.pm' ) )
  {
    my $searchrelative;
    $searchrelative = 1;    # disabled during "make install": REMOVEFORINST

    if ( $searchrelative
      && $script_dir eq '../'
      && -e '../blib/lib/Mail/SpamAssassin.pm' )
    {
      unshift( @INC, '../blib/lib' );
    }
    else {
      # First candidate directory that holds Mail/SpamAssassin.pm wins.
      for my $candidate (
        qw(lib ../lib/site_perl ../lib/spamassassin ../share/spamassassin/lib) )
      {
        my $libdir = File::Spec->catdir( $script_dir, split( m{/}, $candidate ) );
        next unless -f File::Spec->catfile( $libdir, 'Mail', 'SpamAssassin.pm' );
        unshift( @INC, $libdir );
        last;
      }
    }
  }
}
| |
| use Mail::SpamAssassin; |
| use Mail::SpamAssassin::ArchiveIterator; |
| use Mail::SpamAssassin::Message; |
| use Mail::SpamAssassin::PerMsgLearner; |
| use Mail::SpamAssassin::Util::Progress; |
| use Mail::SpamAssassin::Logger; |
| |
| ########################################################################### |
| |
# Ignore SIGPIPE for the whole run.
$SIG{PIPE} = 'IGNORE';

# used to be CmdLearn::cmd_run() ...

# Options that need a defined value even when their switch is not given.
%opt = (
  'force-expire' => 0,
  'use-ignores'  => 0,
  'nosync'       => 0,
  'quiet'        => 0,
  'cf'           => [],
);

Getopt::Long::Configure(
  'bundling',
  'no_getopt_compat',
  'permute',
  'no_auto_abbrev',
  'no_ignore_case',
);

# Map every command-line switch onto %opt or one of the package variables.
# Non-option arguments are handed to target() as they are encountered.
GetOptions(
  'forget'      => \$forget,
  'ham|nonspam' => sub { $isspam = 0 },
  'spam'        => sub { $isspam = 1 },
  'sync'        => \$synconly,
  'rebuild'     => sub {
    $synconly = 1;
    warn "The --rebuild option has been deprecated. Please use --sync instead.\n";
  },

  'q|quiet'                                 => \$opt{'quiet'},
  'username|u=s'                            => \$opt{'username'},
  'configpath|config-file|config-dir|c|C=s' => \$opt{'configpath'},
  'prefspath|prefs-file|p=s'                => \$opt{'prefspath'},
  'siteconfigpath=s'                        => \$opt{'siteconfigpath'},
  'cf=s'                                    => $opt{'cf'},    # array ref set up above

  'folders|f=s'          => \$opt{'folders'},
  'force-expire|expire'  => \$opt{'force-expire'},
  'local|L'              => \$opt{'local'},
  'no-sync|nosync'       => \$opt{'nosync'},
  'showdots'             => \$opt{'showdots'},
  'progress'             => \$opt{'progress'},
  'use-ignores'          => \$opt{'use-ignores'},
  'no-rebuild|norebuild' => sub {
    $opt{'nosync'} = 1;
    warn "The --no-rebuild option has been deprecated. Please use --no-sync instead.\n";
  },

  'learnprob=f' => \$opt{'learnprob'},
  'randseed=i'  => \$opt{'randseed'},
  'stopafter=i' => \$opt{'stopafter'},
  'max-size=i'  => \$opt{'max-size'},

  'debug|debug-level|D:s' => \$opt{'debug'},
  'help|h|?'              => \$opt{'help'},
  'version|V'             => \$opt{'version'},

  'dump:s' => \$opt{'dump'},
  'import' => \$opt{'import'},

  'backup'    => \$opt{'backup'},
  'clear'     => \$opt{'clear'},
  'restore=s' => \$opt{'restore'},

  # --dir, --file and --single only record the old-style format name;
  # --mbox and --mbx select the format used for subsequent targets.
  'dir'    => sub { $opt{'old_format'} = 'dir' },
  'file'   => sub { $opt{'old_format'} = 'file' },
  'mbox'   => sub { $opt{'format'}     = 'mbox' },
  'mbx'    => sub { $opt{'format'}     = 'mbx' },
  'single' => sub { $opt{'old_format'} = 'single' },

  'db|dbpath=s' => \$bayes_override_path,
  're|regexp=s' => \$opt{'regexp'},

  '<>' => \&target,
) or usage( 0, "Unknown option!" );
| |
# --help and --version are answered straight away.
usage( 0, "For more information read the manual page" )
  if defined $opt{'help'};

if ( defined $opt{'version'} ) {
  print "SpamAssassin version " . Mail::SpamAssassin::Version() . "\n";
  exit 0;
}

# set debug areas, if any specified (only useful for command-line tools)
$opt{'debug'} ||= 'all' if defined $opt{'debug'};

# Forcing an expiry run turns this into a sync-only invocation.
$synconly = 1 if $opt{'force-expire'};

# NOTE(review): this is a usage error but the script exits with status 0
# here -- confirm whether callers rely on that before changing it.
if ( $opt{'showdots'} && $opt{'progress'} ) {
  print "--showdots and --progress may not be used together, please select just one\n";
  exit 0;
}

# At least one action has to be requested.
my @requested_actions = grep { defined } (
  $isspam, $synconly, $forget,
  @opt{qw(dump import clear backup restore folders)},
);
if ( !@requested_actions ) {
  usage( 0,
    "Please select either --spam, --ham, --folders, --forget, --sync, --import,\n--dump, --clear, --backup or --restore"
  );
}

# We need to make sure the journal syncs pre-forget...
if ( defined $forget && $opt{'nosync'} ) {
  $opt{'nosync'} = 0;
  warn "sa-learn warning: --forget requires read/write access to the database, and is incompatible with --no-sync\n";
}

# Format specified in the 2.5x form of --dir, --file, --mbox, --mbx or
# --single.  Convert it to the new behavior: only --single has an effect,
# it adds "-" to the argument list.
if ( defined $opt{'old_format'} && $opt{'old_format'} eq 'single' ) {
  push( @ARGV, '-' );
}
| |
# Extra configuration text handed to SpamAssassin as post_config_text.
my $post_config = '';

# kluge to support old check_bayes_db operation
# bug 3799: init() will go r/o with the configured DB, and then dbpath needs
# to override.  Just access the dbpath version via post_config_text.
if ( defined $bayes_override_path ) {
  # Add a default prefix if the path is a directory
  $bayes_override_path = File::Spec->catfile( $bayes_override_path, 'bayes' )
    if -d $bayes_override_path;

  $post_config .= "bayes_path $bayes_override_path\n";
}

# These options require bayes_scanner, which requires "use_bayes 1", but
# that's not necessary for these commands.
if ( grep { defined $opt{$_} } qw(dump import clear backup restore) ) {
  $post_config .= "use_bayes 1\n";
}

# Every --cf argument becomes one configuration line.
$post_config .= join( "\n", @{ $opt{'cf'} } ) . "\n";

# create the tester factory
my %spamtest_args = (
  rules_filename      => $opt{'configpath'},
  site_rules_filename => $opt{'siteconfigpath'},
  userprefs_filename  => $opt{'prefspath'},
  username            => $opt{'username'},
  debug               => $opt{'debug'},
  local_tests_only    => $opt{'local'},
  dont_copy_prefs     => 1,
  PREFIX              => $PREFIX,
  DEF_RULES_DIR       => $DEF_RULES_DIR,
  LOCAL_RULES_DIR     => $LOCAL_RULES_DIR,
  post_config_text    => $post_config,
);
$spamtest = Mail::SpamAssassin->new( \%spamtest_args );

$spamtest->init(1);
dbg("sa-learn: spamtest initialized");

# Bug 6228 hack: bridge the transition gap of moving Bayes.pm into a plugin;
# To be resolved more cleanly!!!
if ( my $scanner = $spamtest->{bayes_scanner} ) {
  for my $plugin ( @{ $spamtest->{plugins}->{plugins} } ) {
    next unless $plugin->isa('Mail::SpamAssassin::Plugin::Bayes');

    # copy plugin's "store" object ref one level up!
    $scanner->{store} = $plugin->{store};
  }
}

if ( Mail::SpamAssassin::Util::am_running_on_windows() ) {
  binmode(STDIN)  or die "cannot set binmode on STDIN: $!";    # bug 4363
  binmode(STDOUT) or die "cannot set binmode on STDOUT: $!";
}
| |
# Common epilogue of the one-shot commands below: release the learner,
# close the standard handles and exit with the given status.
my $finish_and_exit = sub {
  my ($status) = @_;
  $spamtest->finish_learner();
  # make sure we notice any write errors while flushing output buffer
  close STDOUT or die "error closing STDOUT: $!";
  close STDIN  or die "error closing STDIN: $!";
  exit $status;
};

# --dump [all|magic|data]
if ( defined $opt{'dump'} ) {
  # argument => [ show magic tokens, show data tokens ]
  my %dump_selection = (
    ''      => [ 1, 1 ],    # no argument: show us all tokens!
    'all'   => [ 1, 1 ],
    'magic' => [ 1, 0 ],    # show us magic tokens only
    'data'  => [ 0, 1 ],    # show us data tokens only
  );

  my $selection = $dump_selection{ $opt{'dump'} };
  if ( !$selection ) {      # unknown option
    warn "Unknown dump option '" . $opt{'dump'} . "'\n";
    $spamtest->finish_learner();
    exit 1;
  }

  my ( $magic, $toks ) = @{$selection};
  if ( !$spamtest->dump_bayes_db( $magic, $toks, $opt{'regexp'} ) ) {
    $spamtest->finish_learner();
    die "ERROR: Bayes dump returned an error, please re-run with -D for more information\n";
  }

  $finish_and_exit->(0);
}

# --import: the exit status is the negation of perform_upgrade()'s result.
if ( defined $opt{'import'} ) {
  my $ret = $spamtest->{bayes_scanner}->{store}->perform_upgrade();
  $finish_and_exit->( !$ret );
}

# --clear
if ( defined $opt{'clear'} ) {
  if ( !$spamtest->{bayes_scanner}->{store}->clear_database() ) {
    $spamtest->finish_learner();
    die "ERROR: Bayes clear returned an error, please re-run with -D for more information\n";
  }

  $finish_and_exit->(0);
}

# --backup
if ( defined $opt{'backup'} ) {
  if ( !$spamtest->{bayes_scanner}->{store}->backup_database() ) {
    $spamtest->finish_learner();
    die "ERROR: Bayes backup returned an error, please re-run with -D for more information\n";
  }

  $finish_and_exit->(0);
}

# --restore <filename>
if ( defined $opt{'restore'} ) {
  my $filename = $opt{'restore'};

  if ( !$filename ) {
    $spamtest->finish_learner();
    die "ERROR: You must specify a filename to restore.\n";
  }

  my $restored =
    $spamtest->{bayes_scanner}->{store}->restore_database( $filename, $opt{'showdots'} );
  if ( !$restored ) {
    $spamtest->finish_learner();
    die "ERROR: Bayes restore returned an error, please re-run with -D for more information\n";
  }

  $finish_and_exit->(0);
}
| |
# Everything from here on needs Bayes to be enabled in the configuration.
unless ( $spamtest->{conf}->{use_bayes} ) {
  warn "ERROR: configuration specifies 'use_bayes 0', sa-learn disabled\n";
  exit 1;
}

my %learner_args = (
  force_expire      => $opt{'force-expire'},
  learn_to_journal  => $opt{'nosync'},
  wait_for_lock     => 1,
  caller_will_untie => 1,
);
$spamtest->init_learner( \%learner_args );

$spamtest->{bayes_scanner}{use_ignores} = $opt{'use-ignores'};

# --sync / --force-expire: rebuild the caches, then leave without learning.
if ($synconly) {
  my %rebuild_args = (
    verbose  => !$opt{'quiet'},
    showdots => $opt{'showdots'},
  );
  $spamtest->rebuild_learner_caches( \%rebuild_args );

  $spamtest->finish_learner();
  # make sure we notice any write errors while flushing output buffer
  close STDOUT or die "error closing STDOUT: $!";
  close STDIN  or die "error closing STDIN: $!";
  exit 0;
}

# Hand the sampling/limit options over to the variables wanted() reads.
$messagelimit = $opt{'stopafter'};
$learnprob    = $opt{'learnprob'};

srand( $opt{'randseed'} ) if defined $opt{'randseed'};

# sync the journal first if we're going to go r/w so we make sure to
# learn everything before doing anything else.
#
$spamtest->rebuild_learner_caches() unless $opt{nosync};
| |
# what is the result of the run?  will end up being the exit code.
my $exit_status = 0;

# run this lot in an eval block, so we can catch die's and clear
# up the dbs.
eval {
  $SIG{HUP}  = \&killed;
  $SIG{INT}  = \&killed;
  $SIG{TERM} = \&killed;

  # --folders: read the target list from a file, one entry per line.
  if ( $opt{folders} ) {
    # Three-argument open on a lexical handle, so that characters such as
    # "<", ">", "|" or surrounding whitespace in the user-supplied name are
    # taken literally.  "-" still means standard input, as it did with the
    # former two-argument open.
    my $folders_fh;
    if ( $opt{folders} eq '-' ) {
      open( $folders_fh, '<&', \*STDIN ) or die "cannot open $opt{folders}: $!";
    }
    else {
      open( $folders_fh, '<', $opt{folders} ) or die "cannot open $opt{folders}: $!";
    }

    # $! is cleared before every read so that a read error can be told
    # apart from a plain end-of-file once the loop has finished.
    for ($!=0; <$folders_fh>; $!=0) {
      chomp;
      next if /^\s*$/;
      if (/^(ham|spam):(\w*):(.*)/) {
        # line already carries "class:format:" (format may be empty)
        my $class  = $1;
        my $format = $2 || "detect";
        my $target = $3;
        push ( @targets, "$class:$format:$target" );
      }
      else {
        target($_);
      }
    }
    defined $_ || $!==0  or
      $!==EBADF ? dbg("error reading from $opt{folders}: $!")
                : die "error reading from $opt{folders}: $!";
    close($folders_fh) or die "error closing $opt{folders}: $!";
  }

  ###########################################################################
  # Deal with the target listing, and STDIN -> tempfile

  my $tempfile; # will be defined if stdin -> tempfile
  push(@targets, @ARGV);
  @targets = ('-') unless @targets || $opt{folders};

  for (my $elem = 0; $elem <= $#targets; $elem++) {
    # ArchiveIterator doesn't really like STDIN, so if "-" is specified
    # as a target, make it a temp file instead.
    if ( $targets[$elem] =~ /(?:^|:)-$/ ) {
      if (defined $tempfile) {
        # uh-oh, stdin specified multiple times?
        warn "skipping extra stdin target (".$targets[$elem].")\n";
        splice @targets, $elem, 1;
        $elem--; # go back to this element again
        next;
      }
      else {
        my $handle;
        ( $tempfile, $handle ) = Mail::SpamAssassin::Util::secure_tmpfile();
        binmode $handle or die "cannot set binmode on file $tempfile: $!";

        # avoid slurping the whole file into memory, copy chunk by chunk
        my ($inbuf, $nread);
        while ( $nread = sysread(STDIN, $inbuf, 16384) ) {
          print {$handle} $inbuf or die "error writing to $tempfile: $!";
        }
        defined $nread or die "error reading from STDIN: $!";
        close $handle or die "error closing $tempfile: $!";

        # re-aim the targets at the tempfile instead of STDIN
        $targets[$elem] =~ s/-$/$tempfile/;
      }
    }

    # make sure the target list is in the normal AI format
    if ($targets[$elem] !~ /^[^:]*:[a-z]+:/) {
      my $item = splice @targets, $elem, 1;
      target($item); # add back to the list
      $elem--; # go back to this element again
      next;
    }
  }

  ###########################################################################

  my $iter = Mail::SpamAssassin::ArchiveIterator->new(
    {
      # skip messages larger than max-size bytes,
      # 0 for no limit, undef defaults to 500 KB
      'opt_max_size'   => $opt{'max-size'},
      'opt_want_date'  => 0,
      'opt_from_regex' => $spamtest->{conf}->{mbox_format_from_regex},
    }
  );

  $iter->set_functions(\&wanted, \&result);
  $messagecount = 0;
  $learnedcount = 0;

  $init_results = 0;
  $start_time = time;

  # if exit_status isn't already set to non-zero, set it to the reverse of the
  # run result (0 is bad, 1+ is good -- the opposite of exit status codes)
  my $run_ok = eval { $exit_status ||= ! $iter->run(@targets); 1 };

  # Keep the error text now: $@ is a global, and any eval executed by the
  # reporting and cleanup code below would reset it before it is examined.
  my $run_error = $@;

  print STDERR "\n" if ($opt{showdots});
  $progress->final() if ($opt{progress} && $progress);

  my $phrase = defined $forget ? "Forgot" : "Learned";
  print "$phrase tokens from $learnedcount message(s) ($messagecount message(s) examined)\n"
    if !$opt{'quiet'};

  # If we needed to make a tempfile, go delete it.
  if (defined $tempfile) {
    unlink $tempfile or die "cannot unlink temporary file $tempfile: $!";
    undef $tempfile;
  }

  # wanted() dies with 'HITLIMIT' once the message limit is reached; that
  # is a normal way to stop.  Anything else is passed on.
  if (!$run_ok && $run_error !~ /HITLIMIT/) { die $run_error }
  1;
} or do {
  my $eval_stat = $@ ne '' ? $@ : "errno=$!";  chomp $eval_stat;
  $spamtest->finish_learner();
  die $eval_stat;
};

$spamtest->finish_learner();
# make sure we notice any write errors while flushing output buffer
close STDOUT or die "error closing STDOUT: $!";
close STDIN  or die "error closing STDIN: $!";
exit $exit_status;
| |
| ########################################################################### |
| |
# Handler installed for HUP, INT and TERM by the main program: release the
# learner, then abort by dying with "interrupted".
sub killed {
  $spamtest->finish_learner();

  die "interrupted";
}
| |
# Append one target to @targets in "class:format:location" form.
#
# $target is the location as given by the user.  The class is "spam" when
# $isspam is true and "ham" otherwise; the format is $opt{'format'} when
# set and "detect" otherwise.  Calls usage() when neither --spam/--ham nor
# --forget has been selected yet.
sub target {
  my ($target) = @_;

  unless ( defined $isspam || $forget ) {
    usage( 0,
      "Please select either --spam or --ham or --forget before the first target"
    );
  }

  my $class = 'ham';
  $class = 'spam' if $isspam;

  my $format = 'detect';
  $format = $opt{'format'} if defined $opt{'format'};

  push( @targets, "$class:$format:$target" );
}
| |
| ########################################################################### |
| |
# One-time setup run from result(): flags initialisation as done and, when
# --progress is in effect, creates the progress object sized from
# ArchiveIterator's $MESSAGES variable.
sub init_results {
  $init_results = 1;

  return if !$opt{'progress'};

  $total_messages = $Mail::SpamAssassin::ArchiveIterator::MESSAGES;

  $progress = Mail::SpamAssassin::Util::Progress->new(
    { total => $total_messages }
  );
}
| |
| ########################################################################### |
| |
# Result callback registered with the ArchiveIterator via set_functions().
#
# The three arguments are accepted but not used here.  The first call
# triggers init_results(); afterwards the progress display, if any, is
# updated with the current message count.
sub result {
  my ($class, $result, $time) = @_;

  # don't open results files until we get here to avoid overwriting files
  # (called with an explicit empty argument list: "&init_results" without
  # parentheses would hand this sub's own @_ on to init_results)
  init_results() if !$init_results;

  $progress->update($messagecount) if ($opt{progress} && $progress);
}
| |
| ########################################################################### |
| |
# Per-message callback registered with the ArchiveIterator via
# set_functions(): learns (or forgets) a single message.
#
#   $class   - message class; "s" is treated as spam, anything else as ham
#   $id      - not used here
#   $time    - not used here
#   $dataref - message data, handed to $spamtest->parse()
#
# Returns 1 when the message was processed or skipped by --learnprob
# sampling.  Dies with 'HITLIMIT' when the --stopafter limit is exceeded,
# and with an error message when learning is unavailable.
sub wanted {
  my ( $class, $id, $time, $dataref ) = @_;

  my $spam = $class eq "s" ? 1 : 0;

  if ( defined($learnprob) ) {
    # Random sampling of messages.  A probability of zero or less cannot be
    # fed to 1/$learnprob (division by zero, or a negative range for
    # rand()), so it is taken to mean "skip every message".
    my $skip = $learnprob > 0 ? ( int( rand( 1 / $learnprob ) ) != 0 ) : 1;
    if ($skip) {
      print STDERR '_' if ( $opt{showdots} );
      return 1;
    }
  }

  # NOTE(review): the comparison is ">", so the limit only trips once more
  # than --stopafter messages have been learned -- confirm this is intended.
  if ( defined($messagelimit) && $learnedcount > $messagelimit ) {
    $progress->final() if ($opt{progress} && $progress);
    die 'HITLIMIT';
  }

  $messagecount++;
  my $ma = $spamtest->parse($dataref);

  # A message carrying an X-Spam-Checker-Version header is re-parsed from
  # its text with the SpamAssassin markup removed.
  if ( $ma->get_header("X-Spam-Checker-Version") ) {
    my $new_ma = $spamtest->parse($spamtest->remove_spamassassin_markup($ma), 1);
    $ma->finish();
    $ma = $new_ma;
  }

  my $status  = $spamtest->learn( $ma, undef, $spam, $forget );
  my $learned = $status->did_learn();

  if ( !defined $learned ) {    # undef=learning unavailable
    die "ERROR: the Bayes learn function returned an error, please re-run with -D for more information\n";
  }
  elsif ( $learned == 1 ) {     # 1=message was learned.  0=message wasn't learned
    $learnedcount++;
  }

  # Do cleanup ...
  $status->finish();
  undef $status;

  $ma->finish();
  undef $ma;

  print STDERR '.' if ( $opt{showdots} );
  return 1;
}
| |
| ########################################################################### |
| |
# Print the SpamAssassin version line, then pass control to pod2usage()
# with an exit value of 64.
#
#   $verbose - passed to pod2usage as -verbose
#   $message - passed to pod2usage as -message
sub usage {
  my ( $verbose, $message ) = @_;

  print "SpamAssassin version " . Mail::SpamAssassin::Version() . "\n";

  pod2usage(
    -verbose => $verbose,
    -message => $message,
    -exitval => 64,
  );
}
| |
| # --------------------------------------------------------------------------- |
| |
| =head1 NAME |
| |
| sa-learn - train SpamAssassin's Bayesian classifier |
| |
| =head1 SYNOPSIS |
| |
| B<sa-learn> [options] [file]... |
| |
| B<sa-learn> [options] --dump [ all | data | magic ] |
| |
| Options: |
| |
| --ham Learn the following messages as ham (non-spam) |
| --spam Learn the following messages as spam |
| --forget Forget the following messages |
| --use-ignores Use bayes_ignore_from and bayes_ignore_to |
| --sync Synchronize the database and the journal if needed |
| --force-expire Force a database sync and expiry run |
| --dbpath <path> Allows commandline override (in bayes_path form) |
| for where to read the Bayes DB from |
| --dump [all|data|magic] Display the contents of the Bayes database |
| Takes optional argument for what to display |
| --regexp <re> For dump only, specifies which tokens to |
| dump based on a regular expression. |
| -f file, --folders=file Read list of files/directories from file |
| --dir Ignored; historical compatibility |
| --file Ignored; historical compatibility |
| --mbox Input sources are in mbox format |
| --mbx Input sources are in mbx format |
| --max-size <b> Skip messages larger than b bytes; |
| defaults to 500 KB, 0 implies no limit |
| --showdots Show progress using dots |
| --progress Show progress using progress bar |
| --no-sync Skip synchronizing the database and journal |
| after learning |
| -L, --local Operate locally, no network accesses. Use |
| of this is recommended, see documentation. |
| --import Migrate data from older version/non DB_File |
| based databases |
| --clear Wipe out existing database |
| --backup Backup, to STDOUT, existing database |
| --restore <filename> Restore a database from filename |
| -u username, --username=username |
| Override username taken from the runtime |
| environment, used with SQL |
| -C path, --configpath=path, --config-file=path |
| Path to standard configuration dir |
| -p prefs, --prefspath=file, --prefs-file=file |
| Set user preferences file |
| --siteconfigpath=path Path for site configs |
| (default: @@PREFIX@@/etc/mail/spamassassin) |
| --cf='config line' Additional line of configuration |
| -D, --debug [area,...] Print debugging messages |
| -V, --version Print version |
| -h, --help Print usage message |
| |
| =head1 DESCRIPTION |
| |
| Given a typical selection of your incoming mail classified as spam or ham |
| (non-spam), this tool will feed each mail to SpamAssassin, allowing it |
| to 'learn' what signs are likely to mean spam, and which are likely to |
| mean ham. |
| |
| Simply run this command once for each of your mail folders, and it will |
| ''learn'' from the mail therein. |
| |
| Note that csh-style I<globbing> in the mail folder names is supported; |
| in other words, listing a folder name as C<*> will scan every folder |
| that matches. See C<Mail::SpamAssassin::ArchiveIterator> for more details. |
| |
| If you are using mail boxes in format other than maildir you should use |
| the B<--mbox> or B<--mbx> parameters. |
| |
| Files compressed with gzip/bzip2/xz/lz4/lzip/lzo are uncompressed |
| automatically. See C<Mail::SpamAssassin::ArchiveIterator> for more details. |
| |
| SpamAssassin remembers which mail messages it has learnt already, and will not |
| re-learn those messages again, unless you use the B<--forget> option. Messages |
| learnt as spam will have SpamAssassin markup removed, on the fly. |
| |
| If you make a mistake and scan a mail as ham when it is spam, or vice |
| versa, simply rerun this command with the correct classification, and the |
| mistake will be corrected. SpamAssassin will automatically 'forget' the |
| previous indications. |
| |
| Users of C<spamd> who wish to perform training remotely, over a network, |
| should investigate the C<spamc -L> switch. |
| |
| =head1 OPTIONS |
| |
| =over 4 |
| |
| =item B<--ham> |
| |
| Learn the input message(s) in the files following the option as ham. |
| If you have previously learnt any of the messages as spam, SpamAssassin will |
| forget them first, then re-learn them as ham. Alternatively, if you have |
| previously learnt them as ham, it'll skip them this time around. |
| If the messages have already been filtered through SpamAssassin, the learner |
| will ignore any modifications SpamAssassin may have made. |
| |
| =item B<--spam> |
| |
| Learn the input message(s) in the files following the option as spam. |
| If you have previously learnt any of the messages as ham, SpamAssassin will |
| forget them first, then re-learn them as spam. Alternatively, if you have |
| previously learnt them as spam, it'll skip them this time around. |
| If the messages have already been filtered through SpamAssassin, the learner |
| will ignore any modifications SpamAssassin may have made. |
| |
| =item B<--folders>=I<filename>, B<-f> I<filename> |
| |
| sa-learn will read in the list of folders from the specified file, one folder |
| per line in the file. If the folder is prefixed with C<ham:type:> or C<spam:type:>, |
| sa-learn will learn that folder appropriately, otherwise the folders will be |
| assumed to be of the type specified by B<--ham> or B<--spam>. |
| |
| C<type> above is optional, but is the same as the standard for |
| ArchiveIterator: mbox, mbx, dir, file, or detect (the default if not |
| specified). |
| |
| =item B<--mbox> |
| |
| sa-learn will read in the file(s) containing the emails to be learned, |
| and will process them in mbox format (one or more emails per file). |
| |
| =item B<--mbx> |
| |
| sa-learn will read in the file(s) containing the emails to be learned, |
| and will process them in mbx format (one or more emails per file). |
| |
| =item B<--use-ignores> |
| |
| Don't learn the message if a from address matches configuration file |
| item C<bayes_ignore_from> or a to address matches C<bayes_ignore_to>. |
| The option might be used when learning from a large file of messages |
| from which the hammy spam messages or spammy ham messages have not |
| been removed. |
| |
| =item B<--sync> |
| |
| Synchronize the journal and databases. Upon successfully syncing the |
| database with the entries in the journal, the journal file is removed. |
| |
| =item B<--force-expire> |
| |
| Forces an expiry attempt, regardless of whether it may be necessary |
| or not. Note: This doesn't mean any tokens will actually expire. |
| Please see the EXPIRATION section below. |
| |
| Note: C<--force-expire> also causes the journal data to be synchronized |
| into the Bayes databases. |
| |
| =item B<--forget> |
| |
| Forget the input message(s) in the files following the option as previously |
| learnt. |
| |
| =item B<--dbpath> |
| |
| Allows a commandline override of the I<bayes_path> configuration option. |
| |
| =item B<--dump> I<option> |
| |
| Display the contents of the Bayes database. Without an option or with |
| the I<all> option, all magic tokens and data tokens will be displayed. |
| I<magic> will only display magic tokens, and I<data> will only display |
| the data tokens. |
| |
| Can also use the B<--regexp> I<RE> option to specify which tokens to |
| display based on a regular expression. |
| |
| =item B<--clear> |
| |
| Clear an existing Bayes database by removing all traces of the database. |
| |
| WARNING: This is destructive and should be used with care. |
| |
| =item B<--backup> |
| |
| Performs a dump of the Bayes database in machine/human readable format. |
| |
| The dump will include token and seen data. It is suitable for input back |
| into the --restore command. |
| |
| =item B<--restore>=I<filename> |
| |
| Performs a restore of the Bayes database defined by I<filename>. |
| |
| WARNING: This is a destructive operation, previous Bayes data will be wiped out. |
| |
| =item B<-h>, B<--help> |
| |
| Print help message and exit. |
| |
| =item B<-u> I<username>, B<--username>=I<username> |
| |
| If specified this username will override the username taken from the runtime |
| environment. You can use this option to specify users in a virtual user |
| configuration when using SQL as the Bayes backend. |
| |
| NOTE: This option will not change to the given I<username>, it will only attempt |
| to act on behalf of that user. Because of this you will need to have proper |
| permissions to be able to change files owned by I<username>. In the case of SQL |
| this generally is not a problem. |
| |
| =item B<-C> I<path>, B<--configpath>=I<path>, B<--config-file>=I<path> |
| |
| Use the specified path for locating the distributed configuration files. |
| Ignore the default directories (usually C</usr/share/spamassassin> or similar). |
| |
| =item B<--siteconfigpath>=I<path> |
| |
| Use the specified path for locating site-specific configuration files. Ignore |
| the default directories (usually C</etc/mail/spamassassin> or similar). |
| |
| =item B<--cf='config line'> |
| |
| Add additional lines of configuration directly from the command-line, parsed |
| after the configuration files are read. Multiple B<--cf> arguments can be |
| used, and each will be considered a separate line of configuration. |
| |
| =item B<-p> I<prefs>, B<--prefspath>=I<prefs>, B<--prefs-file>=I<prefs> |
| |
| Read user score preferences from I<prefs> (usually C<$HOME/.spamassassin/user_prefs>). |
| |
| =item B<--progress> |
| |
| Prints a progress bar (to STDERR) showing the current progress. In the case |
| where no valid terminal is found this option will behave very much like the |
| --showdots option. |
| |
| =item B<-D> [I<area,...>], B<--debug> [I<area,...>] |
| |
| Produce debugging output. If no areas are listed, all debugging information is |
| printed. Diagnostic output can also be enabled for each area individually; |
| I<area> is the area of the code to instrument. For example, to produce |
| diagnostic output on bayes, learn, and dns, use: |
| |
| spamassassin -D bayes,learn,dns |
| |
| Use an empty string (-D '') to indicate no areas when the next item on the |
| command line is a path, to prevent the path from being parsed as an area. |
| |
| For more information about which areas (also known as channels) are available, |
| please see the documentation at: |
| |
| C<https://wiki.apache.org/spamassassin/DebugChannels> |
| |
| Higher priority informational messages that are suitable for logging in normal |
| circumstances are available with an area of "info". |
| |
| =item B<--no-sync> |
| |
| Skip the slow synchronization step which normally takes place after |
| changing database entries. If you plan to learn from many folders in |
| a batch, or to learn many individual messages one-by-one, it is faster |
| to use this switch and run C<sa-learn --sync> once all the folders have |
| been scanned. |
| |
| Clarification: The state of I<--no-sync> overrides the |
| I<bayes_learn_to_journal> configuration option. If not specified, |
| sa-learn will learn to the database directly. If specified, sa-learn |
| will learn to the journal file. |
| |
| Note: I<--sync> and I<--no-sync> can be specified on the same commandline, |
| which is slightly confusing. In this case, the I<--no-sync> option is |
| ignored since there is no learn operation. |
| |
| =item B<-L>, B<--local> |
| |
| Do not perform any network accesses while learning details about the mail |
| messages. This should be normally used, as there really isn't anything |
| Bayes can learn from network lookup results. Official SpamAssassin plugins |
| do not currently do any network lookups when learning, but it's possible |
| that third party ones might. |
| |
| =item B<--import> |
| |
| If you previously used SpamAssassin's Bayesian learner without the C<DB_File> |
| module installed, it will have created files in other formats, such as |
| C<GDBM_File>, C<NDBM_File>, or C<SDBM_File>. This switch allows you to migrate |
| that old data into the C<DB_File> format. It will overwrite any data currently |
| in the C<DB_File>. |
| |
| Can also be used with the B<--dbpath> I<path> option to specify the location of |
| the Bayes files to use. |
| |
| =back |
| |
| =head1 MIGRATION |
| |
| There are now multiple backend storage modules available for storing |
| user's bayesian data. As such you might want to migrate from one |
| backend to another. Here is a simple procedure for migrating from one |
| backend to another. |
| |
| Note that if you have individual user databases you will have to |
| perform a similar procedure for each one of them. |
| |
| =over 4 |
| |
| =item sa-learn --sync |
| |
| This will sync any outstanding journal entries. |
| |
| =item sa-learn --backup E<gt> backup.txt |
| |
| This will save all your Bayes data to a plain text file. |
| |
| =item sa-learn --clear |
| |
| This is optional, but good to do to clear out the old database. |
| |
| =item Repeat! |
| |
| At this point, if you have multiple databases, you should perform the |
| procedure above for each of them. (i.e. each user's database needs to |
| be backed up before continuing.) |
| |
| =item Switch backends |
| |
| Once you have backed up all databases you can update your |
| configuration for the new database backend. This will involve at least |
| the bayes_store_module config option and may involve some additional |
| config options depending on what is required by the module. (For |
| example, you may need to configure an SQL database.) |
| |
| =item sa-learn --restore backup.txt |
| |
| Again, you need to do this for every database. |
| |
| =back |
| |
| If you are migrating to SQL you can make use of the -u I<username> |
| option in sa-learn to populate each user's database. Otherwise, you |
| must run sa-learn as the user whose database you are restoring. |
| |
| |
| =head1 INTRODUCTION TO BAYESIAN FILTERING |
| |
| (Thanks to Michael Bell for this section!) |
| |
| For a more lengthy description of how this works, go to |
| http://www.paulgraham.com/ and see "A Plan for Spam". It's reasonably |
| readable, even if statistics make me break out in hives. |
| |
| The short semi-inaccurate version: Given training, a spam heuristics engine |
| can take the most "spammy" and "hammy" words and apply probabilistic |
| analysis. Furthermore, once given a basis for the analysis, the engine can |
| continue to learn iteratively by applying both the non-Bayesian and Bayesian |
| rulesets together to create evolving "intelligence". |
| |
| SpamAssassin 2.50 and later supports Bayesian spam analysis, in |
| the form of the BAYES rules. This is a new feature, quite powerful, |
| and is disabled until enough messages have been learnt. |
| |
| The pros of Bayesian spam analysis: |
| |
| =over 4 |
| |
| =item Can greatly reduce false positives and false negatives. |
| |
| It learns from your mail, so it is tailored to your unique e-mail flow. |
| |
| =item Once it starts learning, it can continue to learn from SpamAssassin |
| and improve over time. |
| |
| =back |
| |
| And the cons: |
| |
| =over 4 |
| |
| =item A decent number of messages are required before results are useful |
| for ham/spam determination. |
| |
| =item It's hard to explain why a message is or isn't marked as spam. |
| |
| i.e.: a straightforward rule, that matches, say, "VIAGRA" is |
| easy to understand. If it generates a false positive or false negative, |
| it is fairly easy to understand why. |
| |
| With Bayesian analysis, it's all probabilities - "because the past says |
| it is likely as this falls into a probabilistic distribution common to past |
| spam in your systems". Tell that to your users! Tell that to the client |
| when he asks "what can I do to change this". (By the way, the answer in |
| this case is "use welcomelisting".) |
| |
| =item It will take disk space and memory. |
| |
| The databases it maintains take quite a lot of resources to store and use. |
| |
| =back |
| |
| =head1 GETTING STARTED |
| |
| Still interested? Ok, here are the guidelines for getting this working. |
| |
| First a high-level overview: |
| |
| =over 4 |
| |
| =item Build a significant sample of both ham and spam. |
| |
| I suggest several thousand of each, placed in SPAM and HAM directories or |
| mailboxes. Yes, you MUST hand-sort this - otherwise the results won't be much |
| better than SpamAssassin on its own. Verify the spamminess/hamminess of EVERY |
| message. You're urged to avoid using a publicly available corpus (sample) - |
| this must be taken from YOUR mail server, if it is to be statistically useful. |
| Otherwise, the results may be pretty skewed. |
| |
| =item Use this tool to teach SpamAssassin about these samples, like so: |
| |
| sa-learn --spam /path/to/spam/folder |
| sa-learn --ham /path/to/ham/folder |
| sa-learn --ham hampath1 hampath2 --spam spampath1 spampath2 |
| ... |
| |
| Let SpamAssassin proceed, learning stuff. When it finds ham and spam |
| it will add the "interesting tokens" to the database. |
| |
| =item If you need SpamAssassin to forget about specific messages, use |
| the B<--forget> option. |
| |
| This can be applied to either ham or spam that has run through the |
| B<sa-learn> processes. It's a bit of a hammer, really, lowering the |
| weighting of the specific tokens in that message (only if that message has |
| been processed before). |
| |
| =item Learning from single messages uses a command like this: |
| |
| sa-learn --ham --no-sync mailmessage |
| |
| This is handy for binding to a key in your mail user agent. It's very fast, as |
| all the time-consuming stuff is deferred until you run with the C<--sync> |
| option. |
| |
| =item Autolearning is enabled by default |
| |
| If you don't have a corpus of mail saved to learn, you can let |
| SpamAssassin automatically learn the mail that you receive. If you are |
| autolearning from scratch, the amount of mail you receive will determine |
| how long until the BAYES_* rules are activated. |
| |
| =back |
| |
| =head1 EFFECTIVE TRAINING |
| |
| Learning filters require training to be effective. If you don't train |
| them, they won't work. In addition, you need to train them with new |
| messages regularly to keep them up-to-date, or their data will become |
| stale and impact accuracy. |
| |
| You need to train with both spam I<and> ham mails. One type of mail |
| alone will not have any effect. |
| |
| Note that if your mail folders contain things like forwarded spam, |
| discussions of spam-catching rules, etc., this will cause trouble. You |
| should avoid scanning those messages if possible. (An easy way to do this |
| is to move them aside, into a folder which is not scanned.) |
| |
| If the messages you are learning from have already been filtered through |
| SpamAssassin, the learner will compensate for this. In effect, it learns what |
| each message would look like if you had run C<spamassassin -d> over it in |
| advance. |
| |
| Another thing to be aware of is that typically you should aim to train |
| with at least 1000 messages of spam, and 1000 ham messages, if |
| possible. More is better, but anything over about 5000 messages does not |
| improve accuracy significantly in our tests. |
| |
| Be careful that you train from the same source -- for example, if you train |
| on old spam, but new ham mail, then the classifier will think that |
| a mail with an old date stamp is likely to be spam. |
| |
| It's also worth noting that training with a very small quantity of |
| ham will produce atrocious results. You should aim to train with at |
| least the same amount (or more if possible!) of ham data as spam. |
| |
| On an on-going basis, it is best to keep training the filter to make |
| sure it has fresh data to work from. There are various ways to do |
| this: |
| |
| =over 4 |
| |
| =item 1. Supervised learning |
| |
| This means keeping a copy of all or most of your mail, separated into spam |
| and ham piles, and periodically re-training using those. It produces |
| the best results, but requires more work from you, the user. |
| |
| (An easy way to do this, by the way, is to create a new folder for |
| 'deleted' messages, and instead of deleting them from other folders, |
| simply move them in there instead. Then keep all spam in a separate |
| folder and never delete it. As long as you remember to move misclassified |
| mails into the correct folder set, it is easy enough to keep up to date.) |
| |
| =item 2. Unsupervised learning from Bayesian classification |
| |
| Another way to train is to chain the results of the Bayesian classifier |
| back into the training, so it reinforces its own decisions. This is only |
| safe if you then retrain it based on any errors you discover. |
| |
| SpamAssassin does not support this method, due to experimental results |
| which strongly indicate that it does not work well, and since Bayes is |
| only one part of the resulting score presented to the user (while Bayes |
| may have made the wrong decision about a mail, it may have been overridden |
| by another system). |
| |
| =item 3. Unsupervised learning from SpamAssassin rules |
| |
| Also called 'auto-learning' in SpamAssassin. Based on statistical |
| analysis of the SpamAssassin success rates, we can automatically train the |
| Bayesian database with a certain degree of confidence that our training |
| data is accurate. |
| |
| It should be supplemented with some supervised training in addition, if |
| possible. |
| |
| This is the default, but can be turned off by setting the SpamAssassin |
| configuration parameter C<bayes_auto_learn> to 0. |
| |
| =item 4. Mistake-based training |
| |
| This means training on a small number of mails, then only training on |
| messages that SpamAssassin classifies incorrectly. This works, but it |
| takes longer to get it right than a full training session would. |
| |
| =back |
| |
| =head1 FILES |
| |
| B<sa-learn> and the other parts of SpamAssassin's Bayesian learner |
| use a set of persistent database files to store the learnt tokens, as follows. |
| |
| =over 4 |
| |
| =item bayes_toks |
| |
| The database of tokens, containing the tokens learnt, their count of |
| occurrences in ham and spam, and the timestamp when the token was last |
| seen in a message. |
| |
| This database also contains some 'magic' tokens, as follows: the version |
| number of the database, the number of ham and spam messages learnt, the |
| number of tokens in the database, and timestamps of: the last journal |
| sync, the last expiry run, the last expiry token reduction count, the |
| last expiry timestamp delta, the oldest token timestamp in the database, |
| and the newest token timestamp in the database. |
| |
| This is a database file, using C<DB_File>. The database 'version |
| number' is 0 for databases from 2.5x, 1 for databases from certain 2.6x |
| development releases, 2 for 2.6x, and 3 for 3.0 and later releases. |
| |
| =item bayes_seen |
| |
| A map of Message-Id and some data from headers and body to what that |
| message was learnt as. This is used so that SpamAssassin can avoid |
| re-learning a message it has already seen, and so it can reverse the |
| training if you later decide that message was learnt incorrectly. |
| |
| This is a database file, using C<DB_File>. |
| |
| =item bayes_journal |
| |
| While SpamAssassin is scanning mails, it needs to track which tokens |
| it uses in its calculations. To avoid the contention of having each |
| SpamAssassin process attempting to gain write access to the Bayes DB, |
| the token timestamps are written to a 'journal' file which will later |
| (either automatically or via C<sa-learn --sync>) be used to synchronize |
| the Bayes DB. |
| |
| Also, through the use of C<bayes_learn_to_journal>, or when using the |
| C<--no-sync> option with sa-learn, the actual learning data will |
| be placed into the journal for later synchronization. This is typically |
| useful for high-traffic sites to avoid the same contention as stated |
| above. |
| |
| =back |
| |
| =head1 EXPIRATION |
| |
| Since SpamAssassin can auto-learn messages, the Bayes database files |
| could increase perpetually until they fill your disk. To control this, |
| SpamAssassin performs journal synchronization and bayes expiration |
| periodically when certain criteria (listed below) are met. |
| |
| SpamAssassin can sync the journal and expire the DB tokens either |
| manually or opportunistically. A journal sync is due if I<--sync> |
| is passed to sa-learn (manual), or if the following is true |
| (opportunistic): |
| |
| =over 4 |
| |
| =item - bayes_journal_max_size does not equal 0 (a value of 0 means don't sync) |
| |
| =item - the journal file exists |
| |
| =back |
| |
| and either: |
| |
| =over 4 |
| |
| =item - the journal file has a size greater than bayes_journal_max_size |
| |
| =back |
| |
| or |
| |
| =over 4 |
| |
| =item - a journal sync has previously occurred, and at least 1 day has |
| passed since that sync |
| |
| =back |
| |
| Expiry is due if I<--force-expire> is passed to sa-learn (manual), |
| or if all of the following are true (opportunistic): |
| |
| =over 4 |
| |
| =item - the last expire was attempted at least 12hrs ago |
| |
| =item - bayes_auto_expire does not equal 0 |
| |
| =item - the number of tokens in the DB is E<gt> 100,000 |
| |
| =item - the number of tokens in the DB is E<gt> bayes_expiry_max_db_size |
| |
| =item - there is at least a 12 hr difference between the oldest and newest token atimes |
| |
| =back |
| |
| =head2 EXPIRE LOGIC |
| |
| If either the manual or opportunistic method causes an expire run |
| to start, here is the logic that is used: |
| |
| =over 4 |
| |
| =item - figure out how many tokens to keep. take the larger of |
| either bayes_expiry_max_db_size * 75% or 100,000 tokens. therefore, the goal |
| reduction is number of tokens - number of tokens to keep. |
| |
| =item - if the reduction number is < 1000 tokens, abort (not worth the effort). |
| |
| =item - if an expire has been done before, guesstimate the new |
| atime delta based on the old atime delta. (new_atime_delta = |
| old_atime_delta * old_reduction_count / goal) |
| |
| =item - if no expire has been done before, or the last expire looks |
| "weird", do an estimation pass. The definition of "weird" is: |
| |
| =over 8 |
| |
| =item - last expire over 30 days ago |
| |
| =item - last atime delta was < 12 hrs |
| |
| =item - last reduction count was < 1000 tokens |
| |
| =item - estimated new atime delta is < 12 hrs |
| |
| =item - the difference between the last reduction count and the goal reduction count is E<gt> 50% |
| |
| =back |
| |
| =back |
| |
| =head2 ESTIMATION PASS LOGIC |
| |
| Go through each of the DB's tokens. Starting at 12hrs, calculate |
| whether or not the token would be expired (based on the difference |
| between the token's atime and the db's newest token atime) and keep |
| the count. Work out from 12hrs exponentially by powers of 2. ie: |
| 12hrs * 1, 12hrs * 2, 12hrs * 4, 12hrs * 8, and so on, up to 12hrs |
| * 512 (6144hrs, or 256 days). |
| |
| The larger the delta, the smaller the number of tokens that will |
| be expired. Conversely, the number of tokens goes up as the delta |
| gets smaller. So starting at the largest atime delta, figure out |
| which delta will expire the most tokens without going above the |
| goal expiration count. Use this to choose the atime delta to use, |
| unless one of the following occurs: |
| |
| =over 8 |
| |
| =item - the largest atime (smallest reduction count) would expire |
| too many tokens. this means the learned tokens are mostly old and |
| there needs to be new tokens learned before an expire can |
| occur. |
| |
| =item - all of the atime choices result in 0 tokens being removed. |
| this means the tokens are all newer than 12 hours and there needs |
| to be new tokens learned before an expire can occur. |
| |
| =item - the number of tokens that would be removed is < 1000. the |
| benefit isn't worth the effort. more tokens need to be learned. |
| |
| =back |
| |
| If the expire run gets past this point, it will continue to the end. |
| A new DB is created since the majority of DB libraries don't shrink the |
| DB file when tokens are removed. So we do the "create new, migrate old |
| to new, remove old, rename new" shuffle. |
| |
| =head2 EXPIRY RELATED CONFIGURATION SETTINGS |
| |
| =over 4 |
| |
| =item C<bayes_auto_expire> is used to specify whether or not SpamAssassin |
| ought to opportunistically attempt to expire the Bayes database. |
| The default is 1 (yes). |
| |
| =item C<bayes_expiry_max_db_size> specifies both the auto-expire token |
| count point, as well as the resulting number of tokens after expiry |
| as described above. The default value is 150,000, which is roughly |
| equivalent to a 6Mb database file if you're using DB_File. |
| |
| =item C<bayes_journal_max_size> specifies how large the Bayes |
| journal will grow before it is opportunistically synced. The |
| default value is 102400. |
| |
| =back |
| |
| =head1 INSTALLATION |
| |
| The B<sa-learn> command is part of the B<Mail::SpamAssassin> Perl module. |
| Install this as a normal Perl module, using C<perl -MCPAN -e shell>, |
| or by hand. |
| |
| =head1 SEE ALSO |
| |
| spamassassin(1) |
| spamc(1) |
| Mail::SpamAssassin(3) |
| Mail::SpamAssassin::ArchiveIterator(3) |
| |
| E<lt>http://www.paulgraham.com/E<gt> |
| Paul Graham's "A Plan For Spam" paper |
| |
| E<lt>http://www.linuxjournal.com/article/6467E<gt> |
| Gary Robinson's f(x) and combining algorithms, as used in SpamAssassin |
| |
| E<lt>http://web.archive.org/web/20120512230723/http://www.bgl.nu/~glouis/bogofilter/E<gt> |
| 'Training on error' page. A discussion of various Bayes training regimes, |
| including 'train on error' and unsupervised training. |
| |
| =head1 PREREQUISITES |
| |
| C<Mail::SpamAssassin> |
| |
| =head1 AUTHORS |
| |
| The SpamAssassin(tm) Project E<lt>https://spamassassin.apache.org/E<gt> |
| |
| =cut |
| |