promotions validated
git-svn-id: https://svn.apache.org/repos/asf/spamassassin/tags/sa-update_3.4.1_20150415085034@1673682 13f79535-47bb-0310-9956-ffa450edef68
diff --git a/MANIFEST b/MANIFEST
index edef111..e1861d2 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -142,7 +142,6 @@
rules/v330.pre
rules/v340.pre
rules/v341.pre
-rules/20_aux_tlds.cf
sa-awl.raw
sa-check_spamd.raw
sa-compile.raw
@@ -420,7 +419,6 @@
t/html_colors.t
t/html_obfu.t
t/html_utf8.t
-t/idn_dots.t
t/if_can.t
t/ifversion.t
t/ip_addrs.t
@@ -550,8 +548,3 @@
t/uribl_all_types.t
t/uribl_ips_only.t
t/dnsbl_subtests.t
-powered_by/128-powered-by-spamassassin.png
-powered_by/256-powered-by-spamassassin.png
-powered_by/512-powered-by-spamassassin.png
-powered_by/LOGO_USAGE.TXT
-powered_by/powered_by_spamassassin.psd
diff --git a/build/README b/build/README
index 7458a2c..d9066d8 100644
--- a/build/README
+++ b/build/README
@@ -66,6 +66,16 @@
(ie., no "M" or "C" files; any files marked "M" have been locally
modified, and should be "svn revert"ed before you continue.)
+#- consider updating the TLD list in
+# Mail/SpamAssassin/Util/RegistrarBoundaries.pm
+#
+# Follow the documentation under %VALID_TLDS and $VALID_TLDS_RE for
+# updating the TLD list, make test, and do a commit if there are any
+# changes from the previous TLD list.
+#
+# Currently, the only way to change the TLD list is via a release as
+# of 6/17/2014.
+
- Consider updating the TLD list in 20_aux_tlds.cf. As of 4/6/2015,
this is not automated but bug 7165 is open for this purpose.
diff --git a/build/announcements/3.4.1-rc2.txt b/build/announcements/3.4.1-rc2.txt
index 747e9bf..915cf21 100644
--- a/build/announcements/3.4.1-rc2.txt
+++ b/build/announcements/3.4.1-rc2.txt
@@ -18,106 +18,129 @@
Notable features:
=================
-New plugins
------------
+Bug 7115: Adding SHA digests of MIME parts as Bayes tokens allows bayes
+to see non-textual content - added configurability
-There are three new plugins added with this release:
+rewritten Node::_normalize
- Mail::SpamAssassin::Plugin::TxRep
- Mail::SpamAssassin::Plugin::PDFInfo
- Mail::SpamAssassin::Plugin::URILocalBL
-
-The TxRep (Reputation) plugin is designed as a substantially improved
-replacement of the AWL plugin. It adjusts the final message spam score
-by looking up and taking in consideration the reputation of the sender.
-It cannot coexist with the old AWL plugin, which must be disabled when
-the TxRep is loaded.
-
-The PDFInfo plugin helps detected spam using attached PDF files.
-
-The URILocalBL plugin creates some new rule test types, such as
-"uri_block_cc", "uri_block_cidr", and "uri_block_isp". These rules
-apply to the URIs found in the HTML portion of a message, i.e.
-<a href=...> markup.
-
-All these three plugins are disabled by default. To enable, uncomment
-the loadplugin configuration options in file v341.pre or add them to
-some local .pre file such as local.pre .
-
-Plugins are documented in their respective man pages.
-
-
-Notable changes
----------------
-
-Adding SHA digests of MIME parts as Bayes tokens allows bayes
-to see non-textual content. The set of sources of bayes tokens is
-configurable with a new configuration option 'bayes_token_sources'
-as documented in the Mail::SpamAssassin::Conf man page, disabled
-by default for backward compatibility. (Bug 7115)
-
-Subroutine Node::_normalize has been rewritten. The new behavior
-is documented with the 'normalize_charset' option in the
-Mail::SpamAssassin::Conf man page. (Bug 7144, Bug 7126, Bug 7133)
-
-Tokenization of UTF-8 -encoded or normalized text has been improved
-in the Bayes plugin. (Bug 7130, Bug 7135, Bug 7141)
+improved tokenization of UTF-8 -encoded or normalized text in
+the Bayes plugin
New configuration options
-------------------------
-The 'normalize_charset' configuration option already existed in previous
-versions, but its functionality has been re-implemented to put more
-emphasis on the declared character set of a MIME part instead of relying
-on guesswork by Encode::Detect::Detector. When enabled, it converts
-non- UTF-8 textual parts of a mail message into UTF-8 encoding, before
-passing them to HTML decoding and to rules processing. This makes it
-possible to write regular expressions and strings in rules in UTF-8
-encoding, and allows plugins (such as tokenization in a Bayes plugin)
-to recognize multibyte characters and words in non-English languages
-as such, instead of 'randomly' considering some non-ASCII octets in
-multibyte characters as delimiters. Please see documentation for this
-configuration option in the Mail::SpamAssassin::Conf man page.
-
-The configuration option 'dns_server' can now specify a scoped
-link-local IPv6 address, e.g.: dns_server [fe80::1%lo0]:53
-
-A new configuration option 'bayes_token_sources' allows more control
-on the sources of tokens for the Bayes plugin. For compatibility
-the default set of sources is unchanged, but consider:
- bayes_token_sources all
-or: bayes_token_sources mimepart
-to include SHA1 digests of all MIME parts of a message as Bayes tokens.
-Please see documentation for this option in the Mail::SpamAssassin::Conf
-man page.
-
-A new configuration option 'dkim_minimum_key_bits' with a default value
-of 1024 bits now controls the smallest size of a signing key (in bits)
-for a valid signature to be considered for whitelisting. Please see
-documentation for this option in the Mail::SpamAssassin::Plugin::DKIM
-man page.
-
-A new configuration option 'parse_dkim_uris' allows DKIM header fields
-to be parsed for URIs to process alongside URIs found in the body with
-some rules and modules (e.g. URIDNSBL).
-
-The configuration option 'check_rbl_from_domain' checks all the domain
-names in a From mail address as an alternate to check_rbl_from_host.
-As of v3.4.1, it has been improved to include a subtest for a specific
-octet.
-
-
-
-
-
-??? perl_version
-??? (Introduced in 3.4.2) This will be replaced with the version
-???-->> THIS NEEDS TO BE FIXED in Conf.pm, WE ARE AT 3.4.1
-
Added flag 'noawl' to the 'tflags' configuration option.
+parse_dkim_uris ( 0 | 1 ) (default: 0)
+
+ If this option is set to 1 and the message contains DKIM headers,
+ the headers will be parsed for URIs to process alongside URIs found
+ in the body with some rules and modules (e.g. URIDNSBL)
+
+
+perl_version
+ (Introduced in 3.4.2) This will be replaced with the version
+-->> THIS NEEDS TO BE FIXED in Conf.pm, WE ARE AT 3.4.1
+
+
+changed implementation, may produce different result in some cases:
+
+normalize_charset ( 0 | 1 ) (default: 0)
+ Whether to decode non- UTF-8 and non-ASCII textual parts and recode
+ them to UTF-8 before the text is given over to rules processing.
+ The character set used for attempted decoding is primarily based on
+ a declared character set in a Content-Type header, but if the
+ decoding attempt fails a module Encode::Detect::Detector is
+ consulted (if available) to provide a guess based on the actual
+ text, and decoding is re-attempted. Even if the option is enabled
+ no unnecessary decoding and re-encoding work is done when possible
+ (like with an all-ASCII text with a US-ASCII or extended ASCII
+ character set declaration, e.g. UTF-8 or ISO-8859-nn or Windows-nnnn).
+
+ Unicode support in old versions of perl or in a core module Encode
+ is likely to be buggy in places, so if the normalize_charset
+ function is enabled it is advised to stick to more recent versions
+ of perl (preferably 5.12 or later). The module
+ Encode::Detect::Detector is optional, when necessary it will be
+ used if it is available.
+
+
+option dns_server can now specify a link-local IPv6 address, e.g.:
+ dns_server [fe80::1%lo0]:53
+
+
+new option:
+
+bayes_token_sources (default: header visible invisible uri)
+ Controls which sources in a mail message can contribute tokens
+ (e.g. words, phrases, etc.) to a Bayes classifier. The argument is
+ a space-separated list of keywords (header, visible, invisible,
+ uri, mimepart), each of which may be prefixed by "no" to indicate
+ its exclusion. Additionally two reserved keywords are allowed: all
+ and none (or: noall). The list of keywords is processed
+ sequentially: a keyword all adds all available keywords to a set
+ being built, a none or noall clears the set, other non-negated
+ keywords are added to the set, and negated keywords are removed
+ from the set. Keywords are case-insensitive.
+
+ The default set is: header visible invisible uri, which is
+ equivalent for example to: All NoMIMEpart. The reason why mimepart
+ is not currently in a default set is that it is a newer source
+ (introduced with SpamAssassin version 3.4.1) and not much
+ experience has yet been gathered regarding its usefulness.
+
+ See also option "bayes_ignore_header" for a fine-grained control on
+ individual header fields under the umbrella of a more general
+ keyword header here.
+
+ Keywords imply the following data sources:
+ header - tokens collected from a message header section
+ visible - words from visible text (plain or HTML) in a message body
+ invisible - hidden/invisible text in HTML parts of a message body
+ uri - URIs collected from a message body
+ mimepart - digests (hashes) of all MIME parts (textual or non-
+ textual) of a message, computed after Base64 and quoted-printable
+ decoding, suffixed by their Content-Type
+ all - adds all the above keywords to the set being assembled
+ none or noall - removes all keywords from the set
+
+ The "bayes_token_sources" directive may appear multiple times, its
+ keywords are interpreted sequentially, adding or removing items
+ from the final set as they appear in their order in
+ "bayes_token_sources" directive(s).
+
+
+new option:
+
+dkim_minimum_key_bits n (default: 1024)
+ The smallest size of a signing key (in bits) for a valid signature
+ to be considered for whitelisting. Additionally, the eval function
+ check_dkim_valid() will return false on short keys when called with
+ explicitly listed domains, and the eval function
+ check_dkim_valid_author_sig() will return false on short keys
+ (regardless of its arguments). Setting the option to 0 disables a
+ key size check.
+
+ Note that the option has no effect when the eval function
+ check_dkim_valid() is called with no arguments (like in a rule
+ DKIM_VALID). A mere presence of some valid signature on a message
+ has no reputational value (without being associated with a
+ particular domain), regardless of its key size - anyone can prepend
+ their own signature on a copy of some third party mail and re-send
+ it, which makes it no more trustworthy than without such signature.
+ This is also a reason for a rule DKIM_VALID to have a near-zero score.
+
+
+change:
+
+check_rbl_from_domain
+ This checks all the domain names in the From addresses as an alternative to
+ check_rbl_from_host. As of v3.4.1, it has been improved to include
+ a subtest for a specific octet.
+
+
new template tags:
_SENDERDOMAIN_ a domain name of the envelope sender address, lowercased
_AUTHORDOMAIN_ a domain name of the author address (the From header
@@ -145,6 +168,16 @@
+New plugins
+-----------
+
+New plugin (optional):
+# loadplugin Mail::SpamAssassin::Plugin::TxRep
+# loadplugin Mail::SpamAssassin::Plugin::PDFInfo ???
+
+URILocalBL.pm ???
+
+
Rule updates
------------
@@ -213,7 +246,7 @@
Bug 7136: added has_check_for_spf_errors and if can() encapsulation
-Bug 7128: DCC plugin now uses IO::Socket::IP instead of IO::Socket::INET6
+Bug 7128: DCC plugin now uses IO::Socket::INET6 instead of IO::Socket::IP
Bug 7099: Adding tags SENDERDOMAIN and AUTHORDOMAIN
@@ -249,12 +282,12 @@
unnecessary copying was avoided when reading from a temporary file
in SA::Message::Node (small optimization)
+changed fillfactor for postgres bayes/awl tables to optimize for updates
+
a small hotspot in DnsResolver.pm was optimized
use faster utf8::encode instead of Encode::encode_utf8
-changed fillfactor for postgres bayes/awl tables to optimize for updates
-
disabled synchronous commit for Postgres Bayes store
diff --git a/lib/Mail/SpamAssassin/Conf.pm b/lib/Mail/SpamAssassin/Conf.pm
index c57edab..7deccc4 100644
--- a/lib/Mail/SpamAssassin/Conf.pm
+++ b/lib/Mail/SpamAssassin/Conf.pm
@@ -1139,29 +1139,29 @@
unless (defined $value && $value !~ /^$/) {
return $MISSING_REQUIRED_VALUE;
}
- if (lc $value eq 'yes' || $value eq '1') { $value = 1 }
- elsif (lc $value eq 'no' || $value eq '0') { $value = 0 }
- else { return $INVALID_VALUE }
-
- $self->{normalize_charset} = $value;
+ return if $value == 0;
+ return $INVALID_VALUE unless $value == 1;
unless ($] > 5.008004) {
$self->{parser}->lint_warn("config: normalize_charset requires Perl 5.8.5 or later");
- $self->{normalize_charset} = 0;
return $INVALID_VALUE;
}
require HTML::Parser;
#changed to eval to use VERSION so that this version was not incorrectly parsed for CPAN
unless ( eval { HTML::Parser->VERSION(3.46) } ) {
$self->{parser}->lint_warn("config: normalize_charset requires HTML::Parser 3.46 or later");
- $self->{normalize_charset} = 0;
return $INVALID_VALUE;
}
+# unless (eval 'require Encode::Detect::Detector') {
+# $self->{parser}->lint_warn("config: normalize_charset requires Encode::Detect::Detector");
+# return $INVALID_VALUE;
+# }
unless (eval 'require Encode') {
$self->{parser}->lint_warn("config: normalize_charset requires Encode");
- $self->{normalize_charset} = 0;
return $INVALID_VALUE;
}
+
+ $self->{normalize_charset} = 1;
}
});
diff --git a/lib/Mail/SpamAssassin/Message/Node.pm b/lib/Mail/SpamAssassin/Message/Node.pm
index f1ad898..8833d7a 100644
--- a/lib/Mail/SpamAssassin/Message/Node.pm
+++ b/lib/Mail/SpamAssassin/Message/Node.pm
@@ -486,11 +486,6 @@
$chset = 'Windows-1252'; $decoder = $enc_w1252;
} else {
$chset = $charset_declared; $decoder = Encode::find_encoding($chset);
- if (!$decoder && $chset =~ /^GB[ -]?18030(?:-20\d\d)?\z/i) {
- $decoder = Encode::find_encoding('GBK'); # a subset of GB18030
- dbg("message: no decoder for a declared charset %s, using GBK",
- $chset) if $decoder;
- }
}
if (!$decoder) {
dbg("message: failed decoding, no decoder for a declared charset %s",
@@ -544,11 +539,6 @@
my $charset_detected = Encode::Detect::Detector::detect($_[1]);
if ($charset_detected && lc $charset_detected ne lc $charset_declared) {
my $decoder = Encode::find_encoding($charset_detected);
- if (!$decoder && $charset_detected =~ /^GB[ -]?18030(?:-20\d\d)?\z/i) {
- $decoder = Encode::find_encoding('GBK'); # a subset of GB18030
- dbg("message: no decoder for a detected charset %s, using GBK",
- $charset_detected) if $decoder;
- }
if (!$decoder) {
dbg("message: failed decoding, no decoder for a detected charset %s",
$charset_detected);
diff --git a/rules/local.cf b/rules/local.cf
index 95bc494..a40b217 100644
--- a/rules/local.cf
+++ b/rules/local.cf
@@ -52,11 +52,6 @@
# bayes_ignore_header X-Spam-Status
-# Whether to decode non- UTF-8 and non-ASCII textual parts and recode
-# them to UTF-8 before the text is given over to rules processing.
-#
-# normalize_charset 1
-
# Some shortcircuiting, if the plugin is enabled
#
ifplugin Mail::SpamAssassin::Plugin::Shortcircuit
diff --git a/sql/README.txrep b/sql/README.txrep
index 38b975b..df0b5be 100644
--- a/sql/README.txrep
+++ b/sql/README.txrep
@@ -38,9 +38,9 @@
user_awl_dsn DBI:mysql:spamassassin:localhost
Would tell SpamAssassin to connect to the database named spamassassin using
-MySQL on the local server, and since <port> is omitted, the driver will use
-the default port number. The other two required options tells SpamAssassin
-to use the defined username and password to establish the connection.
+MySQL on the local server, and since <port> is omitted, the driver will use the
+default port number. The other two required options tell SpamAssassin to use
+the defined username and password to establish the connection.
If the user_awl_dsn option does not exist, SpamAssassin will not attempt
to use SQL for tracking reputations.
@@ -85,12 +85,3 @@
you must specify the proper storage backend in the config file in order
for this to work and the current username must be passed to spamd.
-Maintenance
----------------
-
-It is recommended to keep user_awl_sql_table clear of stale data, for
-performance reasons. A sample query that can be run on a regular
-schedule is below:
-
-DELETE FROM txrep WHERE last_hit <= (now() - INTERVAL 120 day);
-
diff --git a/sql/txrep_mysql.sql b/sql/txrep_mysql.sql
index bbe6b95..f540196 100644
--- a/sql/txrep_mysql.sql
+++ b/sql/txrep_mysql.sql
@@ -5,7 +5,5 @@
count int(11) NOT NULL default '0',
totscore float NOT NULL default '0',
signedby varchar(255) NOT NULL default '',
- last_hit timestamp NOT NULL default CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
- PRIMARY KEY (username,email,signedby,ip),
- KEY last_hit (last_hit)
+ PRIMARY KEY (username,email,signedby,ip)
) ENGINE=InnoDB;
diff --git a/sql/txrep_pg.sql b/sql/txrep_pg.sql
index 52c1003..591758d 100644
--- a/sql/txrep_pg.sql
+++ b/sql/txrep_pg.sql
@@ -5,9 +5,7 @@
count int(11) NOT NULL default '0',
totscore float NOT NULL default '0',
signedby varchar(255) NOT NULL default '',
- last_hit timestamp NOT NULL default CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
- PRIMARY KEY (username,email,signedby,ip),
- KEY last_hit (last_hit)
+ PRIMARY KEY (username,email,signedby,ip)
);
ALTER TABLE txrep SET (fillfactor=95);
diff --git a/t/idn_dots.t b/t/idn_dots.t
deleted file mode 100755
index 7073d58..0000000
--- a/t/idn_dots.t
+++ /dev/null
@@ -1,100 +0,0 @@
-#!/usr/bin/perl -w
-
-# test URIs with UTF8 IDNA-equivalent dots between domains instead of ordinary '.'
-
-BEGIN {
- if (-e 't/test_dir') { # if we are running "t/rule_names.t", kluge around ...
- chdir 't';
- }
-
- if (-e 'test_dir') { # running from test directory, not ..
- unshift(@INC, '../blib/lib');
- }
-}
-
-my $prefix = '.';
-if (-e 'test_dir') { # running from test directory, not ..
- $prefix = '..';
-}
-
-use strict;
-use SATest; sa_t_init("normalize_utf8_dots.t");
-use Test;
-use Mail::SpamAssassin;
-use vars qw(%patterns %anti_patterns);
-
-# settings
-plan tests => 6;
-
-# initialize SpamAssassin
-my $sa = create_saobj({dont_copy_prefs => 1});
-$sa->init(0); # parse rules
-
-# load tests and write mail
-%patterns = ();
-%anti_patterns = ();
-my $message = write_mail();
-
-my $mail = $sa->parse($message);
-my $msg = Mail::SpamAssassin::PerMsgStatus->new($sa, $mail);
-
-my $uris = join("\n", $msg->get_uri_list(), "");
-
-# run patterns and anti-patterns
-my $failures = 0;
-for my $pattern (keys %patterns) {
- if (!ok($uris =~ /${pattern}/m)) {
- warn "failure: did not find /$pattern/\n";
- $failures++;
- }
-}
-
-for my $anti_pattern (keys %anti_patterns) {
- if (!ok($uris !~ /${anti_pattern}/m)) {
- warn "failure: did find /$anti_pattern/\n";
- $failures++;
- }
-}
-
-if ($failures) {
- print "URIs found:\n$uris";
-}
-
-# function to write test email
-sub write_mail {
- my $message = <<'EOF';
-Message-ID: <clean.1010101@example.com>
-Date: Mon, 07 Oct 2002 09:00:00 +0000
-From: Sender <sender@example.com>
-MIME-Version: 1.0
-To: Recipient <recipient@example.com>
-Subject: this is a trivial message
-Content-Type: text/plain; charset="UTF-8"
-Content-Transfer-Encoding: 8bit
-
-EOF
-
- # Characters that look like a fullstop
- my @delims = split(//, "\x{002E}\x{3002}\x{FF0E}\x{FF61}\x{FE52}\x{2024}");
- my $i = 0;
-
- foreach my $delim_char (@delims) {
- $i++;
- my $delim = $delim_char; utf8::encode($delim); # to UTF-8 octets
- my $string = "http://utf$i" . $delim . "example" . $delim . "com";
- my @patterns = ("^http://utf$i\\.example\\.com\$");
- if ($string && @patterns) {
- $message .= "$string\n";
- for my $pattern (@patterns) {
- if ($pattern =~ /^!(.*)/) {
- $anti_patterns{$1} = 1;
- }
- else {
- $patterns{$pattern} = 1;
- }
- }
- }
- }
-
- return $message;
-}