promotions validated

git-svn-id: https://svn.apache.org/repos/asf/spamassassin/tags/sa-update_3.4.1_20150415085034@1673682 13f79535-47bb-0310-9956-ffa450edef68
diff --git a/MANIFEST b/MANIFEST
index edef111..e1861d2 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -142,7 +142,6 @@
 rules/v330.pre
 rules/v340.pre
 rules/v341.pre
-rules/20_aux_tlds.cf
 sa-awl.raw
 sa-check_spamd.raw
 sa-compile.raw
@@ -420,7 +419,6 @@
 t/html_colors.t
 t/html_obfu.t
 t/html_utf8.t
-t/idn_dots.t
 t/if_can.t
 t/ifversion.t
 t/ip_addrs.t
@@ -550,8 +548,3 @@
 t/uribl_all_types.t
 t/uribl_ips_only.t
 t/dnsbl_subtests.t
-powered_by/128-powered-by-spamassassin.png
-powered_by/256-powered-by-spamassassin.png
-powered_by/512-powered-by-spamassassin.png
-powered_by/LOGO_USAGE.TXT
-powered_by/powered_by_spamassassin.psd
diff --git a/build/README b/build/README
index 7458a2c..d9066d8 100644
--- a/build/README
+++ b/build/README
@@ -66,6 +66,16 @@
   (ie., no "M" or "C" files; any files marked "M" have been locally
   modified, and should be "svn revert"ed before you continue.)
 
+#- consider updating the TLD list in
+#  Mail/SpamAssassin/Util/RegistrarBoundaries.pm
+#
+#  Follow the documentation under %VALID_TLDS and $VALID_TLDS_RE for
+#  updating the TLD list, make test, and do a commit if there are any
+#  changes from the previous TLD list.
+#
+#  Currently, the only way to change the TLD list is via a release as 
+#  of 6/17/2014.
+
 - Consider updating the TLD list in 20_aux_tlds.cf.  As of 4/6/2015,
   this is not automated but bug 7165 is open for this purpose.
 
diff --git a/build/announcements/3.4.1-rc2.txt b/build/announcements/3.4.1-rc2.txt
index 747e9bf..915cf21 100644
--- a/build/announcements/3.4.1-rc2.txt
+++ b/build/announcements/3.4.1-rc2.txt
@@ -18,106 +18,129 @@
 Notable features:
 =================
 
-New plugins
------------
+Bug 7115: Adding SHA digests of MIME parts as Bayes tokens allows bayes
+to see non-textual content - added configurability
 
-There are three new plugins added with this release:
+rewritten Node::_normalize
 
-  Mail::SpamAssassin::Plugin::TxRep
-  Mail::SpamAssassin::Plugin::PDFInfo
-  Mail::SpamAssassin::Plugin::URILocalBL
-
-The TxRep (Reputation) plugin is designed as a substantially improved
-replacement of the AWL plugin. It adjusts the final message spam score
-by looking up and taking in consideration the reputation of the sender.
-It cannot coexist with the old AWL plugin, which must be disabled when
-the TxRep is loaded.
-
-The PDFInfo plugin helps detected spam using attached PDF files.
-
-The URILocalBL plugin creates some new rule test types, such as
-"uri_block_cc", "uri_block_cidr", and "uri_block_isp".  These rules
-apply to the URIs found in the HTML portion of a message, i.e.
-<a href=...> markup.
-
-All these three plugins are disabled by default. To enable, uncomment
-the loadplugin configuration options in file v341.pre or add them to
-some local .pre file such as local.pre .
-
-Plugins are documented in their respective man pages.
-
-
-Notable changes
----------------
-
-Adding SHA digests of MIME parts as Bayes tokens allows bayes
-to see non-textual content. The set of sources of bayes tokens is
-configurable with a new configuration option 'bayes_token_sources'
-as documented in the Mail::SpamAssassin::Conf man page, disabled
-by default for backward compatibility. (Bug 7115)
-
-Subroutine Node::_normalize has been rewritten. The new behavior
-is documented with the 'normalize_charset' option in the
-Mail::SpamAssassin::Conf man page. (Bug 7144, Bug 7126, Bug 7133)
-
-Tokenization of UTF-8 -encoded or normalized text has been improved
-in the Bayes plugin. (Bug 7130, Bug 7135, Bug 7141)
+improved tokenization of UTF-8 -encoded or normalized text in
+the Bayes plugin
 
 
 New configuration options
 -------------------------
 
-The 'normalize_charset' configuration option already existed in previous
-versions, but its functionality has been re-implemented to put more
-emphasis on the declared character set of a MIME part instead of relying
-on guesswork by Encode::Detect::Detector. When enabled, it converts
-non- UTF-8 textual parts of a mail message into UTF-8 encoding, before
-passing them to HTML decoding and to rules processing. This makes it
-possible to write regular expressions and strings in rules in UTF-8
-encoding, and allows plugins (such as tokenization in a Bayes plugin)
-to recognize multibyte characters and words in non-English languages
-as such, instead of 'randomly' considering some non-ASCII octets in
-multibyte characters as delimiters. Please see documentation for this
-configuration option in the Mail::SpamAssassin::Conf man page.
-
-The configuration option 'dns_server' can now specify a scoped
-link-local IPv6 address, e.g.:  dns_server [fe80::1%lo0]:53
-
-A new configuration option 'bayes_token_sources' allows more control
-on the sources of tokens for the Bayes plugin. For compatibility
-the default set of sources is unchanged, but consider: 
-    bayes_token_sources all
-or: bayes_token_sources mimepart
-to include SHA1 digests of all MIME parts of a message as Bayes tokens.
-Please see documentation for this option in the Mail::SpamAssassin::Conf
-man page.
-
-A new configuration option 'dkim_minimum_key_bits' with a default value
-of 1024 bits now controls the smallest size of a signing key (in bits)
-for a valid signature to be considered for whitelisting. Please see
-documentation for this option in the Mail::SpamAssassin::Plugin::DKIM
-man page.
-
-A new configuration option 'parse_dkim_uris' allows DKIM header fields
-to be parsed for URIs to process alongside URIs found in the body with
-some rules and modules (e.g. URIDNSBL).
-
-The configuration option 'check_rbl_from_domain' checks all the domain
-names in a From mail address as an alternate to check_rbl_from_host.
-As of v3.4.1, it has been improved to include a subtest for a specific
-octet.
-
-
-
-
-
-??? perl_version
-???   (Introduced in 3.4.2)  This will be replaced with the version
-???-->>  THIS NEEDS TO BE FIXED in Conf.pm, WE ARE AT 3.4.1
-
 Added flag 'noawl' to the 'tflags' configuration option.
 
 
+parse_dkim_uris ( 0 | 1 ) (default: 0)
+
+  If this option is set to 1 and the message contains DKIM headers,
+  the headers will be parsed for URIs to process alongside URIs found
+  in the body with some rules and modules (e.g. URIDNSBL).
+
+
+perl_version
+  (Introduced in 3.4.2)  This will be replaced with the version
+-->>  THIS NEEDS TO BE FIXED in Conf.pm, WE ARE AT 3.4.1
+
+
+changed implementation, may produce different result in some cases:
+
+normalize_charset ( 0 | 1)        (default: 0)
+  Whether to decode non- UTF-8 and non-ASCII textual parts and recode
+  them to UTF-8 before the text is given over to rules processing.
+  The character set used for attempted decoding is primarily based on
+  a declared character set in a Content-Type header, but if the
+  decoding attempt fails a module Encode::Detect::Detector is
+  consulted (if available) to provide a guess based on the actual
+  text, and decoding is re-attempted. Even if the option is enabled
+  no unnecessary decoding and re-encoding work is done when possible
+  (like with an all-ASCII text with a US-ASCII or extended ASCII
+  character set declaration, e.g. UTF-8 or ISO-8859-nn or Windows-nnnn).
+
+  Unicode support in old versions of perl or in a core module Encode
+  is likely to be buggy in places, so if the normalize_charset
+  function is enabled it is advised to stick to more recent versions
+  of perl (preferably 5.12 or later). The module
+  Encode::Detect::Detector is optional, when necessary it will be
+  used if it is available.
+
+  
+option dns_server can now specify a link-local IPv6 address, e.g.:
+  dns_server [fe80::1%lo0]:53
+
+
+new option:
+
+bayes_token_sources  (default: header visible invisible uri)
+  Controls which sources in a mail message can contribute tokens
+  (e.g. words, phrases, etc.) to a Bayes classifier. The argument is
+  a space-separated list of keywords (header, visible, invisible,
+  uri, mimepart), each of which may be prefixed by "no" to indicate
+  its exclusion. Additionally two reserved keywords are allowed: all
+  and none (or: noall). The list of keywords is processed
+  sequentially: a keyword all adds all available keywords to a set
+  being built, a none or noall clears the set, other non-negated
+  keywords are added to the set, and negated keywords are removed
+  from the set. Keywords are case-insensitive.
+
+  The default set is: header visible invisible uri, which is
+  equivalent for example to: All NoMIMEpart. The reason why mimepart
+  is not currently in a default set is that it is a newer source
+  (introduced with SpamAssassin version 3.4.1) and not much
+  experience has yet been gathered regarding its usefulness.
+
+  See also option "bayes_ignore_header" for a fine-grained control on
+  individual header fields under the umbrella of a more general
+  keyword header here.
+
+  Keywords imply the following data sources:
+    header - tokens collected from a message header section
+    visible - words from visible text (plain or HTML) in a message body
+    invisible - hidden/invisible text in HTML parts of a message body
+    uri - URIs collected from a message body
+    mimepart - digests (hashes) of all MIME parts (textual or non-
+    textual) of a message, computed after Base64 and quoted-printable
+    decoding, suffixed by their Content-Type
+    all - adds all the above keywords to the set being assembled
+    none or noall - removes all keywords from the set
+
+  The "bayes_token_sources" directive may appear multiple times, its
+  keywords are interpreted sequentially, adding or removing items
+  from the final set as they appear in their order in
+  "bayes_token_sources" directive(s).
+
+
+new option:
+
+dkim_minimum_key_bits n             (default: 1024)
+  The smallest size of a signing key (in bits) for a valid signature
+  to be considered for whitelisting. Additionally, the eval function
+  check_dkim_valid() will return false on short keys when called with
+  explicitly listed domains, and the eval function
+  check_dkim_valid_author_sig() will return false on short keys
+  (regardless of its arguments). Setting the option to 0 disables a
+  key size check.
+
+  Note that the option has no effect when the eval function
+  check_dkim_valid() is called with no arguments (like in a rule
+  DKIM_VALID). A mere presence of some valid signature on a message
+  has no reputational value (without being associated with a
+  particular domain), regardless of its key size - anyone can prepend
+  its own signature on a copy of some third party mail and re-send
+  it, which makes it no more trustworthy than without such signature.
+  This is also a reason for a rule DKIM_VALID to have a near-zero score.
+
+
+change:
+
+check_rbl_from_domain
+  This checks all the domain names in the From address as an
+  alternative to check_rbl_from_host.  As of v3.4.1, it has been
+  improved to include a subtest for a specific octet.
+
+
 new template tags:
 _SENDERDOMAIN_  a domain name of the envelope sender address, lowercased
 _AUTHORDOMAIN_  a domain name of the author address (the From header
@@ -145,6 +168,16 @@
 
 
 
+New plugins
+-----------
+
+New plugin (optional):
+# loadplugin Mail::SpamAssassin::Plugin::TxRep
+# loadplugin Mail::SpamAssassin::Plugin::PDFInfo  ???
+
+URILocalBL.pm ???
+
+
 Rule updates
 ------------
 
@@ -213,7 +246,7 @@
 
 Bug 7136: added has_check_for_spf_errors and if can() encapsulation
 
-Bug 7128: DCC plugin now uses IO::Socket::IP instead of IO::Socket::INET6 
+Bug 7128: DCC plugin now uses IO::Socket::INET6 instead of IO::Socket::IP
 
 Bug 7099: Adding tags SENDERDOMAIN and AUTHORDOMAIN
 
@@ -249,12 +282,12 @@
 unnecessary copying was avoided when reading from a temporary file
 in SA::Message::Node (small optimization)
 
+changed fillfactor for postgres bayes/awl tables to optimize for updates
+
 a small hotspot in DnsResolver.pm was optimized
 
 use faster utf8::encode instead of Encode::encode_utf8
 
-changed fillfactor for postgres bayes/awl tables to optimize for updates
-
 disabled synchronous commit for Postgres Bayes store
 
 
diff --git a/lib/Mail/SpamAssassin/Conf.pm b/lib/Mail/SpamAssassin/Conf.pm
index c57edab..7deccc4 100644
--- a/lib/Mail/SpamAssassin/Conf.pm
+++ b/lib/Mail/SpamAssassin/Conf.pm
@@ -1139,29 +1139,29 @@
 	unless (defined $value && $value !~ /^$/) {
 	    return $MISSING_REQUIRED_VALUE;
 	}
-        if    (lc $value eq 'yes' || $value eq '1') { $value = 1 }
-        elsif (lc $value eq 'no'  || $value eq '0') { $value = 0 }
-        else { return $INVALID_VALUE }
-
-	$self->{normalize_charset} = $value;
+	return  if $value == 0;
+	return $INVALID_VALUE unless $value == 1;
 
 	unless ($] > 5.008004) {
 	    $self->{parser}->lint_warn("config: normalize_charset requires Perl 5.8.5 or later");
-	    $self->{normalize_charset} = 0;
 	    return $INVALID_VALUE;
 	}
 	require HTML::Parser;
         #changed to eval to use VERSION so that this version was not incorrectly parsed for CPAN
 	unless ( eval { HTML::Parser->VERSION(3.46) } ) {
 	    $self->{parser}->lint_warn("config: normalize_charset requires HTML::Parser 3.46 or later");
-	    $self->{normalize_charset} = 0;
 	    return $INVALID_VALUE;
 	}
+#	unless (eval 'require Encode::Detect::Detector') {
+#	    $self->{parser}->lint_warn("config: normalize_charset requires Encode::Detect::Detector");
+#	    return $INVALID_VALUE;
+#	}
 	unless (eval 'require Encode') {
 	    $self->{parser}->lint_warn("config: normalize_charset requires Encode");
-	    $self->{normalize_charset} = 0;
 	    return $INVALID_VALUE;
 	}
+
+	$self->{normalize_charset} = 1;
     }
   });
 
diff --git a/lib/Mail/SpamAssassin/Message/Node.pm b/lib/Mail/SpamAssassin/Message/Node.pm
index f1ad898..8833d7a 100644
--- a/lib/Mail/SpamAssassin/Message/Node.pm
+++ b/lib/Mail/SpamAssassin/Message/Node.pm
@@ -486,11 +486,6 @@
       $chset = 'Windows-1252'; $decoder = $enc_w1252;
     } else {
       $chset = $charset_declared; $decoder = Encode::find_encoding($chset);
-      if (!$decoder && $chset =~ /^GB[ -]?18030(?:-20\d\d)?\z/i) {
-        $decoder = Encode::find_encoding('GBK');  # a subset of GB18030
-        dbg("message: no decoder for a declared charset %s, using GBK",
-            $chset)  if $decoder;
-      }
     }
     if (!$decoder) {
       dbg("message: failed decoding, no decoder for a declared charset %s",
@@ -544,11 +539,6 @@
     my $charset_detected = Encode::Detect::Detector::detect($_[1]);
     if ($charset_detected && lc $charset_detected ne lc $charset_declared) {
       my $decoder = Encode::find_encoding($charset_detected);
-      if (!$decoder && $charset_detected =~ /^GB[ -]?18030(?:-20\d\d)?\z/i) {
-        $decoder = Encode::find_encoding('GBK');  # a subset of GB18030
-        dbg("message: no decoder for a detected charset %s, using GBK",
-            $charset_detected)  if $decoder;
-      }
       if (!$decoder) {
         dbg("message: failed decoding, no decoder for a detected charset %s",
             $charset_detected);
diff --git a/rules/local.cf b/rules/local.cf
index 95bc494..a40b217 100644
--- a/rules/local.cf
+++ b/rules/local.cf
@@ -52,11 +52,6 @@
 # bayes_ignore_header X-Spam-Status
 
 
-#   Whether to decode non- UTF-8 and non-ASCII textual parts and recode
-#   them to UTF-8 before the text is given over to rules processing.
-#
-# normalize_charset 1
-
 #   Some shortcircuiting, if the plugin is enabled
 # 
 ifplugin Mail::SpamAssassin::Plugin::Shortcircuit
diff --git a/sql/README.txrep b/sql/README.txrep
index 38b975b..df0b5be 100644
--- a/sql/README.txrep
+++ b/sql/README.txrep
@@ -38,9 +38,9 @@
 user_awl_dsn                DBI:mysql:spamassassin:localhost
 
 Would tell SpamAssassin to connect to the database named spamassassin using
-MySQL on the local server, and since <port> is omitted, the driver will use
-the default port number.  The other two required options tells SpamAssassin
-to use the defined username and password to establish the connection.
+MySQL on the local server, and since <port> is omitted, the driver will use the
+default port number.  The other two required options tell SpamAssassin to use
+the defined username and password to establish the connection.
 
 If the user_awl_dsn option does not exist, SpamAssassin will not attempt
 to use SQL for tracking reputations.
@@ -85,12 +85,3 @@
 you must specify the proper storage backend in the config file in order
 for this to work and the current username must be passed to spamd.
 
-Maintenance
----------------
-
-It is recommended to keep user_awl_sql_table clear of stale data, for
-performance reasons. A sample query that can be run on a regular
-schedule is below:
-
-DELETE FROM txrep WHERE last_hit <= (now() - INTERVAL 120 day);
-
diff --git a/sql/txrep_mysql.sql b/sql/txrep_mysql.sql
index bbe6b95..f540196 100644
--- a/sql/txrep_mysql.sql
+++ b/sql/txrep_mysql.sql
@@ -5,7 +5,5 @@
   count int(11) NOT NULL default '0',
   totscore float NOT NULL default '0',
   signedby varchar(255) NOT NULL default '',
-  last_hit timestamp NOT NULL default CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
-  PRIMARY KEY (username,email,signedby,ip),
-  KEY last_hit (last_hit)
+  PRIMARY KEY (username,email,signedby,ip)
 ) ENGINE=InnoDB;
diff --git a/sql/txrep_pg.sql b/sql/txrep_pg.sql
index 52c1003..591758d 100644
--- a/sql/txrep_pg.sql
+++ b/sql/txrep_pg.sql
@@ -5,9 +5,7 @@
   count int(11) NOT NULL default '0',
   totscore float NOT NULL default '0',
   signedby varchar(255) NOT NULL default '',
-  last_hit timestamp NOT NULL default CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
-  PRIMARY KEY (username,email,signedby,ip),
-  KEY last_hit (last_hit)
+  PRIMARY KEY (username,email,signedby,ip)
 );
 
 ALTER TABLE txrep SET (fillfactor=95);
diff --git a/t/idn_dots.t b/t/idn_dots.t
deleted file mode 100755
index 7073d58..0000000
--- a/t/idn_dots.t
+++ /dev/null
@@ -1,100 +0,0 @@
-#!/usr/bin/perl -w
-
-# test URIs with UTF8 IDNA-equivalent dots between domains instead of ordinary '.'
-
-BEGIN {
-  if (-e 't/test_dir') { # if we are running "t/rule_names.t", kluge around ...
-    chdir 't';
-  }
-
-  if (-e 'test_dir') {            # running from test directory, not ..
-    unshift(@INC, '../blib/lib');
-  }
-}
-
-my $prefix = '.';
-if (-e 'test_dir') {            # running from test directory, not ..
-  $prefix = '..';
-}
-
-use strict;
-use SATest; sa_t_init("normalize_utf8_dots.t");
-use Test;
-use Mail::SpamAssassin;
-use vars qw(%patterns %anti_patterns);
-
-# settings
-plan tests => 6;
-
-# initialize SpamAssassin
-my $sa = create_saobj({dont_copy_prefs => 1});
-$sa->init(0); # parse rules
-
-# load tests and write mail
-%patterns = ();
-%anti_patterns = ();
-my $message = write_mail();
-
-my $mail = $sa->parse($message);
-my $msg = Mail::SpamAssassin::PerMsgStatus->new($sa, $mail);
-
-my $uris = join("\n", $msg->get_uri_list(), "");
-
-# run patterns and anti-patterns
-my $failures = 0;
-for my $pattern (keys %patterns) {
-  if (!ok($uris =~ /${pattern}/m)) {
-    warn "failure: did not find /$pattern/\n";
-    $failures++;
-  }
-}
-
-for my $anti_pattern (keys %anti_patterns) {
-  if (!ok($uris !~ /${anti_pattern}/m)) {
-    warn "failure: did find /$anti_pattern/\n";
-    $failures++;
-  }
-}
-
-if ($failures) {
-  print "URIs found:\n$uris";
-}
-
-# function to write test email
-sub write_mail {
-  my $message = <<'EOF';
-Message-ID: <clean.1010101@example.com>
-Date: Mon, 07 Oct 2002 09:00:00 +0000
-From: Sender <sender@example.com>
-MIME-Version: 1.0
-To: Recipient <recipient@example.com>
-Subject: this is a trivial message
-Content-Type: text/plain; charset="UTF-8"
-Content-Transfer-Encoding: 8bit
-
-EOF
-
-  # Characters that look like a fullstop
-  my @delims = split(//, "\x{002E}\x{3002}\x{FF0E}\x{FF61}\x{FE52}\x{2024}");
-  my $i = 0;
-
-  foreach my $delim_char (@delims) {
-    $i++;
-    my $delim = $delim_char; utf8::encode($delim);  # to UTF-8 octets
-    my $string = "http://utf$i" . $delim . "example" . $delim . "com";
-    my @patterns = ("^http://utf$i\\.example\\.com\$");
-    if ($string && @patterns) {
-      $message .= "$string\n";
-      for my $pattern (@patterns) {
-        if ($pattern =~ /^!(.*)/) {
-          $anti_patterns{$1} = 1;
-        }
-        else {
-          $patterns{$pattern} = 1;
-        }
-      }
-    }
-  }
-
-  return $message;
-}