bug 8272 - sometimes allow decoding as utf-8 with errors, strip user:pass@ in IDN decoding of URL
git-svn-id: https://svn.apache.org/repos/asf/spamassassin/trunk@1919710 13f79535-47bb-0310-9956-ffa450edef68
diff --git a/lib/Mail/SpamAssassin/Message/Node.pm b/lib/Mail/SpamAssassin/Message/Node.pm
index 1ecc71d..274eb98 100644
--- a/lib/Mail/SpamAssassin/Message/Node.pm
+++ b/lib/Mail/SpamAssassin/Message/Node.pm
@@ -493,7 +493,7 @@
# Try first as UTF-8 ignoring declaring?
my $tried_utf8;
if ($cnt_8bits && !$insist_on_declared_charset) {
- if (eval { $rv = $enc_utf8->decode($_[0], 1|8); defined $rv }) {
+ if (eval { $rv = $enc_utf8->decode($_[0], Encode::FB_CROAK | Encode::LEAVE_SRC); defined $rv }) {
dbg("message: decoded as charset UTF-8, declared %s",
$charset_declared);
return $_[0] if !$return_decoded;
@@ -523,7 +523,7 @@
my $decoder = detect_utf16( $_[0] );
if (defined $decoder) {
- if (eval { $rv = $decoder->decode($_[0], 1|8); defined $rv }) {
+ if (eval { $rv = $decoder->decode($_[0], Encode::FB_CROAK | Encode::LEAVE_SRC); defined $rv }) {
dbg("message: decoded as charset %s, declared %s",
$decoder->name, $charset_declared);
utf8::encode($rv) if !$return_decoded;
@@ -574,12 +574,9 @@
dbg("message: failed decoding, no decoder for a declared charset %s",
$chset);
}
- elsif ($tried_utf8 && $chset eq 'UTF-8') {
- # was already tried initially, no point doing again
- }
else {
- my $check_flags = Encode::LEAVE_SRC; # 0x0008
- $check_flags |= Encode::FB_CROAK unless $insist_on_declared_charset;
+ my $check_flags = Encode::LEAVE_SRC;
+ $check_flags |= Encode::FB_CROAK unless $insist_on_declared_charset || ($tried_utf8 && $chset eq 'UTF-8');
my $err = '';
if (eval { $rv = $decoder->decode($_[0], $check_flags); defined $rv }) {
dbg("message: decoded as charset %s, declared %s",
@@ -616,7 +613,7 @@
# NBSP (A0) and SHY (AD) are at the same position in ISO-8859-* too
# consider also: AE (r), 80 Euro
my $err = '';
- eval { $rv = $enc_w1252->decode($_[0], 1|8) }; # FB_CROAK | LEAVE_SRC
+ eval { $rv = $enc_w1252->decode($_[0], Encode::FB_CROAK | Encode::LEAVE_SRC) };
if ($@) {
$err = $@; $err =~ s/\s+/ /gs; $err =~ s/(.*) at .*/$1/;
$err = " ($err)";
@@ -649,7 +646,7 @@
$charset_detected);
} else {
my $err = '';
- eval { $rv = $decoder->decode($_[0], 1|8) }; # FB_CROAK | LEAVE_SRC
+ eval { $rv = $decoder->decode($_[0], Encode::FB_CROAK | Encode::LEAVE_SRC) };
if ($@) {
$err = $@; $err =~ s/\s+/ /gs; $err =~ s/(.*) at .*/$1/;
$err = " ($err)";
@@ -760,7 +757,7 @@
} else { # non-ASCII, try UTF-8
my $rv;
# with some luck input can be interpreted as UTF-8
- if (eval { $rv = $enc_utf8->decode($text, 1|8); defined $rv }) {
+ if (eval { $rv = $enc_utf8->decode($text, Encode::FB_CROAK | Encode::LEAVE_SRC); defined $rv }) {
$text = $rv; # decoded to perl characters
$character_semantics = 1; # $text will be in characters
dbg("message: decoded as charset UTF-8, declared %s", $charset);
@@ -822,7 +819,7 @@
if ($text =~ tr/\x00-\x7F//c) { # non-ASCII, try UTF-8
my $rv;
# with some luck input can be interpreted as UTF-8
- if (eval { $rv = $enc_utf8->decode($text, 1|8); defined $rv }) {
+ if (eval { $rv = $enc_utf8->decode($text, Encode::FB_CROAK | Encode::LEAVE_SRC); defined $rv }) {
$text = $rv; # decoded to perl characters
dbg("message: decoded as charset UTF-8, declared %s", $charset);
} else {
diff --git a/lib/Mail/SpamAssassin/RegistryBoundaries.pm b/lib/Mail/SpamAssassin/RegistryBoundaries.pm
index eabb710..41481bf 100644
--- a/lib/Mail/SpamAssassin/RegistryBoundaries.pm
+++ b/lib/Mail/SpamAssassin/RegistryBoundaries.pm
@@ -109,6 +109,7 @@
$domain = lc $domain;
} else {
# convert to ascii, handles Unicode dot normalization also
+ $domain = _strip_user_pass($domain);
$domain = idn_to_ascii($domain);
}
@@ -222,6 +223,7 @@
$dom = lc $dom;
} else {
# convert to ascii, handles Unicode dot normalization also
+ $dom = _strip_user_pass($dom);
$dom = idn_to_ascii($dom);
}
@@ -254,14 +256,9 @@
return unless $uri =~ s/.*@//; # drop username or abort
} else {
$uri =~ s{^[a-z]+:/{0,2}}{}gs; # drop the protocol
- # strip path, CGI params, fragment. note: bug 4213 shows that "&" should
- # *not* be likewise stripped here -- it's permitted in hostnames by
- # some common MUAs!
- $uri =~ s{[/?#].*}{}gs;
- $uri =~ s{^[^/]*\@}{}gs; # drop username/passwd
- $uri =~ s{:\d*$}{}gs; # port, bug 4191: sometimes the # is missing
+ $uri = _strip_user_pass($uri);
}
-
+
# skip undecoded URIs if the encoded bits shouldn't be.
# we'll see the decoded version as well. see url_encode()
return if $uri =~ /\%(?:2[1-9a-f]|[3-6][0-9a-f]|7[0-9a-e])/;
@@ -283,5 +280,21 @@
return !wantarray ? $domain : ($domain, $host);
}
+sub _strip_user_pass { # returns its argument with path/query/fragment, user:pass@ and trailing :port removed
+ my $uri = shift;
+
+ if ($uri =~ s/^mailto://i) { # mailto: only the "mailto:" prefix is removed (case-insensitive)
+ return $uri # NOTE(review): ?subject= etc and the user@ part are NOT dropped in this branch -- confirm callers expect that
+ } else {
+ # strip path, CGI params, fragment. note: bug 4213 shows that "&" should
+ # *not* be likewise stripped here -- it's permitted in hostnames by
+ # some common MUAs!
+ $uri =~ s{[/?#].*}{}gs;
+ $uri =~ s{^[^/]*\@}{}gs; # drop username/passwd: greedy, removes everything up to the last "@"
+ $uri =~ s{:\d*$}{}gs; # trailing port; \d* also matches a bare ":" (bug 4191: the port number is sometimes missing)
+ return $uri;
+ }
+}
+
1;