bug 8272 - sometimes allow decoding as UTF-8 with errors; strip user:pass@ from URLs before IDN conversion

git-svn-id: https://svn.apache.org/repos/asf/spamassassin/trunk@1919710 13f79535-47bb-0310-9956-ffa450edef68
diff --git a/lib/Mail/SpamAssassin/Message/Node.pm b/lib/Mail/SpamAssassin/Message/Node.pm
index 1ecc71d..274eb98 100644
--- a/lib/Mail/SpamAssassin/Message/Node.pm
+++ b/lib/Mail/SpamAssassin/Message/Node.pm
@@ -493,7 +493,7 @@
   # Try first as UTF-8 ignoring declaring?
   my $tried_utf8;
   if ($cnt_8bits && !$insist_on_declared_charset) {
-    if (eval { $rv = $enc_utf8->decode($_[0], 1|8); defined $rv }) {
+    if (eval { $rv = $enc_utf8->decode($_[0], Encode::FB_CROAK | Encode::LEAVE_SRC); defined $rv }) {
       dbg("message: decoded as charset UTF-8, declared %s",
         $charset_declared);
       return $_[0]  if !$return_decoded;
@@ -523,7 +523,7 @@
 
     my $decoder = detect_utf16( $_[0] );
     if (defined $decoder) {
-      if (eval { $rv = $decoder->decode($_[0], 1|8); defined $rv }) {
+      if (eval { $rv = $decoder->decode($_[0], Encode::FB_CROAK | Encode::LEAVE_SRC); defined $rv }) {
         dbg("message: decoded as charset %s, declared %s",
           $decoder->name, $charset_declared);
         utf8::encode($rv) if !$return_decoded;
@@ -574,12 +574,9 @@
       dbg("message: failed decoding, no decoder for a declared charset %s",
           $chset);
     }
-    elsif ($tried_utf8 && $chset eq 'UTF-8') {
-      # was already tried initially, no point doing again
-    }
     else {
-      my $check_flags = Encode::LEAVE_SRC;  # 0x0008
-      $check_flags |= Encode::FB_CROAK  unless $insist_on_declared_charset;
+      my $check_flags = Encode::LEAVE_SRC;
+      $check_flags |= Encode::FB_CROAK  unless $insist_on_declared_charset || ($tried_utf8 && $chset eq 'UTF-8');
       my $err = '';
       if (eval { $rv = $decoder->decode($_[0], $check_flags); defined $rv }) {
         dbg("message: decoded as charset %s, declared %s",
@@ -616,7 +613,7 @@
     # NBSP (A0) and SHY (AD) are at the same position in ISO-8859-* too
     # consider also: AE (r), 80 Euro
     my $err = '';
-    eval { $rv = $enc_w1252->decode($_[0], 1|8) };  # FB_CROAK | LEAVE_SRC
+    eval { $rv = $enc_w1252->decode($_[0], Encode::FB_CROAK | Encode::LEAVE_SRC) };
     if ($@) {
       $err = $@; $err =~ s/\s+/ /gs; $err =~ s/(.*) at .*/$1/;
       $err = " ($err)";
@@ -649,7 +646,7 @@
             $charset_detected);
       } else {
         my $err = '';
-        eval { $rv = $decoder->decode($_[0], 1|8) };  # FB_CROAK | LEAVE_SRC
+        eval { $rv = $decoder->decode($_[0], Encode::FB_CROAK | Encode::LEAVE_SRC) };
         if ($@) {
           $err = $@; $err =~ s/\s+/ /gs; $err =~ s/(.*) at .*/$1/;
           $err = " ($err)";
@@ -760,7 +757,7 @@
       } else { # non-ASCII, try UTF-8
         my $rv;
         # with some luck input can be interpreted as UTF-8
-        if (eval { $rv = $enc_utf8->decode($text, 1|8); defined $rv }) {
+        if (eval { $rv = $enc_utf8->decode($text, Encode::FB_CROAK | Encode::LEAVE_SRC); defined $rv }) {
           $text = $rv;  # decoded to perl characters
           $character_semantics = 1;  # $text will be in characters
           dbg("message: decoded as charset UTF-8, declared %s", $charset);
@@ -822,7 +819,7 @@
       if ($text =~ tr/\x00-\x7F//c) {  # non-ASCII, try UTF-8
         my $rv;
         # with some luck input can be interpreted as UTF-8
-        if (eval { $rv = $enc_utf8->decode($text, 1|8); defined $rv }) {
+        if (eval { $rv = $enc_utf8->decode($text, Encode::FB_CROAK | Encode::LEAVE_SRC); defined $rv }) {
           $text = $rv;  # decoded to perl characters
           dbg("message: decoded as charset UTF-8, declared %s", $charset);
         } else {
diff --git a/lib/Mail/SpamAssassin/RegistryBoundaries.pm b/lib/Mail/SpamAssassin/RegistryBoundaries.pm
index eabb710..41481bf 100644
--- a/lib/Mail/SpamAssassin/RegistryBoundaries.pm
+++ b/lib/Mail/SpamAssassin/RegistryBoundaries.pm
@@ -109,6 +109,7 @@
     $domain = lc $domain;
   } else {
     # convert to ascii, handles Unicode dot normalization also
+    $domain = _strip_user_pass($domain);  
     $domain = idn_to_ascii($domain);
   }
 
@@ -222,6 +223,7 @@
     $dom = lc $dom;
   } else {
     # convert to ascii, handles Unicode dot normalization also
+    $dom = _strip_user_pass($dom);  
     $dom = idn_to_ascii($dom);
   }
 
@@ -254,14 +256,9 @@
     return unless $uri =~ s/.*@//;	# drop username or abort
   } else {
     $uri =~ s{^[a-z]+:/{0,2}}{}gs;	# drop the protocol
-    # strip path, CGI params, fragment.  note: bug 4213 shows that "&" should
-    # *not* be likewise stripped here -- it's permitted in hostnames by
-    # some common MUAs!
-    $uri =~ s{[/?#].*}{}gs;              
-    $uri =~ s{^[^/]*\@}{}gs;		# drop username/passwd
-    $uri =~ s{:\d*$}{}gs;		# port, bug 4191: sometimes the # is missing
+    $uri = _strip_user_pass($uri);
   }
-
+  
   # skip undecoded URIs if the encoded bits shouldn't be.
   # we'll see the decoded version as well.  see url_encode()
   return if $uri =~ /\%(?:2[1-9a-f]|[3-6][0-9a-f]|7[0-9a-e])/;
@@ -283,5 +280,21 @@
   return !wantarray ? $domain : ($domain, $host);
 }
 
+# mailto: drop scheme and ?params; else drop path/query/fragment, user:pass@, :port
+sub _strip_user_pass {
+  my $uri = shift;
+  if ($uri =~ s/^mailto://i) { # handle mailto: specially
+    $uri =~ s{\?.*}{}s;                 # drop parameters ?subject= etc
+    return $uri;                        # local part (user@) is left in place
+  }
+  # strip path, CGI params, fragment.  note: bug 4213 shows that "&" should
+  # *not* be likewise stripped here -- it's permitted in hostnames by
+  # some common MUAs!
+  $uri =~ s{[/?#].*}{}gs;
+  $uri =~ s{^[^/]*\@}{}gs;              # drop username/passwd
+  $uri =~ s{:\d*$}{}gs;                 # port, bug 4191: sometimes the # is missing
+  return $uri;
+}
+
 1;