lib/Mail/SpamAssassin/Plugin/BodyEval.pm - spamassassin - Git at Google

 # <@LICENSE>
 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
 # The ASF licenses this file to you under the Apache License, Version 2.0
 # (the "License"); you may not use this file except in compliance with
 # the License.  You may obtain a copy of the License at:
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # </@LICENSE>

 package Mail::SpamAssassin::Plugin::BodyEval;

 use Mail::SpamAssassin::Plugin;
 use Mail::SpamAssassin::Logger;
 use Mail::SpamAssassin::Constants qw(:sa);

 use strict;
 use warnings;
 use bytes;

 use vars qw(@ISA);
 @ISA = qw(Mail::SpamAssassin::Plugin);

 # constructor: register the eval rule
 sub new {
   my $class = shift;
   my $mailsaobject = shift;

   # some boilerplate...
   $class = ref($class) || $class;
   my $self = $class->SUPER::new($mailsaobject);
   bless ($self, $class);

   # the important bit!
   $self->register_eval_rule("multipart_alternative_difference");
   $self->register_eval_rule("multipart_alternative_difference_count");
   $self->register_eval_rule("check_blank_line_ratio");
   $self->register_eval_rule("tvd_vertical_words");

   return $self;
 }

 sub multipart_alternative_difference {
   my ($self, $pms, $fulltext, $min, $max) = @_;

   $self->_multipart_alternative_difference($pms) unless (exists $pms->{madiff});

   if (($min == 0 || $pms->{madiff} > $min) &&
       ($max eq "undef" || $pms->{madiff} <= $max)) {
       return 1;
   }
   return 0;
 }

 sub multipart_alternative_difference_count {
   my ($self, $pms, $fulltext, $ratio, $minhtml) = @_;
   $self->_multipart_alternative_difference($pms) unless (exists $pms->{madiff});
   return 0 unless $pms->{madiff_html} > $minhtml;
   return(($pms->{madiff_text} / $pms->{madiff_html}) > $ratio);
 }

 sub _multipart_alternative_difference {
   my ($self, $pms) = @_;
   $pms->{madiff} = 0;
   $pms->{madiff_html} = 0;
   $pms->{madiff_text} = 0;

   my $msg = $pms->{msg};

   # Find all multipart/alternative parts in the message
   my @ma = $msg->find_parts(qr@^multipart/alternative\b@i);

   # If there are no multipart/alternative sections, skip this test.
   return if (!@ma);

   # Figure out what the MIME content of the message looks like
   my @content = $msg->content_summary();

   # Exchange meeting requests come in as m/a text/html text/calendar,
   # which we want to ignore because of the high FP rate it would cause.
   #
   if (@content == 3 && $content[2] eq 'text/calendar' &&
   	$content[1] eq 'text/html' &&
   	$content[0] eq 'multipart/alternative') {
     return;
   }

   # Go through each of the multipart parts
   foreach my $part (@ma) {
     my %html = ();
     my %text = ();

     # limit our search to text-based parts
     my @txt = $part->find_parts(qr@^text\b@i);
     foreach my $text (@txt) {
       # we only care about the rendered version of the part
       my ($type, $rnd) = $text->rendered();
       next unless defined $type;

       # parse the rendered text into tokens.  assume they are whitespace
       # separated, and ignore anything that doesn't have a word-character
       # in it (0-9a-zA-Z_) since those are probably things like bullet
       # points, horizontal lines, etc.  this assumes that punctuation
       # in one part will be the same in other parts.
       #
       if ($type eq 'text/html') {
         foreach my $w (grep(/\w/, split(/\s+/, $rnd))) {
 	  #dbg("eval: HTML: $w");
           $html{$w}++;
         }

 	# If there are no words, mark if there's at least 1 image ...
 	if (keys %html == 0 && exists $pms->{html}{inside}{img}) {
 	  # Use "\n" as the mark since it can't ever occur normally
 	  $html{"\n"}=1;
 	}
       }
       else {
         foreach my $w (grep(/\w/, split(/\s+/, $rnd))) {
 	  #dbg("eval: TEXT: $w");
           $text{$w}++;
         }
       }
     }

     # How many HTML tokens do we have at the start?
     my $orig = keys %html;
     next if ($orig == 0);

     $pms->{madiff_html} = $orig;
     $pms->{madiff_text} = keys %text;
     dbg('eval: text words: ' . $pms->{madiff_text} . ', html words: ' . $pms->{madiff_html});

     # If the token appears at least as many times in the text part as
     # in the html part, remove it from the list of html tokens.
     while(my ($k,$v) = each %text) {
       delete $html{$k} if (exists $html{$k} && $html{$k}-$text{$k} < 1);
     }

     #map { dbg("eval: LEFT: $_") } keys %html;

     # In theory, the tokens should be the same in both text and html
     # parts, so there would be 0 tokens left in the html token list, for
     # a 0% difference rate.  Calculate it here, and record the difference
     # if it's been the highest so far in this message.
     my $diff = scalar(keys %html)/$orig*100;
     $pms->{madiff} = $diff if ($diff > $pms->{madiff});

     dbg("eval: " . sprintf "madiff: left: %d, orig: %d, max-difference: %0.2f%%", scalar(keys %html), $orig, $pms->{madiff});
   }

   return;
 }

 sub check_blank_line_ratio {
   my ($self, $pms, $fulltext, $min, $max, $minlines) = @_;

   if (!defined $minlines || $minlines < 1) {
     $minlines = 1;
   }

   if (! exists $pms->{blank_line_ratio}->{$minlines}) {
     $fulltext = $pms->get_decoded_body_text_array();
     my ($blank) = 0;
     if (scalar @{$fulltext} >= $minlines) {
       foreach my $line (@{$fulltext}) {
         next if ($line =~ /\S/);
         $blank++;
       }
       $pms->{blank_line_ratio}->{$minlines} = 100 * $blank / scalar @{$fulltext};
     }
     else {
       $pms->{blank_line_ratio}->{$minlines} = -1; # don't report if it's a blank message ...
     }
   }

   return (($min == 0 && $pms->{blank_line_ratio}->{$minlines} <= $max) ||
 	  ($pms->{blank_line_ratio}->{$minlines} > $min &&
 	   $pms->{blank_line_ratio}->{$minlines} <= $max));
 }

 sub tvd_vertical_words {
   my ($self, $pms, $text, $min, $max) = @_;

   # klugy
   $max = 101 if ($max >= 100);

   if (!defined $pms->{tvd_vertical_words}) {
     $pms->{tvd_vertical_words} = 0;

     foreach (@{$text}) {
       my $l = length $_;
       next unless ($l > 5);
       my $spaces = tr/ / /;
       my $nonspaces = $l - $spaces;
       my $pct;
       if ($spaces > $nonspaces || $nonspaces == 0) {
         $pct = 100;
       }
       else {
         $pct = int(100*$spaces/$nonspaces);
       }
       $pms->{tvd_vertical_words} = $pct if ($pct > $pms->{tvd_vertical_words});
     }
   }

   return 1 if ($pms->{tvd_vertical_words} >= $min && $pms->{tvd_vertical_words} < $max);
 }

 1;
	# <@LICENSE>
	# Licensed to the Apache Software Foundation (ASF) under one or more
	# contributor license agreements. See the NOTICE file distributed with
	# this work for additional information regarding copyright ownership.
	# The ASF licenses this file to you under the Apache License, Version 2.0
	# (the "License"); you may not use this file except in compliance with
	# the License. You may obtain a copy of the License at:
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	# </@LICENSE>

	package Mail::SpamAssassin::Plugin::BodyEval;

	use Mail::SpamAssassin::Plugin;
	use Mail::SpamAssassin::Logger;
	use Mail::SpamAssassin::Constants qw(:sa);

	use strict;
	use warnings;
	use bytes;

	use vars qw(@ISA);
	@ISA = qw(Mail::SpamAssassin::Plugin);

	# constructor: register the eval rule
	sub new {
	my $class = shift;
	my $mailsaobject = shift;

	# some boilerplate...
	$class = ref($class) \|\| $class;
	my $self = $class->SUPER::new($mailsaobject);
	bless ($self, $class);

	# the important bit!
	$self->register_eval_rule("multipart_alternative_difference");
	$self->register_eval_rule("multipart_alternative_difference_count");
	$self->register_eval_rule("check_blank_line_ratio");
	$self->register_eval_rule("tvd_vertical_words");

	return $self;
	}

	sub multipart_alternative_difference {
	my ($self, $pms, $fulltext, $min, $max) = @_;

	$self->_multipart_alternative_difference($pms) unless (exists $pms->{madiff});

	if (($min == 0 \|\| $pms->{madiff} > $min) &&
	($max eq "undef" \|\| $pms->{madiff} <= $max)) {
	return 1;
	}
	return 0;
	}

	sub multipart_alternative_difference_count {
	my ($self, $pms, $fulltext, $ratio, $minhtml) = @_;
	$self->_multipart_alternative_difference($pms) unless (exists $pms->{madiff});
	return 0 unless $pms->{madiff_html} > $minhtml;
	return(($pms->{madiff_text} / $pms->{madiff_html}) > $ratio);
	}

	sub _multipart_alternative_difference {
	my ($self, $pms) = @_;
	$pms->{madiff} = 0;
	$pms->{madiff_html} = 0;
	$pms->{madiff_text} = 0;

	my $msg = $pms->{msg};

	# Find all multipart/alternative parts in the message
	my @ma = $msg->find_parts(qr@^multipart/alternative\b@i);

	# If there are no multipart/alternative sections, skip this test.
	return if (!@ma);

	# Figure out what the MIME content of the message looks like
	my @content = $msg->content_summary();

	# Exchange meeting requests come in as m/a text/html text/calendar,
	# which we want to ignore because of the high FP rate it would cause.
	#
	if (@content == 3 && $content[2] eq 'text/calendar' &&
	$content[1] eq 'text/html' &&
	$content[0] eq 'multipart/alternative') {
	return;
	}

	# Go through each of the multipart parts
	foreach my $part (@ma) {
	my %html = ();
	my %text = ();

	# limit our search to text-based parts
	my @txt = $part->find_parts(qr@^text\b@i);
	foreach my $text (@txt) {
	# we only care about the rendered version of the part
	my ($type, $rnd) = $text->rendered();
	next unless defined $type;

	# parse the rendered text into tokens. assume they are whitespace
	# separated, and ignore anything that doesn't have a word-character
	# in it (0-9a-zA-Z_) since those are probably things like bullet
	# points, horizontal lines, etc. this assumes that punctuation
	# in one part will be the same in other parts.
	#
	if ($type eq 'text/html') {
	foreach my $w (grep(/\w/, split(/\s+/, $rnd))) {
	#dbg("eval: HTML: $w");
	$html{$w}++;
	}

	# If there are no words, mark if there's at least 1 image ...
	if (keys %html == 0 && exists $pms->{html}{inside}{img}) {
	# Use "\n" as the mark since it can't ever occur normally
	$html{"\n"}=1;
	}
	}
	else {
	foreach my $w (grep(/\w/, split(/\s+/, $rnd))) {
	#dbg("eval: TEXT: $w");
	$text{$w}++;
	}
	}
	}

	# How many HTML tokens do we have at the start?
	my $orig = keys %html;
	next if ($orig == 0);

	$pms->{madiff_html} = $orig;
	$pms->{madiff_text} = keys %text;
	dbg('eval: text words: ' . $pms->{madiff_text} . ', html words: ' . $pms->{madiff_html});

	# If the token appears at least as many times in the text part as
	# in the html part, remove it from the list of html tokens.
	while(my ($k,$v) = each %text) {
	delete $html{$k} if (exists $html{$k} && $html{$k}-$text{$k} < 1);
	}

	#map { dbg("eval: LEFT: $_") } keys %html;

	# In theory, the tokens should be the same in both text and html
	# parts, so there would be 0 tokens left in the html token list, for
	# a 0% difference rate. Calculate it here, and record the difference
	# if it's been the highest so far in this message.
	my $diff = scalar(keys %html)/$orig*100;
	$pms->{madiff} = $diff if ($diff > $pms->{madiff});

	dbg("eval: " . sprintf "madiff: left: %d, orig: %d, max-difference: %0.2f%%", scalar(keys %html), $orig, $pms->{madiff});
	}

	return;
	}

	sub check_blank_line_ratio {
	my ($self, $pms, $fulltext, $min, $max, $minlines) = @_;

	if (!defined $minlines \|\| $minlines < 1) {
	$minlines = 1;
	}

	if (! exists $pms->{blank_line_ratio}->{$minlines}) {
	$fulltext = $pms->get_decoded_body_text_array();
	my ($blank) = 0;
	if (scalar @{$fulltext} >= $minlines) {
	foreach my $line (@{$fulltext}) {
	next if ($line =~ /\S/);
	$blank++;
	}
	$pms->{blank_line_ratio}->{$minlines} = 100 * $blank / scalar @{$fulltext};
	}
	else {
	$pms->{blank_line_ratio}->{$minlines} = -1; # don't report if it's a blank message ...
	}
	}

	return (($min == 0 && $pms->{blank_line_ratio}->{$minlines} <= $max) \|\|
	($pms->{blank_line_ratio}->{$minlines} > $min &&
	$pms->{blank_line_ratio}->{$minlines} <= $max));
	}

	sub tvd_vertical_words {
	my ($self, $pms, $text, $min, $max) = @_;

	# klugy
	$max = 101 if ($max >= 100);

	if (!defined $pms->{tvd_vertical_words}) {
	$pms->{tvd_vertical_words} = 0;

	foreach (@{$text}) {
	my $l = length $_;
	next unless ($l > 5);
	my $spaces = tr/ / /;
	my $nonspaces = $l - $spaces;
	my $pct;
	if ($spaces > $nonspaces \|\| $nonspaces == 0) {
	$pct = 100;
	}
	else {
	$pct = int(100*$spaces/$nonspaces);
	}
	$pms->{tvd_vertical_words} = $pct if ($pct > $pms->{tvd_vertical_words});
	}
	}

	return 1 if ($pms->{tvd_vertical_words} >= $min && $pms->{tvd_vertical_words} < $max);
	}

	1;