lib/Mail/SpamAssassin/Plugin/BodyEval.pm - spamassassin - Git at Google

 # <@LICENSE>
 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
 # The ASF licenses this file to you under the Apache License, Version 2.0
 # (the "License"); you may not use this file except in compliance with
 # the License.  You may obtain a copy of the License at:
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # </@LICENSE>

 package Mail::SpamAssassin::Plugin::BodyEval;

 use Mail::SpamAssassin::Plugin;
 use Mail::SpamAssassin::Logger;
 use Mail::SpamAssassin::Constants qw(:sa);

 use strict;
 use warnings;
 use bytes;
 use re 'taint';

 use vars qw(@ISA);
 @ISA = qw(Mail::SpamAssassin::Plugin);

 # constructor: register the eval rule
 sub new {
   my $class = shift;
   my $mailsaobject = shift;

   # some boilerplate...
   $class = ref($class) || $class;
   my $self = $class->SUPER::new($mailsaobject);
   bless ($self, $class);

   # the important bit!
   $self->register_eval_rule("multipart_alternative_difference");
   $self->register_eval_rule("multipart_alternative_difference_count");
   $self->register_eval_rule("check_blank_line_ratio");
   $self->register_eval_rule("tvd_vertical_words");
   $self->register_eval_rule("check_stock_info");
   $self->register_eval_rule("check_body_length");

   return $self;
 }

 sub multipart_alternative_difference {
   my ($self, $pms, $fulltext, $min, $max) = @_;

   $self->_multipart_alternative_difference($pms) unless (exists $pms->{madiff});

   if (($min == 0 || $pms->{madiff} > $min) &&
       ($max eq "undef" || $pms->{madiff} <= $max)) {
       return 1;
   }
   return 0;
 }

 sub multipart_alternative_difference_count {
   my ($self, $pms, $fulltext, $ratio, $minhtml) = @_;
   $self->_multipart_alternative_difference($pms) unless (exists $pms->{madiff});
   return 0 unless $pms->{madiff_html} > $minhtml;
   return(($pms->{madiff_text} / $pms->{madiff_html}) > $ratio);
 }

 sub _multipart_alternative_difference {
   my ($self, $pms) = @_;
   $pms->{madiff} = 0;
   $pms->{madiff_html} = 0;
   $pms->{madiff_text} = 0;

   my $msg = $pms->{msg};

   # Find all multipart/alternative parts in the message
   my @ma = $msg->find_parts(qr@^multipart/alternative\b@i);

   # If there are no multipart/alternative sections, skip this test.
   return if (!@ma);

   # Figure out what the MIME content of the message looks like
   my @content = $msg->content_summary();

   # Exchange meeting requests come in as m/a text/html text/calendar,
   # which we want to ignore because of the high FP rate it would cause.
   #
   if (@content == 3 && $content[2] eq 'text/calendar' &&
   	$content[1] eq 'text/html' &&
   	$content[0] eq 'multipart/alternative') {
     return;
   }

   # Go through each of the multipart parts
   foreach my $part (@ma) {
     my %html;
     my %text;

     # limit our search to text-based parts
     my @txt = $part->find_parts(qr@^text\b@i);
     foreach my $text (@txt) {
       # we only care about the rendered version of the part
       my ($type, $rnd) = $text->rendered();
       next unless defined $type;

       # parse the rendered text into tokens.  assume they are whitespace
       # separated, and ignore anything that doesn't have a word-character
       # in it (0-9a-zA-Z_) since those are probably things like bullet
       # points, horizontal lines, etc.  this assumes that punctuation
       # in one part will be the same in other parts.
       #
       if ($type eq 'text/html') {
         foreach my $w (grep(/\w/, split(/\s+/, $rnd))) {
 	  #dbg("eval: HTML: $w");
           $html{$w}++;
         }

 	# If there are no words, mark if there's at least 1 image ...
 	if (!%html && exists $pms->{html}{inside}{img}) {
 	  # Use "\n" as the mark since it can't ever occur normally
 	  $html{"\n"}=1;
 	}
       }
       else {
         foreach my $w (grep(/\w/, split(/\s+/, $rnd))) {
 	  #dbg("eval: TEXT: $w");
           $text{$w}++;
         }
       }
     }

     # How many HTML tokens do we have at the start?
     my $orig = keys %html;
     next if ($orig == 0);

     $pms->{madiff_html} = $orig;
     $pms->{madiff_text} = keys %text;
     dbg('eval: text words: ' . $pms->{madiff_text} . ', html words: ' . $pms->{madiff_html});

     # If the token appears at least as many times in the text part as
     # in the html part, remove it from the list of html tokens.
     while(my ($k,$v) = each %text) {
       delete $html{$k} if (exists $html{$k} && $html{$k}-$text{$k} < 1);
     }

     #map { dbg("eval: LEFT: $_") } keys %html;

     # In theory, the tokens should be the same in both text and html
     # parts, so there would be 0 tokens left in the html token list, for
     # a 0% difference rate.  Calculate it here, and record the difference
     # if it's been the highest so far in this message.
     my $diff = scalar(keys %html)/$orig*100;
     $pms->{madiff} = $diff if ($diff > $pms->{madiff});

     dbg("eval: " . sprintf "madiff: left: %d, orig: %d, max-difference: %0.2f%%", scalar(keys %html), $orig, $pms->{madiff});
   }

   return;
 }

 sub check_blank_line_ratio {
   my ($self, $pms, $fulltext, $min, $max, $minlines) = @_;

   if (!defined $minlines || $minlines < 1) {
     $minlines = 1;
   }

   my $blank_line_ratio_ref = $pms->{blank_line_ratio};

   if (! exists $blank_line_ratio_ref->{$minlines}) {
     $fulltext = $pms->get_decoded_body_text_array();

     my $blank = 0;
     my $nlines = 0;
     foreach my $chunk (@$fulltext) {
       foreach (split(/^/m, $chunk, -1)) {
         $nlines++;
         $blank++  if !/\S/;
       }
     }

     # report -1 if it's a blank message ...
     $blank_line_ratio_ref->{$minlines} =
       $nlines < $minlines ? -1 : 100 * $blank / $nlines;
   }

   return (($min == 0 && $blank_line_ratio_ref->{$minlines} <= $max) ||
 	  ($blank_line_ratio_ref->{$minlines} > $min &&
 	   $blank_line_ratio_ref->{$minlines} <= $max));
 }

 sub tvd_vertical_words {
   my ($self, $pms, $text, $min, $max) = @_;

   # klugy
   $max = 101 if ($max >= 100);

   if (!defined $pms->{tvd_vertical_words}) {
     $pms->{tvd_vertical_words} = -1;

     foreach (@{$text}) {
       my $l = length $_;
       next unless ($l > 5);
       my $spaces = tr/ / /;
       my $nonspaces = $l - $spaces;
       my $pct;
       if ($spaces > $nonspaces || $nonspaces == 0) {
         $pct = 100;
       }
       else {
         $pct = int(100*$spaces/$nonspaces);
       }
       $pms->{tvd_vertical_words} = $pct if ($pct > $pms->{tvd_vertical_words});
     }
   }

   return 1 if ($pms->{tvd_vertical_words} >= $min && $pms->{tvd_vertical_words} < $max);
 }

 sub check_stock_info {
   my ($self, $pms, $fulltext, $min) = @_;

   $self->_check_stock_info($pms) unless (exists $pms->{stock_info});

   if ($min == 0 || $pms->{stock_info} >= $min) {
       return 1;
   }
   return 0;
 }

 sub _check_stock_info {
   my ($self, $pms) = @_;
   $pms->{stock_info} = 0;

   # Find all multipart/alternative parts in the message
   my @parts = $pms->{msg}->find_parts(qr@^text/plain$@i);
   return if (!@parts);

   # Go through each of the multipart parts
   my %hits;
   my $part = $parts[0];
   my ($type, $rnd) = $part->rendered();
   return unless $type;

   # bug 5644,5717: avoid pathological cases where a regexp takes massive amount
   # of time by applying the regexp to limited-size text chunks, one at a time

   foreach my $rnd_chunk (
     Mail::SpamAssassin::Message::split_into_array_of_short_paragraphs($rnd))
   {
     foreach ( $rnd_chunk =~ /^\s*([^:\s][^:\n]{2,29})\s*:\s*\S/mg ) {
       my $str = lc $_;
       $str =~ tr/a-z//cd;
       #$str =~ s/([a-z])0([a-z])/$1o$2/g;

       if ($str =~ /(
         ^trad(?:e|ing)date|
         company(?:name)?|
         s\w?(?:t\w?o\w?c\w?k|y\w?m(?:\w?b\w?o\w?l)?)|
         t(?:arget|icker)|
         (?:opening|current)p(?:rice)?|
         p(?:rojected|osition)|
         expectations|
         weeks?high|
         marketperformance|
         (?:year|week|month|day|price)(?:target|estimates?)|
         sector|
         r(?:ecommendation|ating)
       )$/x) {
         $hits{$1}++;
         dbg("eval: stock info hit: $1");
       }
     }
   }

   $pms->{stock_info} = scalar keys %hits;
   dbg("eval: stock info total: ".$pms->{stock_info});

   return;
 }

 sub check_body_length {
   my ($self, $pms, undef, $min) = @_;

   my $body_length = $pms->{msg}->{pristine_body_length};
   dbg("eval: body_length - %s - check for min of %s", $body_length, $min);

   return (defined $body_length && $body_length <= $min) ? 1 : 0;
 }

 # ---------------------------------------------------------------------------

 # capability checks for "if can()":
 #
 sub has_check_body_length { 1 }

 1;
	# <@LICENSE>
	# Licensed to the Apache Software Foundation (ASF) under one or more
	# contributor license agreements. See the NOTICE file distributed with
	# this work for additional information regarding copyright ownership.
	# The ASF licenses this file to you under the Apache License, Version 2.0
	# (the "License"); you may not use this file except in compliance with
	# the License. You may obtain a copy of the License at:
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	# </@LICENSE>

	package Mail::SpamAssassin::Plugin::BodyEval;

	use Mail::SpamAssassin::Plugin;
	use Mail::SpamAssassin::Logger;
	use Mail::SpamAssassin::Constants qw(:sa);

	use strict;
	use warnings;
	use bytes;
	use re 'taint';

	use vars qw(@ISA);
	@ISA = qw(Mail::SpamAssassin::Plugin);

	# constructor: register the eval rule
	sub new {
	my $class = shift;
	my $mailsaobject = shift;

	# some boilerplate...
	$class = ref($class) \|\| $class;
	my $self = $class->SUPER::new($mailsaobject);
	bless ($self, $class);

	# the important bit!
	$self->register_eval_rule("multipart_alternative_difference");
	$self->register_eval_rule("multipart_alternative_difference_count");
	$self->register_eval_rule("check_blank_line_ratio");
	$self->register_eval_rule("tvd_vertical_words");
	$self->register_eval_rule("check_stock_info");
	$self->register_eval_rule("check_body_length");

	return $self;
	}

	sub multipart_alternative_difference {
	my ($self, $pms, $fulltext, $min, $max) = @_;

	$self->_multipart_alternative_difference($pms) unless (exists $pms->{madiff});

	if (($min == 0 \|\| $pms->{madiff} > $min) &&
	($max eq "undef" \|\| $pms->{madiff} <= $max)) {
	return 1;
	}
	return 0;
	}

	sub multipart_alternative_difference_count {
	my ($self, $pms, $fulltext, $ratio, $minhtml) = @_;
	$self->_multipart_alternative_difference($pms) unless (exists $pms->{madiff});
	return 0 unless $pms->{madiff_html} > $minhtml;
	return(($pms->{madiff_text} / $pms->{madiff_html}) > $ratio);
	}

	sub _multipart_alternative_difference {
	my ($self, $pms) = @_;
	$pms->{madiff} = 0;
	$pms->{madiff_html} = 0;
	$pms->{madiff_text} = 0;

	my $msg = $pms->{msg};

	# Find all multipart/alternative parts in the message
	my @ma = $msg->find_parts(qr@^multipart/alternative\b@i);

	# If there are no multipart/alternative sections, skip this test.
	return if (!@ma);

	# Figure out what the MIME content of the message looks like
	my @content = $msg->content_summary();

	# Exchange meeting requests come in as m/a text/html text/calendar,
	# which we want to ignore because of the high FP rate it would cause.
	#
	if (@content == 3 && $content[2] eq 'text/calendar' &&
	$content[1] eq 'text/html' &&
	$content[0] eq 'multipart/alternative') {
	return;
	}

	# Go through each of the multipart parts
	foreach my $part (@ma) {
	my %html;
	my %text;

	# limit our search to text-based parts
	my @txt = $part->find_parts(qr@^text\b@i);
	foreach my $text (@txt) {
	# we only care about the rendered version of the part
	my ($type, $rnd) = $text->rendered();
	next unless defined $type;

	# parse the rendered text into tokens. assume they are whitespace
	# separated, and ignore anything that doesn't have a word-character
	# in it (0-9a-zA-Z_) since those are probably things like bullet
	# points, horizontal lines, etc. this assumes that punctuation
	# in one part will be the same in other parts.
	#
	if ($type eq 'text/html') {
	foreach my $w (grep(/\w/, split(/\s+/, $rnd))) {
	#dbg("eval: HTML: $w");
	$html{$w}++;
	}

	# If there are no words, mark if there's at least 1 image ...
	if (!%html && exists $pms->{html}{inside}{img}) {
	# Use "\n" as the mark since it can't ever occur normally
	$html{"\n"}=1;
	}
	}
	else {
	foreach my $w (grep(/\w/, split(/\s+/, $rnd))) {
	#dbg("eval: TEXT: $w");
	$text{$w}++;
	}
	}
	}

	# How many HTML tokens do we have at the start?
	my $orig = keys %html;
	next if ($orig == 0);

	$pms->{madiff_html} = $orig;
	$pms->{madiff_text} = keys %text;
	dbg('eval: text words: ' . $pms->{madiff_text} . ', html words: ' . $pms->{madiff_html});

	# If the token appears at least as many times in the text part as
	# in the html part, remove it from the list of html tokens.
	while(my ($k,$v) = each %text) {
	delete $html{$k} if (exists $html{$k} && $html{$k}-$text{$k} < 1);
	}

	#map { dbg("eval: LEFT: $_") } keys %html;

	# In theory, the tokens should be the same in both text and html
	# parts, so there would be 0 tokens left in the html token list, for
	# a 0% difference rate. Calculate it here, and record the difference
	# if it's been the highest so far in this message.
	my $diff = scalar(keys %html)/$orig*100;
	$pms->{madiff} = $diff if ($diff > $pms->{madiff});

	dbg("eval: " . sprintf "madiff: left: %d, orig: %d, max-difference: %0.2f%%", scalar(keys %html), $orig, $pms->{madiff});
	}

	return;
	}

	sub check_blank_line_ratio {
	my ($self, $pms, $fulltext, $min, $max, $minlines) = @_;

	if (!defined $minlines \|\| $minlines < 1) {
	$minlines = 1;
	}

	my $blank_line_ratio_ref = $pms->{blank_line_ratio};

	if (! exists $blank_line_ratio_ref->{$minlines}) {
	$fulltext = $pms->get_decoded_body_text_array();

	my $blank = 0;
	my $nlines = 0;
	foreach my $chunk (@$fulltext) {
	foreach (split(/^/m, $chunk, -1)) {
	$nlines++;
	$blank++ if !/\S/;
	}
	}

	# report -1 if it's a blank message ...
	$blank_line_ratio_ref->{$minlines} =
	$nlines < $minlines ? -1 : 100 * $blank / $nlines;
	}

	return (($min == 0 && $blank_line_ratio_ref->{$minlines} <= $max) \|\|
	($blank_line_ratio_ref->{$minlines} > $min &&
	$blank_line_ratio_ref->{$minlines} <= $max));
	}

	sub tvd_vertical_words {
	my ($self, $pms, $text, $min, $max) = @_;

	# klugy
	$max = 101 if ($max >= 100);

	if (!defined $pms->{tvd_vertical_words}) {
	$pms->{tvd_vertical_words} = -1;

	foreach (@{$text}) {
	my $l = length $_;
	next unless ($l > 5);
	my $spaces = tr/ / /;
	my $nonspaces = $l - $spaces;
	my $pct;
	if ($spaces > $nonspaces \|\| $nonspaces == 0) {
	$pct = 100;
	}
	else {
	$pct = int(100*$spaces/$nonspaces);
	}
	$pms->{tvd_vertical_words} = $pct if ($pct > $pms->{tvd_vertical_words});
	}
	}

	return 1 if ($pms->{tvd_vertical_words} >= $min && $pms->{tvd_vertical_words} < $max);
	}

	sub check_stock_info {
	my ($self, $pms, $fulltext, $min) = @_;

	$self->_check_stock_info($pms) unless (exists $pms->{stock_info});

	if ($min == 0 \|\| $pms->{stock_info} >= $min) {
	return 1;
	}
	return 0;
	}

	sub _check_stock_info {
	my ($self, $pms) = @_;
	$pms->{stock_info} = 0;

	# Find all multipart/alternative parts in the message
	my @parts = $pms->{msg}->find_parts(qr@^text/plain$@i);
	return if (!@parts);

	# Go through each of the multipart parts
	my %hits;
	my $part = $parts[0];
	my ($type, $rnd) = $part->rendered();
	return unless $type;

	# bug 5644,5717: avoid pathological cases where a regexp takes massive amount
	# of time by applying the regexp to limited-size text chunks, one at a time

	foreach my $rnd_chunk (
	Mail::SpamAssassin::Message::split_into_array_of_short_paragraphs($rnd))
	{
	foreach ( $rnd_chunk =~ /^\s([^:\s][^:\n]{2,29})\s:\s*\S/mg ) {
	my $str = lc $_;
	$str =~ tr/a-z//cd;
	#$str =~ s/([a-z])0([a-z])/$1o$2/g;

	if ($str =~ /(
	^trad(?:e\|ing)date\|
	company(?:name)?\|
	s\w?(?:t\w?o\w?c\w?k\|y\w?m(?:\w?b\w?o\w?l)?)\|
	t(?:arget\|icker)\|
	(?:opening\|current)p(?:rice)?\|
	p(?:rojected\|osition)\|
	expectations\|
	weeks?high\|
	marketperformance\|
	(?:year\|week\|month\|day\|price)(?:target\|estimates?)\|
	sector\|
	r(?:ecommendation\|ating)
	)$/x) {
	$hits{$1}++;
	dbg("eval: stock info hit: $1");
	}
	}
	}

	$pms->{stock_info} = scalar keys %hits;
	dbg("eval: stock info total: ".$pms->{stock_info});

	return;
	}

	sub check_body_length {
	my ($self, $pms, undef, $min) = @_;

	my $body_length = $pms->{msg}->{pristine_body_length};
	dbg("eval: body_length - %s - check for min of %s", $body_length, $min);

	return (defined $body_length && $body_length <= $min) ? 1 : 0;
	}

	# ---------------------------------------------------------------------------

	# capability checks for "if can()":
	#
	sub has_check_body_length { 1 }

	1;