lib/Mail/SpamAssassin/Plugin/ExtractText.pm - spamassassin - Git at Google

 # <@LICENSE>
 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
 # The ASF licenses this file to you under the Apache License, Version 2.0
 # (the "License"); you may not use this file except in compliance with
 # the License.  You may obtain a copy of the License at:
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # </@LICENSE>

 # Authors: Jonas Eckerman, Dave Wreski, Giovanni Bechis

 =head1 NAME

 ExtractText - extracts text from documenmts.

 =head1 SYNOPSIS

 loadplugin Mail::SpamAssassin::Plugin::ExtractText

 ifplugin Mail::SpamAssassin::Plugin::ExtractText

   extracttext_external  pdftotext  /usr/bin/pdftotext -nopgbrk -layout -enc UTF-8 {} -
   extracttext_use       pdftotext  .pdf application/pdf

   # http://docx2txt.sourceforge.net
   extracttext_external  docx2txt   /usr/bin/docx2txt {} -
   extracttext_use       docx2txt   .docx application/docx

   extracttext_external  antiword   /usr/bin/antiword -t -w 0 -m UTF-8.txt {}
   extracttext_use       antiword   .doc application/(?:vnd\.?)?ms-?word.*

   extracttext_external  unrtf      /usr/bin/unrtf --nopict {}
   extracttext_use       unrtf      .doc .rtf application/rtf text/rtf

   extracttext_external  odt2txt    /usr/bin/odt2txt --encoding=UTF-8 {}
   extracttext_use       odt2txt    .odt .ott application/.*?opendocument.*text
   extracttext_use       odt2txt    .sdw .stw application/(?:x-)?soffice application/(?:x-)?starwriter

   extracttext_external  tesseract  {OMP_THREAD_LIMIT=1} /usr/bin/tesseract -c page_separator= {} -
   extracttext_use       tesseract  .jpg .png .bmp .tif .tiff image/(?:jpeg|png|x-ms-bmp|tiff)

   # QR-code decoder
   extracttext_external  zbar       /usr/bin/zbarimg -q -D {}
   extracttext_use       zbar       .jpg .png .pdf image/(?:jpeg|png) application/pdf

   add_header   all          ExtractText-Flags _EXTRACTTEXTFLAGS_
   header       PDF_NO_TEXT  X-ExtractText-Flags =~ /\bpdftotext_NoText\b/
   describe     PDF_NO_TEXT  PDF without text
   score        PDF_NO_TEXT  0.001

   header       DOC_NO_TEXT  X-ExtractText-Flags =~ /\b(?:antiword|openxml|unrtf|odt2txt)_NoText\b/
   describe     DOC_NO_TEXT  Document without text
   score        DOC_NO_TEXT  0.001

   header       EXTRACTTEXT  exists:X-ExtractText-Flags
   describe     EXTRACTTEXT  Email processed by extracttext plugin
   score        EXTRACTTEXT  0.001

 endif

 =head1 DESCRIPTION

 This module uses external tools to extract text from message parts,
 and then sets the text as the rendered part. External tool must output
 plain text, not HTML or other non-textual result.

 How to extract text is completely configurable, and based on
 MIME part type and file name.

 =head1 CONFIGURATION

 All configuration lines in user_prefs files will be ignored.

 =over 4

 =item extracttext_maxparts (default: 10)

 Configure the maximum mime parts number to analyze, a value of 0 means all mime parts
 will be analyzed

 =item extracttext_timeout (default: 5 10)

 Configure the timeout in seconds of external tool checks, per attachment.

 Second argument speficies maximum total time for all checks.

 =back

 =head2 Tools

 =over

 =item extracttext_use

 Specifies what tool to use for what message parts.

 The general syntax is

 extracttext_use  C<name>  C<specifiers>

 =back

 =over

 =item name

 the internal name of a tool.

 =item specifiers

 File extension and regular expressions for file names and MIME
 types. The regular expressions are anchored to beginning and end.

 =back

 =head3 Examples

 	extracttext_use  antiword  .doc application/(?:vnd\.?)?ms-?word.*
 	extracttext_use  openxml   .docx .dotx .dotm application/(?:vnd\.?)openxml.*?word.*
 	extracttext_use  openxml   .doc .dot application/(?:vnd\.?)?ms-?word.*
 	extracttext_use  unrtf     .doc .rtf application/rtf text/rtf

 =over

 =item extracttext_external

 Defines an external tool.  The tool must read a document on standard input
 or from a file and write text to standard output.

 The special keyword "{}" will be substituted at runtime with the temporary
 filename to be scanned by the external tool.

 Environment variables can be defined with "{KEY=VALUE}", these strings will
 be removed from commandline.

 It is required that commandline used outputs result directly to STDOUT.

 The general syntax is

 extracttext_external C<name> C<command> C<parameters>

 =back

 =over

 =item name

 The internal name of this tool.

 =item command

 The full path to the external command to run.

 =item parameters

 Parameters for the external command. The temporary file name containing
 the document will be automatically added as last parameter.

 =back

 =head3 Examples

 	extracttext_external  antiword  /usr/bin/antiword -t -w 0 -m UTF-8.txt {} -
 	extracttext_external  unrtf     /usr/bin/unrtf --nopict {}
 	extracttext_external  odt2txt   /usr/bin/odt2txt --encoding=UTF-8 {}

 =head2 Metadata

 The plugin adds some pseudo headers to the message. These headers are seen by
 the bayes system, and can be used in normal SpamAssassin rules.

 The headers are also available as template tags as noted below.

 =head3 Example

 The fictional example headers below are based on a message containing this:

 =over

 =item 1
 A perfectly normal PDF.

 =item 2
 An OpenXML document with a word document inside.
 Neither Office document contains text.

 =back

 =head3 Headers

 =over

 =item X-ExtractText-Chars

 Tag: _EXTRACTTEXTCHARS_

 Contains a count of characters that were extracted.

 X-ExtractText-Chars: 10970

 =item X-ExtractText-Words

 Tag: _EXTRACTTEXTWORDS_

 Contains a count of "words" that were extracted.

 X-ExtractText-Chars: 1599

 =item X-ExtractText-Tools

 Tag: _EXTRACTTEXTTOOLS_

 Contains chains of tools used for extraction.

 X-ExtractText-Tools: pdftotext openxml_antiword

 =item X-ExtractText-Types

 Tag: _EXTRACTTEXTTYPES_

 Contains chains of MIME types for parts found during extraction.

 X-ExtractText-Types: application/pdf; application/vnd.openxmlformats-officedocument.wordprocessingml.document, application/ms-word

 =item X-ExtractText-Extensions

 Tag: _EXTRACTTEXTEXTENSIONS_

 Contains chains of canonicalized file extensions for parts
 found during extraction.

 X-ExtractText-Extensions: pdf docx

 =item X-ExtractText-Flags

 Tag: _EXTRACTTEXTFLAGS_

 Contains notes from the plugin.

 X-ExtractText-Flags: openxml_NoText

 =item X-ExtractText-Uris

 Tag: _EXTRACTTEXTURIS_

 Contains uris extracted from the plugin.

 X-ExtractText-Uris: https://spamassassin.apache.org

 =back

 =head3 Rules

 Example:

 	header    PDF_NO_TEXT  X-ExtractText-Flags =~ /\bpdftotext_Notext\b/
 	describe  PDF_NO_TEXT  PDF without text

 =cut

 package Mail::SpamAssassin::Plugin::ExtractText;

 use strict;
 use warnings;
 use re 'taint';

 my $VERSION = 0.001;

 use File::Basename;

 use Mail::SpamAssassin::Logger;
 use Mail::SpamAssassin::Plugin;
 use Mail::SpamAssassin::Util qw (compile_regexp untaint_var untaint_file_path
   proc_status_ok exit_status_str);

 our @ISA = qw(Mail::SpamAssassin::Plugin);

 sub new {
   my ($class, $mailsa) = @_;

   $class = ref($class) || $class;
   my $self = $class->SUPER::new($mailsa);
   bless ($self, $class);

   $self->{match} = [];
   $self->{tools} = {};
   $self->{magic} = 0;

   $self->register_method_priority('post_message_parse', -1);
   $self->set_config($mailsa->{conf});
   return $self;
 }

 sub set_config {
   my ($self, $conf) = @_;
   my @cmds;

   push(@cmds, {
     setting => 'extracttext_maxparts',
     is_admin => 1,
     default => 10,
     type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC,
   });

   push(@cmds, {
     setting => 'extracttext_timeout',
     is_admin => 1,
     default => 5,
     type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC,
     code => sub {
       my ($self, $key, $value, $line) = @_;
       unless (defined $value && $value !~ /^$/) {
         return $Mail::SpamAssassin::Conf::MISSING_REQUIRED_VALUE;
       }
       local ($1,$2);
       unless ($value =~ /^(\d+)(?:\s+(\d+))?$/) {
         return $Mail::SpamAssassin::Conf::INVALID_VALUE;
       }
       $self->{extracttext_timeout} = $1;
       $self->{extracttext_timeout_total} = $2;
     }
   });

   $conf->{parser}->register_commands(\@cmds);
 }

 sub parse_config {
   my ($self, $opts) = @_;

   # Ignore users's configuration lines
   return 0 if $opts->{user_config};

   if ($opts->{key} eq 'extracttext_use') {
     $self->inhibit_further_callbacks();
     # Temporary kludge to notify users. Double backslashes have zero benefit for this plugin config.
     if ($opts->{value} =~ s/\\\\/\\/g) {
       warn "extracttext: DOUBLE BACKSLASHES DEPRECATED, change config to single backslashes, autoconverted for backward compatibility: $opts->{key} $opts->{value}\n";
     }
     if ($opts->{value} =~ /(?:to|2)html\b/) {
       warn "extracttext: HTML tools are not supported, plain text output is required. Please remove: $opts->{key} $opts->{value}\n";
       return 1;
     }
     my @vals = split(/\s+/, $opts->{value});
     my $tool = lc(shift @vals);
     return 0 unless @vals;
     foreach my $what (@vals) {
       my $where;
       if (index($what, '/') >= 0) {
         $where = 'type';
       } else {
         $where = 'name';
         if ($what =~ /^\.[a-zA-Z0-9]+$/) {
           $what = ".*\\$what";
         }
       }
       my ($rec, $err) = compile_regexp('^(?i)'.$what.'$', 0);
       if (!$rec) {
         warn("invalid regexp '$what': $err\n");
         return 0;
       }
       push @{$self->{match}}, {where=>$where, what=>$rec, tool=>$tool};
       dbg('extracttext: use: %s %s %s', $tool, $where, $what);
     }
     return 1;
   }

   if ($opts->{key} eq 'extracttext_external') {
     $self->inhibit_further_callbacks();
     # Temporary kludge to notify users. Double backslashes have zero benefit for this plugin config.
     if ($opts->{value} =~ s/\\\\/\\/g) {
       warn "extracttext: DOUBLE BACKSLASHES DEPRECATED, change config to single backslashes, autoconverted for backward compatibility: $opts->{key} $opts->{value}\n";
     }
     if ($opts->{value} =~ /(?:to|2)html\b/) {
       warn "extracttext: HTML tools are not supported, plain text output is required. Please remove: $opts->{key} $opts->{value}\n";
       return 1;
     }
     my %env;
     while ($opts->{value} =~ s/\{(.+?)\}/ /g) {
       my ($k,$v) = split(/=/, $1, 2);
       $env{$k} = defined $v ? $v : '';
     }
     my @vals = split(/\s+/, $opts->{value});
     my $name = lc(shift @vals);
     return 0 unless @vals > 1;
     if ($self->{tools}->{$name}) {
       warn "extracttext: duplicate tool defined: $name\n";
       return 0;
     }
     #unless (-x $vals[0]) {
     #  warn "extracttext: missing tool: $name ($vals[0])\n";
     #  return 0;
     #}
     $self->{tools}->{$name} = {
       'name' => $name,
       'type' => 'external',
       'env' => \%env,
       'cmd' => \@vals,
     };
     dbg('extracttext: external: %s "%s"', $name, join('","', @vals));
     return 1;
   }

   return 0;
 }

 # Extract 'text' via running an external command.
 sub _extract_external {
   my ($self, $object, $tool) = @_;

   my ($errno, $pipe_errno, $tmp_file, $err_file, $pid);
   my $resp = '';
   my @cmd = @{$tool->{cmd}};

   Mail::SpamAssassin::PerMsgStatus::enter_helper_run_mode($self);

   # Set environment variables
   foreach (keys %{$tool->{env}}) {
     $ENV{$_} = $tool->{env}{$_};
   }

   my $timer = Mail::SpamAssassin::Timeout->new(
     { secs => $self->{main}->{conf}->{extracttext_timeout},
       deadline => $self->{'master_deadline'} });

   my $err = $timer->run_and_catch(sub {
     local $SIG{PIPE} = sub { die "__brokenpipe__ignore__\n" };

     ($tmp_file, my $tmp_fh) = Mail::SpamAssassin::Util::secure_tmpfile();
     $tmp_file  or die "failed to create a temporary file";
     print $tmp_fh ${$object->{data}};
     close($tmp_fh);

     ($err_file, my $err_fh) = Mail::SpamAssassin::Util::secure_tmpfile();
     $err_file  or die "failed to create a temporary file";
     close($err_fh);
     $err_file = untaint_file_path($err_file);

     foreach (@cmd) {
       # substitute "{}" with the temporary file name to pass to the external software
       s/\{\}/$tmp_file/;
       $_ = untaint_var($_);
     }

     $pid = Mail::SpamAssassin::Util::helper_app_pipe_open(*EXTRACT, undef, ">$err_file", @cmd);
     $pid or die "$!\n";

     # read+split avoids a Perl I/O bug (Bug 5985)
     my($inbuf, $nread);

     while ($nread = read(EXTRACT, $inbuf, 8192)) { $resp .= $inbuf }
     defined $nread  or die "error reading from pipe: $!";

     $errno = 0;
     close EXTRACT or $errno = $!;

     if (proc_status_ok($?, $errno)) {
       dbg("extracttext: [%s] (%s) finished successfully", $pid, $cmd[0]);
     } elsif (proc_status_ok($?, $errno, 0, 1, 4)) {  # sometimes it exits with error 1 or 4
       dbg("extracttext: [%s] (%s) finished: %s", $pid, $cmd[0], exit_status_str($?, $errno));
     } else {
       info("extracttext: [%s] (%s) error: %s", $pid, $cmd[0], exit_status_str($?, $errno));
     }
     # Save return status for later
     $pipe_errno = $?;
   });

   if (defined(fileno(*EXTRACT))) {  # still open
     if ($pid) {
       if (kill('TERM', $pid)) {
         dbg("extracttext: killed stale helper [$pid] ($cmd[0])");
       } else {
         dbg("extracttext: killing helper application [$pid] ($cmd[0]) failed: $!");
       }
     }
     $errno = 0;
     close EXTRACT or $errno = $!;
     proc_status_ok($?, $errno)
       or info("extracttext: [%s] (%s) error: %s", $pid, $cmd[0], exit_status_str($?, $errno));
   }

   Mail::SpamAssassin::PerMsgStatus::leave_helper_run_mode($self);
   unlink($tmp_file);
   # Read first line from STDERR
   my $err_resp = -s $err_file ?
     do { open(ERRF, $err_file); $_ = <ERRF>; close(ERRF); chomp; $_; } : '';
   unlink($err_file);

   if ($err_resp ne '') {
     dbg("extracttext: [$pid] ($cmd[0]) stderr output: $err_resp");
   }

   # If the output starts with the command that has been run it's
   # probably an error message
   if ($pipe_errno) {
     if ($err_resp =~ /\b(?:Usage:|No such file or directory)/) {
       warn "extracttext: error from $cmd[0], please verify configuration: $err_resp\n";
     }
     elsif ($err_resp =~ /^Syntax (?:Warning|Error): (?:May not be a PDF file|Couldn't find trailer dictionary)/) {
       # Ignore pdftotext
     }
     elsif ($err_resp =~ /^Error in (?:findFileFormatStream|fopenReadStream): (?:truncated file|file not found)/) {
       # Ignore tesseract
     }
     elsif ($err_resp =~ /^libpng error:/) {
       # Ignore tesseract
     }
     elsif ($err_resp =~ /^Corrupt JPEG data:/) {
       # Ignore tesseract
     }
     elsif ($err_resp =~ /^\S+ is not a Word Document/) {
       # Ignore antiword
     } elsif((($pipe_errno/256) eq 4) and ($cmd[0] =~ /zbarimg/)) {
       # Ignore zbarimg
     }
     elsif (!$resp) {
       warn "extracttext: error (".($pipe_errno/256).") from $cmd[0]: $err_resp\n";
     }
     return (0, $resp);
   }
   return (1, $resp);
 }

 sub _extract_object {
   my ($self, $object, $tool) = @_;
   my ($ok, $text);

   if ($tool->{type} eq 'external') {
     ($ok, $text) = $self->_extract_external($object, $tool);
   } else {
     warn "extracttext: bad tool type: $tool->{type}\n";
     return 0;
   }

   return 0 unless $ok;

   if ($text =~ /^[\s\r\n]*$/s) {
     $text = '';
   } else {
     # Remove not important html elements
     #$text =~ s/(?=<!DOCTYPE)([\s\S]*?)>//g;
     #$text =~ s/(?=<!--)([\s\S]*?)-->//g;
   }

   if ($text eq '') {
     dbg('extracttext: No text extracted');
   }

   $text = untaint_var($text);
   utf8::encode($text) if utf8::is_utf8($text);

   return (1, $text);
 }

 sub _get_extension {
   my ($self, $object) = @_;
   my $fext;
   if ($object->{name} && $object->{name} =~ /\.([^.\\\/]+)$/) {
     $fext = $1;
   }
   elsif ($object->{file} && $object->{file} =~ /\.([^.\\\/]+)$/) {
     $fext = $1;
   }
   return $fext ? ($fext) : ();
 }

 sub _extract {
   my ($self, $coll, $part, $type, $name, $data, $tool) = @_;
   my $object = {
     'data' => $data,
     'type' => $type,
     'name' => $name
   };
   my @fexts;
   my @types;

   my @tools = ($tool->{name});
   my ($ok, $text) = $self->_extract_object($object,$tool);

   # when url+text, script never returns to this point from _extract_object above
   #
   return 0 unless $ok;
   if ($text ne '' && would_log('dbg','extracttext') > 1) {
     dbg("extracttext: text extracted:\n$text");
   }

   push @{$coll->{text}}, $text;
   push @types, $type;
   push @fexts, $self->_get_extension($object);
   if ($text eq '') {
     push @{$coll->{flags}}, 'NoText';
     push @{$coll->{text}}, 'NoText';
   } else {
     if ($text =~ /<a(?:\s+[^>]+)?\s+href="([^">]*)"/) {
       push @{$coll->{flags}}, 'ActionURI';
       dbg("extracttext: ActionURI: $1");
       push @{$coll->{text}}, $text;
       push @{$coll->{uris}}, $1;
     } elsif($text =~ /QR-Code\:([^\s]*)/) {
       # zbarimg(1) prefixes the url with "QR-Code:" string
       my $qrurl = $1;
       push @{$coll->{flags}},'QR-Code';
       dbg("extracttext: QR-Code: $qrurl");
       push @{$coll->{text}}, $text;
       push @{$coll->{uris}}, $qrurl;
     }
     if ($text =~ /NoText/) {
       push @{$coll->{flags}},'NoText';
       dbg("extracttext: NoText");
       push @{$coll->{text}}, $text;
     }
     $coll->{chars} += length($text);

     # the following is safe (regarding clobbering the @_) since perl v5.11.0
     $coll->{words} += split(/\W+/s,$text) - 1;
     # $coll->{words} += scalar @{[split(/\W+/s,$text)]} - 1;  # old perl hack

     dbg("extracttext: rendering text for type $type with $tool->{name}");
     $part->set_rendered($text, $type);
   }

   if (@types) {
     push @{$coll->{types}}, join(', ', @types);
   }
   if (@fexts) {
     push @{$coll->{extensions}}, join('_', @fexts);
   }
   push @{$coll->{tools}}, join('_', @tools);
   return 1;
 }

 #
 # check attachment type and match with the right tool
 #
 sub _check_extract {
   my ($self, $coll, $checked, $part, $decoded, $data, $type, $name) = @_;
   my $ret = 0;
   return 0 unless (defined $type || defined $name);
   foreach my $match (@{$self->{match}}) {
     next unless $self->{tools}->{$match->{tool}};
     next if $checked->{$match->{tool}};

     if ($match->{where} eq 'name') {
       next unless (defined $name && $name =~ $match->{what});
     } elsif ($match->{where} eq 'type') {
       next unless (defined $type && $type =~ $match->{what});
     } else {
       next;
     }
     $checked->{$match->{tool}} = 1;
     # dbg("extracttext: coll: $coll, part: $part, type: $type, name: $name, data: $data, tool: $self->{tools}->{$match->{tool}}");
     if($self->_extract($coll,$part,$type,$name,$data,$self->{tools}->{$match->{tool}})) {
       $ret = 1;
     }
   }
   return $ret;
 }

 sub post_message_parse {
   my ($self, $opts) = @_;

   my $timer = $self->{main}->time_method("extracttext");

   my $msg = $opts->{'message'};
   $self->{'master_deadline'} = $msg->{'master_deadline'};
   my $starttime = time;

   my %collect = (
     'tools'		=> [],
     'types'		=> [],
     'extensions'	=> [],
     'flags'		=> [],
     'chars'		=> 0,
     'words'		=> 0,
     'text'		=> [],
     'uris'		=> [],
   );

   my $conf = $self->{main}->{conf};
   my $maxparts = $conf->{extracttext_maxparts};
   my $ttimeout = $conf->{extracttext_timeout_total} ||
     $conf->{extracttext_timeout} > 10 ? $conf->{extracttext_timeout} : 10;
   my $nparts = 0;
   foreach my $part ($msg->find_parts(qr/./, 1)) {
     next unless $part->is_leaf;
     if ($maxparts > 0 && ++$nparts > $maxparts) {
       dbg("extracttext: Skipping MIME parts exceeding the ${maxparts}th");
       last;
     }
     if (time - $starttime >= $ttimeout) {
       dbg("extracttext: Skipping MIME parts, total execution timeout exceeded");
       last;
     }
     my (undef,$rtd) = $part->rendered;
     next if defined $rtd;
     my %checked = ();
     my $dat = $part->decode();
     my $typ = $part->{type};
     my $nam = $part->{name};
     my $dec = 1;
     next if $self->_check_extract(\%collect,\%checked,$part,\$dec,\$dat,$typ,$nam);
   }

   return 1 unless @{$collect{tools}};

   my @uniq_tools = do { my %seen; grep { !$seen{$_}++ } @{$collect{tools}} };
   my @uniq_types = do { my %seen; grep { !$seen{$_}++ } @{$collect{types}} };
   my @uniq_ext   = do { my %seen; grep { !$seen{$_}++ } @{$collect{extensions}} };
   my @uniq_flags = do { my %seen; grep { !$seen{$_}++ } @{$collect{flags}} };
   my @uniq_uris = do { my %seen; grep { !$seen{$_}++ } @{$collect{uris}} };

   $msg->put_metadata('X-ExtractText-Words', $collect{words});
   $msg->put_metadata('X-ExtractText-Chars', $collect{chars});
   $msg->put_metadata('X-ExtractText-Tools', join(' ', @uniq_tools));
   $msg->put_metadata('X-ExtractText-Types', join(' ', @uniq_types));
   $msg->put_metadata('X-ExtractText-Extensions', join(' ', @uniq_ext));
   $msg->put_metadata('X-ExtractText-Flags', join(' ', @uniq_flags));
   $msg->put_metadata('X-ExtractText-Uris', join(' ', @uniq_uris));

   return 1;
 }

 sub parsed_metadata {
   my ($self, $opts) = @_;
   my $pms = $opts->{permsgstatus};
   my $msg = $pms->get_message();
   foreach my $tag (('Words','Chars','Tools','Types','Extensions','Flags','Uris')) {
     my $v = $msg->get_metadata("X-ExtractText-$tag");
     if (defined $v) {
       $pms->set_tag("ExtractText$tag", $v);
       dbg("extracttext: tag: $tag $v");
       if($tag eq 'Uris') {
         map { $pms->add_uri_detail_list($_) } split(/ /, $v);
       }
     }
   }
   return 1;
 }

 1;
	# <@LICENSE>
	# Licensed to the Apache Software Foundation (ASF) under one or more
	# contributor license agreements. See the NOTICE file distributed with
	# this work for additional information regarding copyright ownership.
	# The ASF licenses this file to you under the Apache License, Version 2.0
	# (the "License"); you may not use this file except in compliance with
	# the License. You may obtain a copy of the License at:
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	# </@LICENSE>

	# Authors: Jonas Eckerman, Dave Wreski, Giovanni Bechis

	=head1 NAME

	ExtractText - extracts text from documenmts.

	=head1 SYNOPSIS

	loadplugin Mail::SpamAssassin::Plugin::ExtractText

	ifplugin Mail::SpamAssassin::Plugin::ExtractText

	extracttext_external pdftotext /usr/bin/pdftotext -nopgbrk -layout -enc UTF-8 {} -
	extracttext_use pdftotext .pdf application/pdf

	# http://docx2txt.sourceforge.net
	extracttext_external docx2txt /usr/bin/docx2txt {} -
	extracttext_use docx2txt .docx application/docx

	extracttext_external antiword /usr/bin/antiword -t -w 0 -m UTF-8.txt {}
	extracttext_use antiword .doc application/(?:vnd\.?)?ms-?word.*

	extracttext_external unrtf /usr/bin/unrtf --nopict {}
	extracttext_use unrtf .doc .rtf application/rtf text/rtf

	extracttext_external odt2txt /usr/bin/odt2txt --encoding=UTF-8 {}
	extracttext_use odt2txt .odt .ott application/.?opendocument.text
	extracttext_use odt2txt .sdw .stw application/(?:x-)?soffice application/(?:x-)?starwriter

	extracttext_external tesseract {OMP_THREAD_LIMIT=1} /usr/bin/tesseract -c page_separator= {} -
	extracttext_use tesseract .jpg .png .bmp .tif .tiff image/(?:jpeg\|png\|x-ms-bmp\|tiff)

	# QR-code decoder
	extracttext_external zbar /usr/bin/zbarimg -q -D {}
	extracttext_use zbar .jpg .png .pdf image/(?:jpeg\|png) application/pdf

	add_header all ExtractText-Flags _EXTRACTTEXTFLAGS_
	header PDF_NO_TEXT X-ExtractText-Flags =~ /\bpdftotext_NoText\b/
	describe PDF_NO_TEXT PDF without text
	score PDF_NO_TEXT 0.001

	header DOC_NO_TEXT X-ExtractText-Flags =~ /\b(?:antiword\|openxml\|unrtf\|odt2txt)_NoText\b/
	describe DOC_NO_TEXT Document without text
	score DOC_NO_TEXT 0.001

	header EXTRACTTEXT exists:X-ExtractText-Flags
	describe EXTRACTTEXT Email processed by extracttext plugin
	score EXTRACTTEXT 0.001

	endif

	=head1 DESCRIPTION

	This module uses external tools to extract text from message parts,
	and then sets the text as the rendered part. External tool must output
	plain text, not HTML or other non-textual result.

	How to extract text is completely configurable, and based on
	MIME part type and file name.

	=head1 CONFIGURATION

	All configuration lines in user_prefs files will be ignored.

	=over 4

	=item extracttext_maxparts (default: 10)

	Configure the maximum mime parts number to analyze, a value of 0 means all mime parts
	will be analyzed

	=item extracttext_timeout (default: 5 10)

	Configure the timeout in seconds of external tool checks, per attachment.

	Second argument speficies maximum total time for all checks.

	=back

	=head2 Tools

	=over

	=item extracttext_use

	Specifies what tool to use for what message parts.

	The general syntax is

	extracttext_use C<name> C<specifiers>

	=back

	=over

	=item name

	the internal name of a tool.

	=item specifiers

	File extension and regular expressions for file names and MIME
	types. The regular expressions are anchored to beginning and end.

	=back

	=head3 Examples

	extracttext_use antiword .doc application/(?:vnd\.?)?ms-?word.*
	extracttext_use openxml .docx .dotx .dotm application/(?:vnd\.?)openxml.?word.
	extracttext_use openxml .doc .dot application/(?:vnd\.?)?ms-?word.*
	extracttext_use unrtf .doc .rtf application/rtf text/rtf

	=over

	=item extracttext_external

	Defines an external tool. The tool must read a document on standard input
	or from a file and write text to standard output.

	The special keyword "{}" will be substituted at runtime with the temporary
	filename to be scanned by the external tool.

	Environment variables can be defined with "{KEY=VALUE}", these strings will
	be removed from commandline.

	It is required that commandline used outputs result directly to STDOUT.

	The general syntax is

	extracttext_external C<name> C<command> C<parameters>

	=back

	=over

	=item name

	The internal name of this tool.

	=item command

	The full path to the external command to run.

	=item parameters

	Parameters for the external command. The temporary file name containing
	the document will be automatically added as last parameter.

	=back

	=head3 Examples

	extracttext_external antiword /usr/bin/antiword -t -w 0 -m UTF-8.txt {} -
	extracttext_external unrtf /usr/bin/unrtf --nopict {}
	extracttext_external odt2txt /usr/bin/odt2txt --encoding=UTF-8 {}

	=head2 Metadata

	The plugin adds some pseudo headers to the message. These headers are seen by
	the bayes system, and can be used in normal SpamAssassin rules.

	The headers are also available as template tags as noted below.

	=head3 Example

	The fictional example headers below are based on a message containing this:

	=over

	=item 1
	A perfectly normal PDF.

	=item 2
	An OpenXML document with a word document inside.
	Neither Office document contains text.

	=back

	=head3 Headers

	=over

	=item X-ExtractText-Chars

	Tag: _EXTRACTTEXTCHARS_

	Contains a count of characters that were extracted.

	X-ExtractText-Chars: 10970

	=item X-ExtractText-Words

	Tag: _EXTRACTTEXTWORDS_

	Contains a count of "words" that were extracted.

	X-ExtractText-Chars: 1599

	=item X-ExtractText-Tools

	Tag: _EXTRACTTEXTTOOLS_

	Contains chains of tools used for extraction.

	X-ExtractText-Tools: pdftotext openxml_antiword

	=item X-ExtractText-Types

	Tag: _EXTRACTTEXTTYPES_

	Contains chains of MIME types for parts found during extraction.

	X-ExtractText-Types: application/pdf; application/vnd.openxmlformats-officedocument.wordprocessingml.document, application/ms-word

	=item X-ExtractText-Extensions

	Tag: _EXTRACTTEXTEXTENSIONS_

	Contains chains of canonicalized file extensions for parts
	found during extraction.

	X-ExtractText-Extensions: pdf docx

	=item X-ExtractText-Flags

	Tag: _EXTRACTTEXTFLAGS_

	Contains notes from the plugin.

	X-ExtractText-Flags: openxml_NoText

	=item X-ExtractText-Uris

	Tag: _EXTRACTTEXTURIS_

	Contains uris extracted from the plugin.

	X-ExtractText-Uris: https://spamassassin.apache.org

	=back

	=head3 Rules

	Example:

	header PDF_NO_TEXT X-ExtractText-Flags =~ /\bpdftotext_Notext\b/
	describe PDF_NO_TEXT PDF without text

	=cut

	package Mail::SpamAssassin::Plugin::ExtractText;

	use strict;
	use warnings;
	use re 'taint';

	my $VERSION = 0.001;

	use File::Basename;

	use Mail::SpamAssassin::Logger;
	use Mail::SpamAssassin::Plugin;
	use Mail::SpamAssassin::Util qw (compile_regexp untaint_var untaint_file_path
	proc_status_ok exit_status_str);

	our @ISA = qw(Mail::SpamAssassin::Plugin);

	sub new {
	my ($class, $mailsa) = @_;

	$class = ref($class) \|\| $class;
	my $self = $class->SUPER::new($mailsa);
	bless ($self, $class);

	$self->{match} = [];
	$self->{tools} = {};
	$self->{magic} = 0;

	$self->register_method_priority('post_message_parse', -1);
	$self->set_config($mailsa->{conf});
	return $self;
	}

	sub set_config {
	my ($self, $conf) = @_;
	my @cmds;

	push(@cmds, {
	setting => 'extracttext_maxparts',
	is_admin => 1,
	default => 10,
	type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC,
	});

	push(@cmds, {
	setting => 'extracttext_timeout',
	is_admin => 1,
	default => 5,
	type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC,
	code => sub {
	my ($self, $key, $value, $line) = @_;
	unless (defined $value && $value !~ /^$/) {
	return $Mail::SpamAssassin::Conf::MISSING_REQUIRED_VALUE;
	}
	local ($1,$2);
	unless ($value =~ /^(\d+)(?:\s+(\d+))?$/) {
	return $Mail::SpamAssassin::Conf::INVALID_VALUE;
	}
	$self->{extracttext_timeout} = $1;
	$self->{extracttext_timeout_total} = $2;
	}
	});

	$conf->{parser}->register_commands(\@cmds);
	}

	sub parse_config {
	my ($self, $opts) = @_;

	# Ignore users's configuration lines
	return 0 if $opts->{user_config};

	if ($opts->{key} eq 'extracttext_use') {
	$self->inhibit_further_callbacks();
	# Temporary kludge to notify users. Double backslashes have zero benefit for this plugin config.
	if ($opts->{value} =~ s/\\\\/\\/g) {
	warn "extracttext: DOUBLE BACKSLASHES DEPRECATED, change config to single backslashes, autoconverted for backward compatibility: $opts->{key} $opts->{value}\n";
	}
	if ($opts->{value} =~ /(?:to\|2)html\b/) {
	warn "extracttext: HTML tools are not supported, plain text output is required. Please remove: $opts->{key} $opts->{value}\n";
	return 1;
	}
	my @vals = split(/\s+/, $opts->{value});
	my $tool = lc(shift @vals);
	return 0 unless @vals;
	foreach my $what (@vals) {
	my $where;
	if (index($what, '/') >= 0) {
	$where = 'type';
	} else {
	$where = 'name';
	if ($what =~ /^\.[a-zA-Z0-9]+$/) {
	$what = ".*\\$what";
	}
	}
	my ($rec, $err) = compile_regexp('^(?i)'.$what.'$', 0);
	if (!$rec) {
	warn("invalid regexp '$what': $err\n");
	return 0;
	}
	push @{$self->{match}}, {where=>$where, what=>$rec, tool=>$tool};
	dbg('extracttext: use: %s %s %s', $tool, $where, $what);
	}
	return 1;
	}

	if ($opts->{key} eq 'extracttext_external') {
	$self->inhibit_further_callbacks();
	# Temporary kludge to notify users. Double backslashes have zero benefit for this plugin config.
	if ($opts->{value} =~ s/\\\\/\\/g) {
	warn "extracttext: DOUBLE BACKSLASHES DEPRECATED, change config to single backslashes, autoconverted for backward compatibility: $opts->{key} $opts->{value}\n";
	}
	if ($opts->{value} =~ /(?:to\|2)html\b/) {
	warn "extracttext: HTML tools are not supported, plain text output is required. Please remove: $opts->{key} $opts->{value}\n";
	return 1;
	}
	my %env;
	while ($opts->{value} =~ s/\{(.+?)\}/ /g) {
	my ($k,$v) = split(/=/, $1, 2);
	$env{$k} = defined $v ? $v : '';
	}
	my @vals = split(/\s+/, $opts->{value});
	my $name = lc(shift @vals);
	return 0 unless @vals > 1;
	if ($self->{tools}->{$name}) {
	warn "extracttext: duplicate tool defined: $name\n";
	return 0;
	}
	#unless (-x $vals[0]) {
	# warn "extracttext: missing tool: $name ($vals[0])\n";
	# return 0;
	#}
	$self->{tools}->{$name} = {
	'name' => $name,
	'type' => 'external',
	'env' => \%env,
	'cmd' => \@vals,
	};
	dbg('extracttext: external: %s "%s"', $name, join('","', @vals));
	return 1;
	}

	return 0;
	}

	# Extract 'text' via running an external command.
	sub _extract_external {
	my ($self, $object, $tool) = @_;

	my ($errno, $pipe_errno, $tmp_file, $err_file, $pid);
	my $resp = '';
	my @cmd = @{$tool->{cmd}};

	Mail::SpamAssassin::PerMsgStatus::enter_helper_run_mode($self);

	# Set environment variables
	foreach (keys %{$tool->{env}}) {
	$ENV{$_} = $tool->{env}{$_};
	}

	my $timer = Mail::SpamAssassin::Timeout->new(
	{ secs => $self->{main}->{conf}->{extracttext_timeout},
	deadline => $self->{'master_deadline'} });

	my $err = $timer->run_and_catch(sub {
	local $SIG{PIPE} = sub { die "__brokenpipe__ignore__\n" };

	($tmp_file, my $tmp_fh) = Mail::SpamAssassin::Util::secure_tmpfile();
	$tmp_file or die "failed to create a temporary file";
	print $tmp_fh ${$object->{data}};
	close($tmp_fh);

	($err_file, my $err_fh) = Mail::SpamAssassin::Util::secure_tmpfile();
	$err_file or die "failed to create a temporary file";
	close($err_fh);
	$err_file = untaint_file_path($err_file);

	foreach (@cmd) {
	# substitute "{}" with the temporary file name to pass to the external software
	s/\{\}/$tmp_file/;
	$_ = untaint_var($_);
	}

	$pid = Mail::SpamAssassin::Util::helper_app_pipe_open(*EXTRACT, undef, ">$err_file", @cmd);
	$pid or die "$!\n";

	# read+split avoids a Perl I/O bug (Bug 5985)
	my($inbuf, $nread);

	while ($nread = read(EXTRACT, $inbuf, 8192)) { $resp .= $inbuf }
	defined $nread or die "error reading from pipe: $!";

	$errno = 0;
	close EXTRACT or $errno = $!;

	if (proc_status_ok($?, $errno)) {
	dbg("extracttext: [%s] (%s) finished successfully", $pid, $cmd[0]);
	} elsif (proc_status_ok($?, $errno, 0, 1, 4)) { # sometimes it exits with error 1 or 4
	dbg("extracttext: [%s] (%s) finished: %s", $pid, $cmd[0], exit_status_str($?, $errno));
	} else {
	info("extracttext: [%s] (%s) error: %s", $pid, $cmd[0], exit_status_str($?, $errno));
	}
	# Save return status for later
	$pipe_errno = $?;
	});

	if (defined(fileno(*EXTRACT))) { # still open
	if ($pid) {
	if (kill('TERM', $pid)) {
	dbg("extracttext: killed stale helper [$pid] ($cmd[0])");
	} else {
	dbg("extracttext: killing helper application [$pid] ($cmd[0]) failed: $!");
	}
	}
	$errno = 0;
	close EXTRACT or $errno = $!;
	proc_status_ok($?, $errno)
	or info("extracttext: [%s] (%s) error: %s", $pid, $cmd[0], exit_status_str($?, $errno));
	}

	Mail::SpamAssassin::PerMsgStatus::leave_helper_run_mode($self);
	unlink($tmp_file);
	# Read first line from STDERR
	my $err_resp = -s $err_file ?
	do { open(ERRF, $err_file); $_ = <ERRF>; close(ERRF); chomp; $_; } : '';
	unlink($err_file);

	if ($err_resp ne '') {
	dbg("extracttext: [$pid] ($cmd[0]) stderr output: $err_resp");
	}

	# If the output starts with the command that has been run it's
	# probably an error message
	if ($pipe_errno) {
	if ($err_resp =~ /\b(?:Usage:\|No such file or directory)/) {
	warn "extracttext: error from $cmd[0], please verify configuration: $err_resp\n";
	}
	elsif ($err_resp =~ /^Syntax (?:Warning\|Error): (?:May not be a PDF file\|Couldn't find trailer dictionary)/) {
	# Ignore pdftotext
	}
	elsif ($err_resp =~ /^Error in (?:findFileFormatStream\|fopenReadStream): (?:truncated file\|file not found)/) {
	# Ignore tesseract
	}
	elsif ($err_resp =~ /^libpng error:/) {
	# Ignore tesseract
	}
	elsif ($err_resp =~ /^Corrupt JPEG data:/) {
	# Ignore tesseract
	}
	elsif ($err_resp =~ /^\S+ is not a Word Document/) {
	# Ignore antiword
	} elsif((($pipe_errno/256) eq 4) and ($cmd[0] =~ /zbarimg/)) {
	# Ignore zbarimg
	}
	elsif (!$resp) {
	warn "extracttext: error (".($pipe_errno/256).") from $cmd[0]: $err_resp\n";
	}
	return (0, $resp);
	}
	return (1, $resp);
	}

	sub _extract_object {
	my ($self, $object, $tool) = @_;
	my ($ok, $text);

	if ($tool->{type} eq 'external') {
	($ok, $text) = $self->_extract_external($object, $tool);
	} else {
	warn "extracttext: bad tool type: $tool->{type}\n";
	return 0;
	}

	return 0 unless $ok;

	if ($text =~ /^[\s\r\n]*$/s) {
	$text = '';
	} else {
	# Remove not important html elements
	#$text =~ s/(?=<!DOCTYPE)([\s\S]*?)>//g;
	#$text =~ s/(?=<!--)([\s\S]*?)-->//g;
	}

	if ($text eq '') {
	dbg('extracttext: No text extracted');
	}

	$text = untaint_var($text);
	utf8::encode($text) if utf8::is_utf8($text);

	return (1, $text);
	}

	sub _get_extension {
	my ($self, $object) = @_;
	my $fext;
	if ($object->{name} && $object->{name} =~ /\.([^.\\\/]+)$/) {
	$fext = $1;
	}
	elsif ($object->{file} && $object->{file} =~ /\.([^.\\\/]+)$/) {
	$fext = $1;
	}
	return $fext ? ($fext) : ();
	}

	sub _extract {
	my ($self, $coll, $part, $type, $name, $data, $tool) = @_;
	my $object = {
	'data' => $data,
	'type' => $type,
	'name' => $name
	};
	my @fexts;
	my @types;

	my @tools = ($tool->{name});
	my ($ok, $text) = $self->_extract_object($object,$tool);

	# when url+text, script never returns to this point from _extract_object above
	#
	return 0 unless $ok;
	if ($text ne '' && would_log('dbg','extracttext') > 1) {
	dbg("extracttext: text extracted:\n$text");
	}

	push @{$coll->{text}}, $text;
	push @types, $type;
	push @fexts, $self->_get_extension($object);
	if ($text eq '') {
	push @{$coll->{flags}}, 'NoText';
	push @{$coll->{text}}, 'NoText';
	} else {
	if ($text =~ /<a(?:\s+[^>]+)?\s+href="([^">]*)"/) {
	push @{$coll->{flags}}, 'ActionURI';
	dbg("extracttext: ActionURI: $1");
	push @{$coll->{text}}, $text;
	push @{$coll->{uris}}, $1;
	} elsif($text =~ /QR-Code\:([^\s]*)/) {
	# zbarimg(1) prefixes the url with "QR-Code:" string
	my $qrurl = $1;
	push @{$coll->{flags}},'QR-Code';
	dbg("extracttext: QR-Code: $qrurl");
	push @{$coll->{text}}, $text;
	push @{$coll->{uris}}, $qrurl;
	}
	if ($text =~ /NoText/) {
	push @{$coll->{flags}},'NoText';
	dbg("extracttext: NoText");
	push @{$coll->{text}}, $text;
	}
	$coll->{chars} += length($text);

	# the following is safe (regarding clobbering the @_) since perl v5.11.0
	$coll->{words} += split(/\W+/s,$text) - 1;
	# $coll->{words} += scalar @{[split(/\W+/s,$text)]} - 1; # old perl hack

	dbg("extracttext: rendering text for type $type with $tool->{name}");
	$part->set_rendered($text, $type);
	}

	if (@types) {
	push @{$coll->{types}}, join(', ', @types);
	}
	if (@fexts) {
	push @{$coll->{extensions}}, join('_', @fexts);
	}
	push @{$coll->{tools}}, join('_', @tools);
	return 1;
	}

	#
	# check attachment type and match with the right tool
	#
	sub _check_extract {
	my ($self, $coll, $checked, $part, $decoded, $data, $type, $name) = @_;
	my $ret = 0;
	return 0 unless (defined $type \|\| defined $name);
	foreach my $match (@{$self->{match}}) {
	next unless $self->{tools}->{$match->{tool}};
	next if $checked->{$match->{tool}};

	if ($match->{where} eq 'name') {
	next unless (defined $name && $name =~ $match->{what});
	} elsif ($match->{where} eq 'type') {
	next unless (defined $type && $type =~ $match->{what});
	} else {
	next;
	}
	$checked->{$match->{tool}} = 1;
	# dbg("extracttext: coll: $coll, part: $part, type: $type, name: $name, data: $data, tool: $self->{tools}->{$match->{tool}}");
	if($self->_extract($coll,$part,$type,$name,$data,$self->{tools}->{$match->{tool}})) {
	$ret = 1;
	}
	}
	return $ret;
	}

	sub post_message_parse {
	my ($self, $opts) = @_;

	my $timer = $self->{main}->time_method("extracttext");

	my $msg = $opts->{'message'};
	$self->{'master_deadline'} = $msg->{'master_deadline'};
	my $starttime = time;

	my %collect = (
	'tools' => [],
	'types' => [],
	'extensions' => [],
	'flags' => [],
	'chars' => 0,
	'words' => 0,
	'text' => [],
	'uris' => [],
	);

	my $conf = $self->{main}->{conf};
	my $maxparts = $conf->{extracttext_maxparts};
	my $ttimeout = $conf->{extracttext_timeout_total} \|\|
	$conf->{extracttext_timeout} > 10 ? $conf->{extracttext_timeout} : 10;
	my $nparts = 0;
	foreach my $part ($msg->find_parts(qr/./, 1)) {
	next unless $part->is_leaf;
	if ($maxparts > 0 && ++$nparts > $maxparts) {
	dbg("extracttext: Skipping MIME parts exceeding the ${maxparts}th");
	last;
	}
	if (time - $starttime >= $ttimeout) {
	dbg("extracttext: Skipping MIME parts, total execution timeout exceeded");
	last;
	}
	my (undef,$rtd) = $part->rendered;
	next if defined $rtd;
	my %checked = ();
	my $dat = $part->decode();
	my $typ = $part->{type};
	my $nam = $part->{name};
	my $dec = 1;
	next if $self->_check_extract(\%collect,\%checked,$part,\$dec,\$dat,$typ,$nam);
	}

	return 1 unless @{$collect{tools}};

	my @uniq_tools = do { my %seen; grep { !$seen{$_}++ } @{$collect{tools}} };
	my @uniq_types = do { my %seen; grep { !$seen{$_}++ } @{$collect{types}} };
	my @uniq_ext = do { my %seen; grep { !$seen{$_}++ } @{$collect{extensions}} };
	my @uniq_flags = do { my %seen; grep { !$seen{$_}++ } @{$collect{flags}} };
	my @uniq_uris = do { my %seen; grep { !$seen{$_}++ } @{$collect{uris}} };

	$msg->put_metadata('X-ExtractText-Words', $collect{words});
	$msg->put_metadata('X-ExtractText-Chars', $collect{chars});
	$msg->put_metadata('X-ExtractText-Tools', join(' ', @uniq_tools));
	$msg->put_metadata('X-ExtractText-Types', join(' ', @uniq_types));
	$msg->put_metadata('X-ExtractText-Extensions', join(' ', @uniq_ext));
	$msg->put_metadata('X-ExtractText-Flags', join(' ', @uniq_flags));
	$msg->put_metadata('X-ExtractText-Uris', join(' ', @uniq_uris));

	return 1;
	}

	sub parsed_metadata {
	my ($self, $opts) = @_;
	my $pms = $opts->{permsgstatus};
	my $msg = $pms->get_message();
	foreach my $tag (('Words','Chars','Tools','Types','Extensions','Flags','Uris')) {
	my $v = $msg->get_metadata("X-ExtractText-$tag");
	if (defined $v) {
	$pms->set_tag("ExtractText$tag", $v);
	dbg("extracttext: tag: $tag $v");
	if($tag eq 'Uris') {
	map { $pms->add_uri_detail_list($_) } split(/ /, $v);
	}
	}
	}
	return 1;
	}

	1;