lib/Mail/SpamAssassin/Locales.pm - spamassassin - Git at Google

 # <@LICENSE>
 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
 # The ASF licenses this file to you under the Apache License, Version 2.0
 # (the "License"); you may not use this file except in compliance with
 # the License.  You may obtain a copy of the License at:
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # </@LICENSE>

 package Mail::SpamAssassin::Locales;

 use strict;
 use warnings;
 # use bytes;
 use re 'taint';

 use vars qw{
   %charsets_for_locale
 };

 ###########################################################################

 # A mapping of known language codes to charsets frequently used with them.
 # Note that the ISO-8859, ISO-10646, CP125x and Windows charsets will
 # already have been permitted, so only "unusual" charsets should be
 # listed here.
 #
 # Language codes should be lowercase, charsets uppercase.
 #
 # A good listing is in /usr/share/config/charsets from KDE 2.2.1
 #
 %charsets_for_locale = (

   # Japanese.  Peter Evans writes: iso-2022-jp is the RFC-approved encoding
   # (RFC 1468, created by Jun Murai in 1993); RFC 2237 came from Microsoft.
   'ja' => join(' ', qw(
     EUCJP JISX020119760 JISX020819830 JISX020819900 JISX020819970
     JISX021219900 JISX021320001 JISX021320002 SHIFT_JIS SHIFTJIS
     ISO2022JP SJIS JIS7 JISX0201 JISX0208 JISX0212
   )),

   # Korean
   'ko' => join(' ', qw(EUCKR KSC56011987)),

   # Cyrillic: Andrew Vasilyev notes CP866 is common (bug 2278).  All six
   # locales share a single list; it names CP1251 twice, which is harmless.
   ( map { ( $_ => 'KOI8R KOI8U KOI8T ISOIR111 CP1251 GEORGIANPS CP1251 PT154 CP866' ) }
       qw(ru ka tg be uk bg) ),

   # Thai
   'th' => 'TIS620',

   # Chinese, simplified and traditional together.  Peter Evans writes: the
   # new government-mandated Chinese encoding is gb18030, while Chinese mail
   # is supposed to be iso-2022-cn (rfc 1922?).
   'zh' => join(' ', qw(
     GB1988 GB2312 GB231219800 GB18030 GBK BIG5HKSCS BIG5 EUCTW ISO2022CN
   )),

   # Traditional Chinese charsets only
   'zh.big5' => join(' ', qw(BIG5HKSCS BIG5 EUCTW)),

   # Simplified Chinese charsets only
   'zh.gb2312' => join(' ', qw(GB1988 GB2312 GB231219800 GB18030 GBK ISO2022CN)),
 );

 ###########################################################################

 # is_charset_ok_for_locales($cs, @locales)
 #
 # Returns 1 if the charset name $cs is acceptable for at least one of the
 # locales in @locales, and 0 otherwise.
 #
 # $cs is normalised before any comparison: it is uppercased, cut down to the
 # first entry of a colon-separated list, stripped of every character other
 # than A-Z and 0-9, and relieved of a leading "3D" (quoted-printable damage).
 # A fixed set of charsets is accepted regardless of locale.  Anything else
 # must appear in the %charsets_for_locale entry for one of the locales.
 #
 # An undefined locale, or 'C', is treated as 'en'.  An undefined or empty
 # $cs is never acceptable.
 sub is_charset_ok_for_locales {
   my ($cs, @locales) = @_;

   return 0 if !defined $cs;

   $cs = uc $cs;
   # Trim off multiple charsets, just use the 1st.  This has to happen before
   # the punctuation is stripped, otherwise the colon is already gone and
   # all the names run together.
   $cs =~ s/^([^:]+):.*$/$1/s;
   $cs =~ s/[^A-Z0-9]//g;
   $cs =~ s/^3D//;		# "=3D": broken by quoted-printable

   return 0 if $cs eq '';

   # always OK (the net speaks mostly roman charsets)
   return 1 if ($cs eq 'USASCII');
   return 1 if ($cs =~ /^ISO8859/);
   return 1 if ($cs =~ /^ISO10646/);
   return 1 if ($cs =~ /^UTF/);
   return 1 if ($cs =~ /^UCS/);
   return 1 if ($cs =~ /^CP125/);
   return 1 if ($cs =~ /^WINDOWS/);      # argh, Windows
   return 1 if ($cs eq 'IBM852');
   return 1 if ($cs =~ /^UNICODE11UTF[78]/);	# wtf? never heard of it
   return 1 if ($cs eq 'XUNKNOWN'); # added by sendmail when converting to 8bit
   return 1 if ($cs eq 'ISO');	# Magellan, sending as 'charset=iso 8859-15'. grr

   foreach my $locale (@locales) {
     my $loc = (!defined($locale) || $locale eq 'C') ? 'en' : $locale;

     # Prefer a table entry for the locale exactly as given (this is the only
     # way keys such as 'zh.big5' can be selected), then fall back to the
     # two-letter language prefix: zh_TW... => zh
     my $ok_for_loc = $charsets_for_locale{$loc};
     if (!defined $ok_for_loc) {
       $loc =~ s/^([a-z][a-z]).*$/$1/;
       $ok_for_loc = $charsets_for_locale{$loc};
     }
     next if (!defined $ok_for_loc);

     # The table value is a space-separated list; match a whole entry only.
     return 1 if ($ok_for_loc =~ /(?:^| )\Q${cs}\E(?:$| )/);
   }

   return 0;
 }

 1;
	# <@LICENSE>
	# Licensed to the Apache Software Foundation (ASF) under one or more
	# contributor license agreements. See the NOTICE file distributed with
	# this work for additional information regarding copyright ownership.
	# The ASF licenses this file to you under the Apache License, Version 2.0
	# (the "License"); you may not use this file except in compliance with
	# the License. You may obtain a copy of the License at:
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	# </@LICENSE>

	package Mail::SpamAssassin::Locales;

	use strict;
	use warnings;
	# use bytes;
	use re 'taint';

	use vars qw{
	%charsets_for_locale
	};

	###########################################################################

	# A mapping of known language codes to charsets frequently used with them.
	# Note that the ISO-8859, ISO-10646, CP125x and Windows charsets will
	# already have been permitted, so only "unusual" charsets should be
	# listed here.
	#
	# Language codes should be lowercase, charsets uppercase.
	#
	# A good listing is in /usr/share/config/charsets from KDE 2.2.1
	#
	%charsets_for_locale = do {

	  # Japanese.  Peter Evans writes: iso-2022-jp is the RFC-approved encoding
	  # (RFC 1468, created by Jun Murai in 1993); RFC 2237 came from Microsoft.
	  my @japanese = qw(
	    EUCJP JISX020119760 JISX020819830 JISX020819900 JISX020819970
	    JISX021219900 JISX021320001 JISX021320002 SHIFT_JIS SHIFTJIS
	    ISO2022JP SJIS JIS7 JISX0201 JISX0208 JISX0212
	  );

	  # Cyrillic: Andrew Vasilyev notes CP866 is common (bug 2278).  The list
	  # names CP1251 twice, which is harmless.
	  my @cyrillic = qw(
	    KOI8R KOI8U KOI8T ISOIR111 CP1251 GEORGIANPS CP1251 PT154 CP866
	  );

	  # Chinese.  Peter Evans writes: the new government-mandated Chinese
	  # encoding is gb18030, while Chinese mail is supposed to be
	  # iso-2022-cn (rfc 1922?).
	  my @simplified  = qw(GB1988 GB2312 GB231219800 GB18030 GBK);
	  my @traditional = qw(BIG5HKSCS BIG5 EUCTW);

	  (
	    'ja' => join(' ', @japanese),

	    # Korean
	    'ko' => 'EUCKR KSC56011987',

	    # every Cyrillic-script locale gets the same list
	    ( map { ( $_ => join(' ', @cyrillic) ) } qw(ru ka tg be uk bg) ),

	    # Thai
	    'th' => 'TIS620',

	    # Chinese (simplified and traditional)
	    'zh' => join(' ', @simplified, @traditional, 'ISO2022CN'),

	    # Chinese Traditional charsets only
	    'zh.big5' => join(' ', @traditional),

	    # Chinese Simplified charsets only
	    'zh.gb2312' => join(' ', @simplified, 'ISO2022CN'),
	  );
	};

	###########################################################################

	# is_charset_ok_for_locales($cs, @locales)
	#
	# Returns 1 if the charset name $cs is acceptable for at least one of the
	# locales in @locales, and 0 otherwise.
	#
	# $cs is normalised before any comparison: it is uppercased, cut down to the
	# first entry of a colon-separated list, stripped of every character other
	# than A-Z and 0-9, and relieved of a leading "3D" (quoted-printable damage).
	# A fixed set of charsets is accepted regardless of locale.  Anything else
	# must appear in the %charsets_for_locale entry for one of the locales.
	#
	# An undefined locale, or 'C', is treated as 'en'.  An undefined or empty
	# $cs is never acceptable.
	sub is_charset_ok_for_locales {
	  my ($cs, @locales) = @_;

	  return 0 if !defined $cs;

	  $cs = uc $cs;
	  # Trim off multiple charsets, just use the 1st.  This has to happen before
	  # the punctuation is stripped, otherwise the colon is already gone and
	  # all the names run together.
	  $cs =~ s/^([^:]+):.*$/$1/s;
	  $cs =~ s/[^A-Z0-9]//g;
	  $cs =~ s/^3D//; # "=3D": broken by quoted-printable

	  return 0 if $cs eq '';

	  # always OK (the net speaks mostly roman charsets)
	  return 1 if ($cs eq 'USASCII');
	  return 1 if ($cs =~ /^ISO8859/);
	  return 1 if ($cs =~ /^ISO10646/);
	  return 1 if ($cs =~ /^UTF/);
	  return 1 if ($cs =~ /^UCS/);
	  return 1 if ($cs =~ /^CP125/);
	  return 1 if ($cs =~ /^WINDOWS/); # argh, Windows
	  return 1 if ($cs eq 'IBM852');
	  return 1 if ($cs =~ /^UNICODE11UTF[78]/); # wtf? never heard of it
	  return 1 if ($cs eq 'XUNKNOWN'); # added by sendmail when converting to 8bit
	  return 1 if ($cs eq 'ISO'); # Magellan, sending as 'charset=iso 8859-15'. grr

	  foreach my $locale (@locales) {
	    my $loc = (!defined($locale) || $locale eq 'C') ? 'en' : $locale;

	    # Prefer a table entry for the locale exactly as given (this is the only
	    # way keys such as 'zh.big5' can be selected), then fall back to the
	    # two-letter language prefix: zh_TW... => zh
	    my $ok_for_loc = $charsets_for_locale{$loc};
	    if (!defined $ok_for_loc) {
	      $loc =~ s/^([a-z][a-z]).*$/$1/;
	      $ok_for_loc = $charsets_for_locale{$loc};
	    }
	    next if (!defined $ok_for_loc);

	    # The table value is a space-separated list; match a whole entry only.
	    return 1 if ($ok_for_loc =~ /(?:^| )\Q${cs}\E(?:$| )/);
	  }

	  return 0;
	}

	1;