blob: 47236f11907ca007391a06b95df658b7b34130f9 [file] [log] [blame]
"""ASCII, Dammit
Stupid library to turn MS chars (like smart quotes) and ISO-Latin
chars into ASCII, dammit. Will do plain text approximations, or more
accurate HTML representations. Can also be jiggered to just fix the
smart quotes and leave the rest of ISO-Latin alone.
Sources:
http://www.cs.tut.fi/~jkorpela/latin1/all.html
http://www.webreference.com/html/reference/character/isolat1.html
1.0 Initial Release (2004-11-28)
The author hereby irrevocably places this work in the public domain.
To the extent that this statement does not divest the copyright,
the copyright holder hereby grants irrevocably to every recipient
all rights in this work otherwise reserved under copyright.
"""
__author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "$Revision: 1.3 $"
__date__ = "$Date: 2009/04/28 10:45:03 $"
__license__ = "Public domain"
import re
import string
import types
CHARS = { '\x80' : ('EUR', 'euro'),
'\x81' : ' ',
'\x82' : (',', 'sbquo'),
'\x83' : ('f', 'fnof'),
'\x84' : (',,', 'bdquo'),
'\x85' : ('...', 'hellip'),
'\x86' : ('+', 'dagger'),
'\x87' : ('++', 'Dagger'),
'\x88' : ('^', 'caret'),
'\x89' : '%',
'\x8A' : ('S', 'Scaron'),
'\x8B' : ('<', 'lt;'),
'\x8C' : ('OE', 'OElig'),
'\x8D' : '?',
'\x8E' : 'Z',
'\x8F' : '?',
'\x90' : '?',
'\x91' : ("'", 'lsquo'),
'\x92' : ("'", 'rsquo'),
'\x93' : ('"', 'ldquo'),
'\x94' : ('"', 'rdquo'),
'\x95' : ('*', 'bull'),
'\x96' : ('-', 'ndash'),
'\x97' : ('--', 'mdash'),
'\x98' : ('~', 'tilde'),
'\x99' : ('(TM)', 'trade'),
'\x9a' : ('s', 'scaron'),
'\x9b' : ('>', 'gt'),
'\x9c' : ('oe', 'oelig'),
'\x9d' : '?',
'\x9e' : 'z',
'\x9f' : ('Y', 'Yuml'),
'\xa0' : (' ', 'nbsp'),
'\xa1' : ('!', 'iexcl'),
'\xa2' : ('c', 'cent'),
'\xa3' : ('GBP', 'pound'),
'\xa4' : ('$', 'curren'), #This approximation is especially lame.
'\xa5' : ('YEN', 'yen'),
'\xa6' : ('|', 'brvbar'),
'\xa7' : ('S', 'sect'),
'\xa8' : ('..', 'uml'),
'\xa9' : ('', 'copy'),
'\xaa' : ('(th)', 'ordf'),
'\xab' : ('<<', 'laquo'),
'\xac' : ('!', 'not'),
'\xad' : (' ', 'shy'),
'\xae' : ('(R)', 'reg'),
'\xaf' : ('-', 'macr'),
'\xb0' : ('o', 'deg'),
'\xb1' : ('+-', 'plusmm'),
'\xb2' : ('2', 'sup2'),
'\xb3' : ('3', 'sup3'),
'\xb4' : ("'", 'acute'),
'\xb5' : ('u', 'micro'),
'\xb6' : ('P', 'para'),
'\xb7' : ('*', 'middot'),
'\xb8' : (',', 'cedil'),
'\xb9' : ('1', 'sup1'),
'\xba' : ('(th)', 'ordm'),
'\xbb' : ('>>', 'raquo'),
'\xbc' : ('1/4', 'frac14'),
'\xbd' : ('1/2', 'frac12'),
'\xbe' : ('3/4', 'frac34'),
'\xbf' : ('?', 'iquest'),
'\xc0' : ('A', "Agrave"),
'\xc1' : ('A', "Aacute"),
'\xc2' : ('A', "Acirc"),
'\xc3' : ('A', "Atilde"),
'\xc4' : ('A', "Auml"),
'\xc5' : ('A', "Aring"),
'\xc6' : ('AE', "Aelig"),
'\xc7' : ('C', "Ccedil"),
'\xc8' : ('E', "Egrave"),
'\xc9' : ('E', "Eacute"),
'\xca' : ('E', "Ecirc"),
'\xcb' : ('E', "Euml"),
'\xcc' : ('I', "Igrave"),
'\xcd' : ('I', "Iacute"),
'\xce' : ('I', "Icirc"),
'\xcf' : ('I', "Iuml"),
'\xd0' : ('D', "Eth"),
'\xd1' : ('N', "Ntilde"),
'\xd2' : ('O', "Ograve"),
'\xd3' : ('O', "Oacute"),
'\xd4' : ('O', "Ocirc"),
'\xd5' : ('O', "Otilde"),
'\xd6' : ('O', "Ouml"),
'\xd7' : ('*', "times"),
'\xd8' : ('O', "Oslash"),
'\xd9' : ('U', "Ugrave"),
'\xda' : ('U', "Uacute"),
'\xdb' : ('U', "Ucirc"),
'\xdc' : ('U', "Uuml"),
'\xdd' : ('Y', "Yacute"),
'\xde' : ('b', "Thorn"),
'\xdf' : ('B', "szlig"),
'\xe0' : ('a', "agrave"),
'\xe1' : ('a', "aacute"),
'\xe2' : ('a', "acirc"),
'\xe3' : ('a', "atilde"),
'\xe4' : ('a', "auml"),
'\xe5' : ('a', "aring"),
'\xe6' : ('ae', "aelig"),
'\xe7' : ('c', "ccedil"),
'\xe8' : ('e', "egrave"),
'\xe9' : ('e', "eacute"),
'\xea' : ('e', "ecirc"),
'\xeb' : ('e', "euml"),
'\xec' : ('i', "igrave"),
'\xed' : ('i', "iacute"),
'\xee' : ('i', "icirc"),
'\xef' : ('i', "iuml"),
'\xf0' : ('o', "eth"),
'\xf1' : ('n', "ntilde"),
'\xf2' : ('o', "ograve"),
'\xf3' : ('o', "oacute"),
'\xf4' : ('o', "ocirc"),
'\xf5' : ('o', "otilde"),
'\xf6' : ('o', "ouml"),
'\xf7' : ('/', "divide"),
'\xf8' : ('o', "oslash"),
'\xf9' : ('u', "ugrave"),
'\xfa' : ('u', "uacute"),
'\xfb' : ('u', "ucirc"),
'\xfc' : ('u', "uuml"),
'\xfd' : ('y', "yacute"),
'\xfe' : ('b', "thorn"),
'\xff' : ('y', "yuml"),
}
def _makeRE(limit):
"""Returns a regular expression object that will match special characters
up to the given limit."""
return re.compile("([\x80-\\x%s])" % limit, re.M)
ALL = _makeRE('ff')
ONLY_WINDOWS = _makeRE('9f')
def _replHTML(match):
"Replace the matched character with its HTML equivalent."
return _repl(match, 1)
def _repl(match, html=0):
"Replace the matched character with its HTML or ASCII equivalent."
g = match.group(0)
a = CHARS.get(g,g)
if type(a) == types.TupleType:
a = a[html]
if html:
a = '&' + a + ';'
return a
def _dammit(t, html=0, fixWindowsOnly=0):
"Turns ISO-Latin-1 into an ASCII representation, dammit."
r = ALL
if fixWindowsOnly:
r = ONLY_WINDOWS
m = _repl
if html:
m = _replHTML
return re.sub(r, m, t)
def asciiDammit(t, fixWindowsOnly=0):
"Turns ISO-Latin-1 into a plain ASCII approximation, dammit."
return _dammit(t, 0, fixWindowsOnly)
def htmlDammit(t, fixWindowsOnly=0):
"Turns ISO-Latin-1 into plain ASCII with HTML codes, dammit."
return _dammit(t, 1, fixWindowsOnly=fixWindowsOnly)
def demoronise(t):
"""Helper method named in honor of the original smart quotes
remover, The Demoroniser:
http://www.fourmilab.ch/webtools/demoroniser/"""
return asciiDammit(t, 1)
if __name__ == '__main__':
french = '\x93Sacr\xe9 bleu!\x93'
print "First we mangle some French."
print asciiDammit(french)
print htmlDammit(french)
print
print "And now we fix the MS-quotes but leave the French alone."
print demoronise(french)
print htmlDammit(french, 1)