# You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 

# 294 lines
# 11 KiB

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
import codecs
import sys
import traceback
# data from
# http://snoops.roy202.org/testerman/browser/trunk/plugins/codecs/gsm0338.py
# default GSM 03.38 -> unicode
# Keys are one-character strings holding the GSM code (0x00-0x7F); values are
# the unicode characters they stand for. 0x1B has no entry here: it is the
# escape prefix, which decode() handles separately through
# def_escape_decode_dict.
def_regular_decode_dict = {
    '\x00': u'\u0040', # COMMERCIAL AT
    '\x01': u'\u00A3', # POUND SIGN
    '\x02': u'\u0024', # DOLLAR SIGN
    '\x03': u'\u00A5', # YEN SIGN
    '\x04': u'\u00E8', # LATIN SMALL LETTER E WITH GRAVE
    '\x05': u'\u00E9', # LATIN SMALL LETTER E WITH ACUTE
    '\x06': u'\u00F9', # LATIN SMALL LETTER U WITH GRAVE
    '\x07': u'\u00EC', # LATIN SMALL LETTER I WITH GRAVE
    '\x08': u'\u00F2', # LATIN SMALL LETTER O WITH GRAVE
    # NOTE on 0x09 (a *capital* C with cedilla): the Unicode page suggests
    # this is a mistake, but it's still in the latest version of the spec and
    # our implementation has to be exact.
    '\x09': u'\u00C7', # LATIN CAPITAL LETTER C WITH CEDILLA
    '\x0A': u'\u000A', # LINE FEED
    '\x0B': u'\u00D8', # LATIN CAPITAL LETTER O WITH STROKE
    '\x0C': u'\u00F8', # LATIN SMALL LETTER O WITH STROKE
    '\x0D': u'\u000D', # CARRIAGE RETURN
    '\x0E': u'\u00C5', # LATIN CAPITAL LETTER A WITH RING ABOVE
    '\x0F': u'\u00E5', # LATIN SMALL LETTER A WITH RING ABOVE
    '\x10': u'\u0394', # GREEK CAPITAL LETTER DELTA
    '\x11': u'\u005F', # LOW LINE
    '\x12': u'\u03A6', # GREEK CAPITAL LETTER PHI
    '\x13': u'\u0393', # GREEK CAPITAL LETTER GAMMA
    '\x14': u'\u039B', # GREEK CAPITAL LETTER LAMDA
    '\x15': u'\u03A9', # GREEK CAPITAL LETTER OMEGA
    '\x16': u'\u03A0', # GREEK CAPITAL LETTER PI
    '\x17': u'\u03A8', # GREEK CAPITAL LETTER PSI
    '\x18': u'\u03A3', # GREEK CAPITAL LETTER SIGMA
    '\x19': u'\u0398', # GREEK CAPITAL LETTER THETA
    '\x1A': u'\u039E', # GREEK CAPITAL LETTER XI
    '\x1C': u'\u00C6', # LATIN CAPITAL LETTER AE
    '\x1D': u'\u00E6', # LATIN SMALL LETTER AE
    '\x1E': u'\u00DF', # LATIN SMALL LETTER SHARP S (German)
    '\x1F': u'\u00C9', # LATIN CAPITAL LETTER E WITH ACUTE
    '\x20': u'\u0020', # SPACE
    '\x21': u'\u0021', # EXCLAMATION MARK
    '\x22': u'\u0022', # QUOTATION MARK
    '\x23': u'\u0023', # NUMBER SIGN
    '\x24': u'\u00A4', # CURRENCY SIGN
    '\x25': u'\u0025', # PERCENT SIGN
    '\x26': u'\u0026', # AMPERSAND
    '\x27': u'\u0027', # APOSTROPHE
    '\x28': u'\u0028', # LEFT PARENTHESIS
    '\x29': u'\u0029', # RIGHT PARENTHESIS
    '\x2A': u'\u002A', # ASTERISK
    '\x2B': u'\u002B', # PLUS SIGN
    '\x2C': u'\u002C', # COMMA
    '\x2D': u'\u002D', # HYPHEN-MINUS
    '\x2E': u'\u002E', # FULL STOP
    '\x2F': u'\u002F', # SOLIDUS
    '\x30': u'\u0030', # DIGIT ZERO
    '\x31': u'\u0031', # DIGIT ONE
    '\x32': u'\u0032', # DIGIT TWO
    '\x33': u'\u0033', # DIGIT THREE
    '\x34': u'\u0034', # DIGIT FOUR
    '\x35': u'\u0035', # DIGIT FIVE
    '\x36': u'\u0036', # DIGIT SIX
    '\x37': u'\u0037', # DIGIT SEVEN
    '\x38': u'\u0038', # DIGIT EIGHT
    '\x39': u'\u0039', # DIGIT NINE
    '\x3A': u'\u003A', # COLON
    '\x3B': u'\u003B', # SEMICOLON
    '\x3C': u'\u003C', # LESS-THAN SIGN
    '\x3D': u'\u003D', # EQUALS SIGN
    '\x3E': u'\u003E', # GREATER-THAN SIGN
    '\x3F': u'\u003F', # QUESTION MARK
    '\x40': u'\u00A1', # INVERTED EXCLAMATION MARK
    '\x41': u'\u0041', # LATIN CAPITAL LETTER A
    '\x42': u'\u0042', # LATIN CAPITAL LETTER B
    '\x43': u'\u0043', # LATIN CAPITAL LETTER C
    '\x44': u'\u0044', # LATIN CAPITAL LETTER D
    '\x45': u'\u0045', # LATIN CAPITAL LETTER E
    '\x46': u'\u0046', # LATIN CAPITAL LETTER F
    '\x47': u'\u0047', # LATIN CAPITAL LETTER G
    '\x48': u'\u0048', # LATIN CAPITAL LETTER H
    '\x49': u'\u0049', # LATIN CAPITAL LETTER I
    '\x4A': u'\u004A', # LATIN CAPITAL LETTER J
    '\x4B': u'\u004B', # LATIN CAPITAL LETTER K
    '\x4C': u'\u004C', # LATIN CAPITAL LETTER L
    '\x4D': u'\u004D', # LATIN CAPITAL LETTER M
    '\x4E': u'\u004E', # LATIN CAPITAL LETTER N
    '\x4F': u'\u004F', # LATIN CAPITAL LETTER O
    '\x50': u'\u0050', # LATIN CAPITAL LETTER P
    '\x51': u'\u0051', # LATIN CAPITAL LETTER Q
    '\x52': u'\u0052', # LATIN CAPITAL LETTER R
    '\x53': u'\u0053', # LATIN CAPITAL LETTER S
    '\x54': u'\u0054', # LATIN CAPITAL LETTER T
    '\x55': u'\u0055', # LATIN CAPITAL LETTER U
    '\x56': u'\u0056', # LATIN CAPITAL LETTER V
    '\x57': u'\u0057', # LATIN CAPITAL LETTER W
    '\x58': u'\u0058', # LATIN CAPITAL LETTER X
    '\x59': u'\u0059', # LATIN CAPITAL LETTER Y
    '\x5A': u'\u005A', # LATIN CAPITAL LETTER Z
    '\x5B': u'\u00C4', # LATIN CAPITAL LETTER A WITH DIAERESIS
    '\x5C': u'\u00D6', # LATIN CAPITAL LETTER O WITH DIAERESIS
    '\x5D': u'\u00D1', # LATIN CAPITAL LETTER N WITH TILDE
    '\x5E': u'\u00DC', # LATIN CAPITAL LETTER U WITH DIAERESIS
    '\x5F': u'\u00A7', # SECTION SIGN
    '\x60': u'\u00BF', # INVERTED QUESTION MARK
    '\x61': u'\u0061', # LATIN SMALL LETTER A
    '\x62': u'\u0062', # LATIN SMALL LETTER B
    '\x63': u'\u0063', # LATIN SMALL LETTER C
    '\x64': u'\u0064', # LATIN SMALL LETTER D
    '\x65': u'\u0065', # LATIN SMALL LETTER E
    '\x66': u'\u0066', # LATIN SMALL LETTER F
    '\x67': u'\u0067', # LATIN SMALL LETTER G
    '\x68': u'\u0068', # LATIN SMALL LETTER H
    '\x69': u'\u0069', # LATIN SMALL LETTER I
    '\x6A': u'\u006A', # LATIN SMALL LETTER J
    '\x6B': u'\u006B', # LATIN SMALL LETTER K
    '\x6C': u'\u006C', # LATIN SMALL LETTER L
    '\x6D': u'\u006D', # LATIN SMALL LETTER M
    '\x6E': u'\u006E', # LATIN SMALL LETTER N
    '\x6F': u'\u006F', # LATIN SMALL LETTER O
    '\x70': u'\u0070', # LATIN SMALL LETTER P
    '\x71': u'\u0071', # LATIN SMALL LETTER Q
    '\x72': u'\u0072', # LATIN SMALL LETTER R
    '\x73': u'\u0073', # LATIN SMALL LETTER S
    '\x74': u'\u0074', # LATIN SMALL LETTER T
    '\x75': u'\u0075', # LATIN SMALL LETTER U
    '\x76': u'\u0076', # LATIN SMALL LETTER V
    '\x77': u'\u0077', # LATIN SMALL LETTER W
    '\x78': u'\u0078', # LATIN SMALL LETTER X
    '\x79': u'\u0079', # LATIN SMALL LETTER Y
    '\x7A': u'\u007A', # LATIN SMALL LETTER Z
    '\x7B': u'\u00E4', # LATIN SMALL LETTER A WITH DIAERESIS
    '\x7C': u'\u00F6', # LATIN SMALL LETTER O WITH DIAERESIS
    '\x7D': u'\u00F1', # LATIN SMALL LETTER N WITH TILDE
    '\x7E': u'\u00FC', # LATIN SMALL LETTER U WITH DIAERESIS
    '\x7F': u'\u00E0', # LATIN SMALL LETTER A WITH GRAVE
}
# default GSM 03.38 escaped characters -> unicode
# Keys are the GSM code that comes right after the 0x1B escape; values are the
# unicode character the two-code sequence stands for.
def_escape_decode_dict = {
    '\x0A': u'\x0c',    # U+000C FORM FEED
    '\x14': u'^',       # U+005E CIRCUMFLEX ACCENT
    '\x28': u'{',       # U+007B LEFT CURLY BRACKET
    '\x29': u'}',       # U+007D RIGHT CURLY BRACKET
    '\x2F': u'\\',      # U+005C REVERSE SOLIDUS
    '\x3C': u'[',       # U+005B LEFT SQUARE BRACKET
    '\x3D': u'~',       # U+007E TILDE
    '\x3E': u']',       # U+005D RIGHT SQUARE BRACKET
    '\x40': u'|',       # U+007C VERTICAL LINE
    '\x65': u'\u20AC',  # U+20AC EURO SIGN
}
# Replacement characters; anything not listed here falls back to a question
# mark. Used when it is not too important to ensure exact
# UTF-8 -> GSM -> UTF-8 equivalence, such as when humans read and write SMS.
# For USSD and other M2M applications it is important that the conversion is
# exact, so replacement should not be used there.
# Keys are unicode characters; values are the GSM code of a look-alike
# (written below as the Latin letter that GSM code represents).
def_replace_encode_dict = {
    u'\u00E7': '\x09',  # LATIN SMALL LETTER C WITH CEDILLA -> 0x09 (capital form)
    u'\u0391': 'A',     # GREEK CAPITAL LETTER ALPHA -> 0x41
    u'\u0392': 'B',     # GREEK CAPITAL LETTER BETA -> 0x42
    u'\u0395': 'E',     # GREEK CAPITAL LETTER EPSILON -> 0x45
    u'\u0397': 'H',     # GREEK CAPITAL LETTER ETA -> 0x48
    u'\u0399': 'I',     # GREEK CAPITAL LETTER IOTA -> 0x49
    u'\u039A': 'K',     # GREEK CAPITAL LETTER KAPPA -> 0x4B
    u'\u039C': 'M',     # GREEK CAPITAL LETTER MU -> 0x4D
    u'\u039D': 'N',     # GREEK CAPITAL LETTER NU -> 0x4E
    u'\u039F': 'O',     # GREEK CAPITAL LETTER OMICRON -> 0x4F
    u'\u03A1': 'P',     # GREEK CAPITAL LETTER RHO -> 0x50
    u'\u03A4': 'T',     # GREEK CAPITAL LETTER TAU -> 0x54
    u'\u03A7': 'X',     # GREEK CAPITAL LETTER CHI -> 0x58
    u'\u03A5': 'Y',     # GREEK CAPITAL LETTER UPSILON -> 0x59
    u'\u0396': 'Z',     # GREEK CAPITAL LETTER ZETA -> 0x5A
}
# GSM code used by the 'replace' error mode when no look-alike is listed
QUESTION_MARK = '?'

# unicode -> default GSM 03.38 (def_regular_decode_dict turned around)
def_regular_encode_dict = {
    uni: gsm for gsm, uni in def_regular_decode_dict.items()
}

# unicode -> default escaped GSM 03.38 characters
# (def_escape_decode_dict turned around; the 0x1B prefix is not part of the value)
def_escape_encode_dict = {
    uni: gsm for gsm, uni in def_escape_decode_dict.items()
}
def encode(input_, errors='strict'):
    """Encode text into the GSM 03.38 default alphabet.

    Characters found in ``def_regular_encode_dict`` become a single GSM code;
    characters found in ``def_escape_encode_dict`` become the escape code
    0x1B followed by their GSM code.

    :type input_: str
    :param errors: how to treat a character with no GSM representation:
        ``'strict'`` raises, ``'replace'`` substitutes the entry from
        ``def_replace_encode_dict`` (or a question mark when there is none),
        ``'ignore'`` drops the character.
    :return: tuple ``(encoded bytes, number of encoded bytes)``
    :raises UnicodeError: on an unencodable character in ``'strict'`` mode,
        or when such a character is met and ``errors`` is not one of the
        three supported modes.
    """
    result = []
    for c in input_:
        if c in def_regular_encode_dict:
            result.append(def_regular_encode_dict[c])
        elif c in def_escape_encode_dict:
            # Extension-table character: escape prefix, then its code.
            result.append('\x1b')
            result.append(def_escape_encode_dict[c])
        elif errors == 'strict':
            raise UnicodeError("Invalid GSM character")
        elif errors == 'replace':
            result.append(def_replace_encode_dict.get(c, QUESTION_MARK))
        elif errors == 'ignore':
            pass
        else:
            raise UnicodeError("Unknown error handling")
    # A text codec must hand bytes back: str.encode() rejects any other type,
    # and decode() indexes its input as bytes. Every table value is a single
    # character below 0x80, so latin-1 maps each one to exactly one byte.
    ret = ''.join(result).encode('latin-1')
    return ret, len(ret)
def decode(input_, errors='strict'):
    """Decode GSM 03.38 default-alphabet bytes into text.

    A 0x1B byte is the escape prefix: the byte after it is looked up in
    ``def_escape_decode_dict``. An escape followed by an unknown byte, or an
    escape that is the last byte of the input, decodes to U+00A0.
    Every other byte is looked up in ``def_regular_decode_dict``.

    :type input_: bytes
    :param errors: how to treat a byte that has no entry in the regular
        table: ``'strict'`` raises, ``'replace'`` yields ``'?'``,
        ``'ignore'`` drops the byte.
    :return: tuple ``(decoded str, length of the decoded str)``
    :raises UnicodeError: on an unassigned byte in ``'strict'`` mode, or
        when such a byte is met and ``errors`` is not one of the three
        supported modes.
    """
    result = []
    index = 0
    length = len(input_)
    while index < length:
        # Indexing bytes yields an int, while the tables are keyed by
        # one-character strings -- hence the chr() calls below.
        c = input_[index]
        index += 1
        if c == 0x1b:
            if index < length:
                c = input_[index]
                index += 1
                result.append(def_escape_decode_dict.get(chr(c), u'\xa0'))
            else:
                # Dangling escape at the very end of the input.
                result.append(u'\xa0')
            continue
        try:
            # Subscript (not .get()) so that an unassigned byte reaches the
            # error handling instead of putting None into the result.
            result.append(def_regular_decode_dict[chr(c)])
        except KeyError:
            # error handling: unassigned byte, must be > 0x7f
            if errors == 'strict':
                raise UnicodeError("Unrecognized GSM character")
            elif errors == 'replace':
                result.append('?')
            elif errors == 'ignore':
                pass
            else:
                raise UnicodeError("Unknown error handling")
    ret = u''.join(result)
    # NOTE(review): the codec protocol describes the second item as the
    # amount of input consumed; the output length is kept here so existing
    # callers see the same value as before -- confirm before changing.
    return ret, len(ret)
# encodings module API
def getregentry(encoding):
    """Codec search function for the ``codecs`` registry.

    :param encoding: name of the codec being looked up
    :return: a ``codecs.CodecInfo`` wired to this module's ``encode`` and
        ``decode`` when ``encoding`` is exactly ``'gsm0338'``; ``None`` for
        any other name, which tells the registry this function does not
        provide that codec.
    """
    if encoding != 'gsm0338':
        return None
    return codecs.CodecInfo(name='gsm0338',
                            encode=encode,
                            decode=decode)
# Codec registration: runs at import time and adds getregentry to the codecs
# registry's search functions, so the name 'gsm0338' can be resolved.
codecs.register(getregentry)
def is_gsm_text(text):
    """Returns True if ``text`` can be encoded as gsm text

    :param text: the text to check
    :return: ``True`` when every character of ``text`` is encodable in
        strict mode, ``False`` otherwise. Any unexpected failure (for
        example a ``text`` that cannot be iterated) is reported on stdout
        and also yields ``False``.
    """
    try:
        # Call the encoder directly instead of text.encode("gsm0338"):
        # str.encode() raises TypeError for an encoder result that is not
        # bytes, so this check must not depend on the encoder's return type.
        encode(text, 'strict')
    except UnicodeError:
        return False
    except Exception:
        # Best effort, as before: print the traceback and answer "no".
        # Narrowed from a bare except so KeyboardInterrupt and SystemExit
        # are no longer swallowed.
        traceback.print_exc(file=sys.stdout)
        return False
    return True