import codecs
import re
import struct
import logging
# Module-level logger, named after this module.
logger = logging.getLogger( __name__ )
# The standard library offers no API for enumerating the text codecs it
# supports, so the names below are transcribed from the Python 3.7 manual.
# Entries are grouped by family, one family (or sub-range) per row.
CODECS = [
    'ascii',
    'big5', 'big5hkscs',
    'cp037', 'cp273', 'cp424', 'cp437', 'cp500',
    'cp720', 'cp737', 'cp775',
    'cp850', 'cp852', 'cp855', 'cp856', 'cp857', 'cp858',
    'cp860', 'cp861', 'cp862', 'cp863', 'cp864', 'cp865', 'cp866', 'cp869',
    'cp874', 'cp875',
    'cp932', 'cp949', 'cp950',
    'cp1006', 'cp1026', 'cp1125', 'cp1140',
    'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254',
    'cp1255', 'cp1256', 'cp1257', 'cp1258',
    'euc_jp', 'euc_jis_2004', 'euc_jisx0213', 'euc_kr',
    'gb2312', 'gbk', 'gb18030',
    'hz',
    'iso2022_jp', 'iso2022_jp_1', 'iso2022_jp_2', 'iso2022_jp_2004',
    'iso2022_jp_3', 'iso2022_jp_ext', 'iso2022_kr',
    'latin_1',
    'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6',
    'iso8859_7', 'iso8859_8', 'iso8859_9', 'iso8859_10', 'iso8859_11',
    'iso8859_13', 'iso8859_14', 'iso8859_15', 'iso8859_16',
    'johab',
    'koi8_r', 'koi8_t', 'koi8_u',
    'kz1048',
    'mac_cyrillic', 'mac_greek', 'mac_iceland',
    'mac_latin2', 'mac_roman', 'mac_turkish',
    'ptcp154',
    'shift_jis', 'shift_jis_2004', 'shift_jisx0213',
    'utf_32', 'utf_32_be', 'utf_32_le',
    'utf_16', 'utf_16_be', 'utf_16_le',
    'utf_7', 'utf_8', 'utf_8_sig',
]
# Characters with special meaning in a regular expression. An unescaped
# occurrence in a non-fixed pattern is copied to the output untouched.
REGEX_CHARS = """()[]{}?*+-|^$\\.&~#="""

# Single-character backslash escapes: the character that follows the
# backslash, mapped to the byte value it stands for.
_SIMPLE_ESCAPES = {
    '\\': 0x5c,
    '\'': 0x27,
    '"': 0x22,
    'a': 0x07,
    'b': 0x08,
    'f': 0x0c,
    'n': 0x0a,
    'r': 0x0d,
    't': 0x09,
    'v': 0x0b,
}

# Whitespace characters that are ignored in hex-format patterns.
_HEX_WHITESPACE = ' \t\n\r'


def byte_escape( char ):
    """Return the regex escape for one byte value.

    Args:
        char: Integer byte value.

    Returns:
        ASCII bytes consisting of a backslash, ``x`` and the value as two
        lowercase hexadecimal digits.
    """
    return '\\x{:02x}'.format( char ).encode( 'utf8' )


def regex_pattern_to_bytes( pattern, encoding='utf8', fixed_string=False, hex_format=False ):
    """Convert a text regex pattern into an equivalent bytes regex pattern.

    Literal characters are encoded with ``encoding`` and every resulting byte
    is emitted as a hexadecimal escape. Regex special characters (see
    ``REGEX_CHARS``) are copied through unchanged, as is everything between
    an unescaped ``{`` and the next ``}``.

    Args:
        pattern: The pattern, as a str.
        encoding: Codec used to encode literal characters. ``utf16`` and
            ``utf32`` (any case, with or without spaces, hyphens or
            underscores) are replaced by their little-endian variants so that
            no byte-order mark is emitted for each character.
        fixed_string: If true, treat every character as a literal; neither
            backslash escapes nor regex special characters are interpreted.
        hex_format: If true, literals are written as pairs of hexadecimal
            digits; spaces, tabs and line breaks are ignored and backslash
            escapes are not interpreted.

    Returns:
        The converted pattern as bytes.

    Raises:
        ValueError: For an unknown or incomplete backslash escape, or for an
            invalid hexadecimal pair in hex-format mode.
    """
    # for hex format mode, strip out all whitespace characters first
    if hex_format:
        pattern = ''.join( c for c in pattern if c not in _HEX_WHITESPACE )

    # strip out the automatic byte-order mark
    encoding_test = encoding.lower().replace( ' ', '' ).replace( '-', '' ).replace( '_', '' )
    if encoding_test == 'utf16':
        encoding = 'utf-16-le'
    elif encoding_test == 'utf32':
        encoding = 'utf-32-le'

    result = bytearray()
    pointer = 0
    repeat_block = False
    while pointer < len( pattern ):
        char = pattern[pointer]
        if char == '\\' and not hex_format and not fixed_string:
            # an escaped character!
            # slicing (rather than indexing) yields '' when the backslash is
            # the last character, so it can be rejected with a ValueError
            # instead of failing with an IndexError
            following = pattern[pointer+1:pointer+2]
            if not following:
                raise ValueError( 'Incomplete escape sequence at end of pattern' )
            if re.match( r'\\x[0-9A-Fa-f]{2}', pattern[pointer:pointer+4] ):
                # escaped hex byte
                result.extend( byte_escape( int( pattern[pointer+2:pointer+4], 16 ) ) )
                pointer += 4
            elif following in _SIMPLE_ESCAPES:
                # escaped single character
                result.extend( byte_escape( _SIMPLE_ESCAPES[following] ) )
                pointer += 2
            elif following in REGEX_CHARS:
                # escaped character that's also a regex char
                result.extend( byte_escape( ord( following ) ) )
                pointer += 2
            else:
                raise ValueError( 'Unknown escape sequence \\{}'.format( following ) )
        elif char in REGEX_CHARS and not fixed_string:
            # a regex special character! inject it into the output unchanged
            if char == '{':
                repeat_block = True
            elif char == '}':
                repeat_block = False
            result.extend( char.encode( 'utf8' ) )
            pointer += 1
        elif repeat_block:
            # inside a repeat block, don't encode anything
            result.extend( char.encode( 'utf8' ) )
            pointer += 1
        elif hex_format:
            # we're in hex string mode; treat as raw hexadecimal
            pair = pattern[pointer:pointer+2]
            if not re.match( r'[0-9A-Fa-f]{2}', pair ):
                raise ValueError( 'Sequence {} is not valid hexadecimal'.format( pair ) )
            result.extend( byte_escape( int( pair, 16 ) ) )
            pointer += 2
        else:
            # a normal character! encode as bytes, and inject escaped digits into the output
            for byte in char.encode( encoding ):
                result.extend( byte_escape( byte ) )
            pointer += 1
    return bytes( result )
def regex_unknown_encoding_match( string, char_size=1 ):
    """Build a bytes regex that matches ``string`` under an unknown character encoding.

    Each distinct character of ``string`` becomes a named group ``pN`` that
    matches any ``char_size`` bytes; a repeated character becomes a
    backreference to its group. Every group after the first carries a
    negative lookbehind so that it cannot capture the same bytes as an
    earlier group.

    Args:
        string: Sequence of characters to search for.
        char_size: Number of bytes matched per character.

    Returns:
        A ``(match_map, pattern)`` tuple: ``match_map`` maps each distinct
        character to its group number, ``pattern`` is the regex as bytes.

    A warning is logged when the number of distinct characters equals the
    length of ``string`` (i.e. nothing repeats).
    """
    match_map = {}
    fragments = []
    for char in string:
        if char in match_map:
            # seen before: must equal whatever the earlier group captured
            fragments.append( '(?P=p{})'.format( match_map[char] ).encode( 'utf8' ) )
            continue

        match_id = len( match_map )
        match_group = '?P<p{}>.'.format( match_id ).encode( 'utf8' )
        if char_size != 1:
            match_group += b'{' + '{}'.format( char_size ).encode( 'utf8' ) + b'}'

        if not fragments:
            # very first character: nothing earlier to differ from
            fragments.append( b'(' + match_group + b')' )
        else:
            # new character: forbid it from matching any earlier capture
            earlier = b'|'.join(
                '(?P=p{})'.format( index ).encode( 'utf8' ) for index in match_map.values()
            )
            fragments.append( b'(' + match_group + b'(?<!' + earlier + b'))' )
        match_map[char] = match_id

    if len( string ) == len( match_map ):
        logger.warning( 'Input has no repeated characters! This can make an enormous number of false matches, and is likely not what you want' )
    return match_map, b''.join( fragments )
# Short name for every raw numeric type, keyed by
# (python type, size in bytes, signedness, endianness).
RAW_TYPE_NAME = {
    # single-byte integers: byte order is irrelevant, so every endianness
    # (including None) maps to the same name
    (int, 1, 'signed', 'little'): 'int8',
    (int, 1, 'unsigned', 'little'): 'uint8',
    (int, 1, 'signed', 'big'): 'int8',
    (int, 1, 'unsigned', 'big'): 'uint8',
    (int, 1, 'signed', None): 'int8',
    (int, 1, 'unsigned', None): 'uint8',

    # little-endian
    (int, 2, 'signed', 'little'): 'int16_le',
    (int, 3, 'signed', 'little'): 'int24_le',
    (int, 4, 'signed', 'little'): 'int32_le',
    (int, 8, 'signed', 'little'): 'int64_le',
    (int, 2, 'unsigned', 'little'): 'uint16_le',
    (int, 3, 'unsigned', 'little'): 'uint24_le',
    (int, 4, 'unsigned', 'little'): 'uint32_le',
    (int, 8, 'unsigned', 'little'): 'uint64_le',
    (float, 4, 'signed', 'little'): 'float32_le',
    (float, 8, 'signed', 'little'): 'float64_le',

    # big-endian
    (int, 2, 'signed', 'big'): 'int16_be',
    (int, 3, 'signed', 'big'): 'int24_be',
    (int, 4, 'signed', 'big'): 'int32_be',
    (int, 8, 'signed', 'big'): 'int64_be',
    (int, 2, 'unsigned', 'big'): 'uint16_be',
    (int, 3, 'unsigned', 'big'): 'uint24_be',
    (int, 4, 'unsigned', 'big'): 'uint32_be',
    (int, 8, 'unsigned', 'big'): 'uint64_be',
    (float, 4, 'signed', 'big'): 'float32_be',
    (float, 8, 'signed', 'big'): 'float64_be',
}

# Inverse lookup: name -> type tuple. Where several tuples share a name
# (the single-byte types) the last one listed above wins, i.e. the one
# whose endianness is None.
RAW_TYPE_NAME_REVERSE = dict( zip( RAW_TYPE_NAME.values(), RAW_TYPE_NAME.keys() ) )
# struct-module format character for each raw type, keyed by
# (python type, size in bytes, signedness). There are no 3-byte entries.
RAW_TYPE_STRUCT = {
    (int, 1, 'unsigned'): 'B',
    (int, 1, 'signed'): 'b',
    (int, 2, 'unsigned'): 'H',
    (int, 2, 'signed'): 'h',
    (int, 4, 'unsigned'): 'I',
    (int, 4, 'signed'): 'i',
    (int, 8, 'unsigned'): 'Q',
    (int, 8, 'signed'): 'q',
    (float, 4, 'signed'): 'f',
    (float, 8, 'signed'): 'd',
}

# Converter registries, keyed by (type, size, signedness, endianness).
# They start out empty and are filled in further down the module.
FROM_RAW_TYPE = {}
TO_RAW_TYPE = {}
FROM_RAW_TYPE_ARRAY = {}
TO_RAW_TYPE_ARRAY = {}


def get_raw_type_struct( format_type, field_size, signedness, endian, count=None ):
    """Return the struct format string for a raw type.

    Args:
        format_type: Python type of the value (``int`` or ``float``).
        field_size: Size of one value in bytes.
        signedness: ``'signed'`` or ``'unsigned'``.
        endian: ``'big'`` selects big-endian; any other value (including
            ``None``) selects little-endian.
        count: Optional repeat count placed before the format character.

    Returns:
        A format string such as ``'<H'`` or ``'>3d'``.

    Raises:
        KeyError: If ``RAW_TYPE_STRUCT`` has no entry for the type.
    """
    byte_order = '>' if endian == 'big' else '<'
    repeat = '' if count is None else count
    format_char = RAW_TYPE_STRUCT[(format_type, field_size, signedness)]
    return '{}{}{}'.format( byte_order, repeat, format_char )
def get_raw_type_description( format_type, field_size, signedness, endian ):
    """Describe a raw type in plain English.

    Args:
        format_type: Python type of the value (``int`` or ``float``).
        field_size: Size of one value in bytes.
        signedness: ``'signed'`` or ``'unsigned'``; only mentioned in the
            description for integers.
        endian: Byte order name; only mentioned when ``field_size`` is
            greater than 1.

    Returns:
        A ``(description, type_name)`` tuple, e.g.
        ``('signed 16-bit integer (little-endian)', 'integer')``.

    Raises:
        KeyError: If ``format_type`` is neither ``int`` nor ``float``.
    """
    type_name = {
        int: 'integer',
        float: 'floating-point number',
    }[format_type]

    if format_type == int:
        sign_text = 'signed ' if signedness == 'signed' else 'unsigned '
    else:
        sign_text = ''

    if field_size > 1:
        endian_text = ' ({}-endian)'.format( endian )
    else:
        endian_text = ''

    description = '{}{}-bit {}{}'.format( sign_text, field_size*8, type_name, endian_text )
    return (description, type_name)
def _from_raw_type( type_id ):
    """Build a function that decodes one value of the raw type ``type_id``.

    ``type_id`` is a (type, size, signedness, endianness) tuple. The returned
    function takes a byte string, unpacks it with ``struct`` and returns the
    single decoded value; its docstring is generated from the type
    description.
    """
    description, type_name = get_raw_type_description( *type_id )

    def convert( buffer ):
        unpacked = struct.unpack( get_raw_type_struct( *type_id ), buffer )
        return unpacked[0]

    convert.__doc__ = 'Convert a {0} byte string to a Python {1}.'.format( description, type_name )
    return convert
def _to_raw_type( type_id ):
    """Build a function that encodes one value as the raw type ``type_id``.

    ``type_id`` is a (type, size, signedness, endianness) tuple. The returned
    function takes a Python value and returns it packed with ``struct`` as a
    byte string; its docstring is generated from the type description.
    """
    description, type_name = get_raw_type_description( *type_id )

    def convert( value ):
        layout = get_raw_type_struct( *type_id )
        return struct.pack( layout, value )

    convert.__doc__ = 'Convert a Python {1} to a {0} byte string.'.format( description, type_name )
    return convert
def _from_raw_type_array( type_id ):
    """Build a function that decodes a packed array of the raw type ``type_id``.

    The returned function takes a byte string, derives the element count
    from its length divided (rounding down) by the element size, unpacks it
    in a single ``struct`` call and returns the values as a list.
    """
    description, type_name = get_raw_type_description( *type_id )
    item_size = type_id[1]

    def convert( buffer ):
        layout = get_raw_type_struct( *type_id, count=len( buffer )//item_size )
        return list( struct.unpack( layout, buffer ) )

    convert.__doc__ = 'Convert a {0} byte string to a Python list of {1}s.'.format( description, type_name )
    return convert
def _to_raw_type_array( type_id ):
    """Build a function that encodes a list of values as the raw type ``type_id``.

    The returned function takes a list of Python values and packs all of
    them in a single ``struct`` call, returning the resulting byte string.
    """
    description, type_name = get_raw_type_description( *type_id )

    def convert( value_list ):
        layout = get_raw_type_struct( *type_id, count=len( value_list ) )
        return struct.pack( layout, *value_list )

    convert.__doc__ = 'Convert a Python list of {1}s to a {0} byte string.'.format( description, type_name )
    return convert
def _from_generic_array( type_id, from_raw ):
    """Build an array decoder on top of a single-value decoder.

    The returned function slices its input into consecutive chunks of the
    element size (``type_id[1]`` bytes) and passes each chunk to
    ``from_raw``, returning the results as a list.
    """
    description, type_name = get_raw_type_description( *type_id )

    def convert( buffer ):
        item_size = type_id[1]
        offsets = range( 0, len( buffer ), item_size )
        return [from_raw( buffer[start:start+item_size] ) for start in offsets]

    convert.__doc__ = 'Convert a {0} byte string to a Python list of {1}s.'.format( description, type_name )
    return convert
def _to_generic_array( type_id, to_raw ):
    """Build an array encoder on top of a single-value encoder.

    The returned function passes every element of its input list to
    ``to_raw`` and concatenates the resulting byte strings.
    """
    description, type_name = get_raw_type_description( *type_id )

    def convert( value_list ):
        return b''.join( map( to_raw, value_list ) )

    convert.__doc__ = 'Convert a Python list of {1}s to a {0} byte string.'.format( description, type_name )
    return convert
# Generate and register the struct-backed converters for every entry in
# RAW_TYPE_STRUCT, once per supported byte order.
for format_type, field_size, signedness in RAW_TYPE_STRUCT:
    if field_size == 1:
        # single-byte types are additionally registered with no byte order
        endian_choices = [None, 'little', 'big']
    else:
        endian_choices = ['little', 'big']
    for endian in endian_choices:
        type_id = (format_type, field_size, signedness, endian)
        # decoders
        FROM_RAW_TYPE[type_id] = _from_raw_type( type_id )
        FROM_RAW_TYPE_ARRAY[type_id] = _from_raw_type_array( type_id )
        # encoders
        TO_RAW_TYPE[type_id] = _to_raw_type( type_id )
        TO_RAW_TYPE_ARRAY[type_id] = _to_raw_type_array( type_id )
# Names of the 24-bit integer types. RAW_TYPE_STRUCT has no 3-byte entries,
# so these get their own converters.
RAW_24 = [
    'int24_le',
    'uint24_le',
    'int24_be',
    'uint24_be',
]
def _from_raw_24( type_id ):
    """Build a decoder for the 24-bit integer type described by ``type_id``.

    ``type_id`` is a (type, size, signedness, endianness) tuple and must
    describe a 3-byte integer with an explicit byte order.

    The returned function accepts a bytes-like object of exactly 3 bytes,
    widens it to 4 bytes (sign-extending when the type is signed and the
    top bit is set) and decodes it with the matching 32-bit converter from
    ``FROM_RAW_TYPE``.

    The returned function raises ``struct.error`` if the input is not
    exactly 3 bytes long, mirroring what the struct-backed converters do
    for a wrongly sized buffer.
    """
    format_type, field_size, signedness, endian = type_id
    # sanity checks on the factory's own arguments, not on user data
    assert format_type == int
    assert field_size == 3
    assert endian in ('little', 'big')
    assert signedness in ('signed', 'unsigned')
    wide_type_id = (format_type, 4, signedness, endian)

    def result( buffer ):
        # normalise any bytes-like input (bytes, bytearray, memoryview) so
        # that indexing and concatenation below behave uniformly
        raw = bytes( memoryview( buffer ) )
        if len( raw ) != 3:
            raise struct.error( 'unpack requires a buffer of 3 bytes' )
        most_significant = raw[2] if endian == 'little' else raw[0]
        negative = signedness == 'signed' and most_significant >= 0x80
        padding = b'\xff' if negative else b'\x00'
        # the padding byte goes on the most-significant end
        widened = raw + padding if endian == 'little' else padding + raw
        return FROM_RAW_TYPE[wide_type_id]( widened )

    result.__doc__ = 'Convert a {0} byte string to a Python {1}.'.format(
        *get_raw_type_description( *type_id )
    )
    return result
def _to_raw_24( type_id ):
    """Build an encoder for the 24-bit integer type described by ``type_id``.

    ``type_id`` is a (type, size, signedness, endianness) tuple and must
    describe a 3-byte integer with an explicit byte order.

    The returned function range-checks its argument, encodes it with the
    matching 32-bit converter from ``TO_RAW_TYPE`` and drops the
    most-significant byte, returning 3 bytes.

    The returned function raises ``struct.error`` if the value does not fit
    in 24 bits, the same exception the struct-backed converters raise for
    out-of-range values. The check is an explicit ``raise`` rather than an
    ``assert`` so that it still runs under ``python -O``, where it would
    otherwise be skipped and the value silently truncated.
    """
    format_type, field_size, signedness, endian = type_id
    # sanity checks on the factory's own arguments, not on user data
    assert format_type == int
    assert field_size == 3
    assert endian in ('little', 'big')
    assert signedness in ('signed', 'unsigned')
    wide_type_id = (format_type, 4, signedness, endian)

    # inclusive lower bound, exclusive upper bound
    if signedness == 'signed':
        lower, upper = -(1 << 23), 1 << 23
    else:
        lower, upper = 0, 1 << 24

    def result( value ):
        if not lower <= value < upper:
            raise struct.error(
                '24-bit format requires {} <= number <= {}'.format( lower, upper - 1 )
            )
        output = TO_RAW_TYPE[wide_type_id]( value )
        # discard the most-significant byte of the 32-bit encoding
        if endian == 'little':
            return output[:3]
        return output[1:]

    result.__doc__ = 'Convert a Python {1} to a {0} byte string.'.format(
        *get_raw_type_description( *type_id )
    )
    return result
# Register the 24-bit converters; the array variants are built on top of
# the single-value converters registered just before them.
for code in RAW_24:
    type_id = RAW_TYPE_NAME_REVERSE[code]
    # decoders
    FROM_RAW_TYPE[type_id] = _from_raw_24( type_id )
    FROM_RAW_TYPE_ARRAY[type_id] = _from_generic_array( type_id, FROM_RAW_TYPE[type_id] )
    # encoders
    TO_RAW_TYPE[type_id] = _to_raw_24( type_id )
    TO_RAW_TYPE_ARRAY[type_id] = _to_generic_array( type_id, TO_RAW_TYPE[type_id] )
def _load_raw_types():
    """Collect every registered converter under a descriptive name.

    Returns:
        A dict mapping ``from_<name>``, ``to_<name>``, ``from_<name>_array``
        and ``to_<name>_array`` to the corresponding converter function,
        where ``<name>`` is the type's entry in ``RAW_TYPE_NAME``. When
        several type tuples share a name, the one registered last wins.
    """
    sources = (
        ('from_{}', FROM_RAW_TYPE),
        ('to_{}', TO_RAW_TYPE),
        ('from_{}_array', FROM_RAW_TYPE_ARRAY),
        ('to_{}_array', TO_RAW_TYPE_ARRAY),
    )
    result = {}
    for name_template, registry in sources:
        for type_id, converter in registry.items():
            result[name_template.format( RAW_TYPE_NAME[type_id] )] = converter
    return result
def unpack( type_id, value ):
    """Decode a single value from a byte string.

    Args:
        type_id: Either a type name found in ``RAW_TYPE_NAME_REVERSE`` or a
            (type, size, signedness, endianness) tuple.
        value: The byte string to decode.

    Returns:
        Whatever the converter registered in ``FROM_RAW_TYPE`` returns.

    Raises:
        KeyError: If the type name or tuple is not registered.
    """
    key = RAW_TYPE_NAME_REVERSE[type_id] if isinstance( type_id, str ) else type_id
    converter = FROM_RAW_TYPE[key]
    return converter( value )
def pack( type_id, value ):
    """Encode a single value as a byte string.

    Args:
        type_id: Either a type name found in ``RAW_TYPE_NAME_REVERSE`` or a
            (type, size, signedness, endianness) tuple.
        value: The Python value to encode.

    Returns:
        Whatever the converter registered in ``TO_RAW_TYPE`` returns.

    Raises:
        KeyError: If the type name or tuple is not registered.
    """
    key = RAW_TYPE_NAME_REVERSE[type_id] if isinstance( type_id, str ) else type_id
    converter = TO_RAW_TYPE[key]
    return converter( value )
def unpack_array( type_id, values ):
    """Decode a packed array of values from a byte string.

    Args:
        type_id: Either a type name found in ``RAW_TYPE_NAME_REVERSE`` or a
            (type, size, signedness, endianness) tuple.
        values: The byte string to decode.

    Returns:
        Whatever the converter registered in ``FROM_RAW_TYPE_ARRAY`` returns.

    Raises:
        KeyError: If the type name or tuple is not registered.
    """
    key = RAW_TYPE_NAME_REVERSE[type_id] if isinstance( type_id, str ) else type_id
    converter = FROM_RAW_TYPE_ARRAY[key]
    return converter( values )
def pack_array( type_id, values ):
    """Encode a list of values as a byte string.

    Args:
        type_id: Either a type name found in ``RAW_TYPE_NAME_REVERSE`` or a
            (type, size, signedness, endianness) tuple.
        values: The Python values to encode.

    Returns:
        Whatever the converter registered in ``TO_RAW_TYPE_ARRAY`` returns.

    Raises:
        KeyError: If the type name or tuple is not registered.
    """
    key = RAW_TYPE_NAME_REVERSE[type_id] if isinstance( type_id, str ) else type_id
    converter = TO_RAW_TYPE_ARRAY[key]
    return converter( values )