from __future__ import annotations
import logging
import re
import struct
from typing import Callable, Optional, Sequence, Tuple, Type, Union
from typing_extensions import Literal
from mrcrowbar.common import BytesReadType
# Module-level logger, named after this module.
logger = logging.getLogger( __name__ )

# The Python types a raw value can be converted to/from.
NumberType = Union[Type[int], Type[float]]
# A converted numeric value.
Number = Union[int, float]
SignedEncoding = Literal["signed", "unsigned"]
EndianEncoding = Literal["big", "little"]
# Key identifying one raw numeric encoding:
# (Python type, field size in bytes, signedness, byte order).
# The byte order is Optional; the single-byte entries of RAW_TYPE_NAME
# include keys whose byte order is None.
NumberEncoding = Tuple[NumberType, int, SignedEncoding, Optional[EndianEncoding]]
# Python doesn't provide a programmatic way of fetching the supported codec list.
# The list below is taken from the 3.7 manual.
CODECS = [
    "ascii",
    "big5",
    "big5hkscs",
    "cp037",
    "cp273",
    "cp424",
    "cp437",
    "cp500",
    "cp720",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp858",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "cp1006",
    "cp1026",
    "cp1125",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "euc_jp",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_kr",
    "gb2312",
    "gbk",
    "gb18030",
    "hz",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "latin_1",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "johab",
    "koi8_r",
    "koi8_t",
    "koi8_u",
    "kz1048",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "ptcp154",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "utf_32",
    "utf_32_be",
    "utf_32_le",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
    "utf_8_sig",
]
# Characters that are passed through to the output pattern unchanged (unless
# fixed_string is set), or that may follow a backslash to be matched literally.
REGEX_CHARS = """()[]{}?*+-|^$\\.&~#="""


def byte_escape( char: int ) -> bytes:
    r"""Return the ``\xNN`` regex escape sequence (as bytes) for one byte value."""
    return f"\\x{char:02x}".encode()


def regex_pattern_to_bytes(
    pattern: str,
    encoding: str = "utf8",
    fixed_string: bool = False,
    hex_format: bool = False,
) -> bytes:
    r"""Convert a text regex pattern into an equivalent bytes regex pattern.

    Ordinary characters are encoded with ``encoding`` and every resulting byte
    is written to the output as a ``\xNN`` escape. Characters listed in
    ``REGEX_CHARS`` are copied to the output unchanged, as is everything between
    ``{`` and ``}`` (a repeat count).

    pattern
        The pattern to convert.

    encoding
        Codec used to encode ordinary characters. ``utf16``/``utf32`` are
        replaced with their little-endian variants and ``utf-8-sig`` with
        ``utf-8``, so that no byte-order mark is emitted for each character.

    fixed_string
        Treat the whole pattern as literal text: neither backslash escapes nor
        regex special characters are interpreted.

    hex_format
        Treat the literal parts of the pattern as pairs of hexadecimal digits.
        Spaces, tabs and line breaks are removed first, and backslash escapes
        are not interpreted.

    Raises ValueError for an unknown or incomplete escape sequence, or (in hex
    mode) for a sequence that is not a pair of hexadecimal digits.
    """
    result = bytearray()

    # for hex format mode, strip out all whitespace characters first
    if hex_format:
        pattern = pattern.translate( str.maketrans( "", "", " \t\n\r" ) )

    # strip out the automatic byte-order mark; the encoder would otherwise
    # prepend one to every single character encoded below
    encoding_test = (
        encoding.lower().replace( " ", "" ).replace( "-", "" ).replace( "_", "" )
    )
    if encoding_test == "utf16":
        encoding = "utf-16-le"
    elif encoding_test == "utf32":
        encoding = "utf-32-le"
    elif encoding_test == "utf8sig":
        encoding = "utf-8"

    # single-character escapes (e.g. \n), mapped to the byte value they denote
    simple_escapes = dict(
        zip( "\\'\"abfnrtv", map( ord, "\\'\"\a\b\f\n\r\t\v" ) )
    )

    pointer = 0
    repeat_block = False
    while pointer < len( pattern ):
        char = pattern[pointer]
        if char == "\\" and not hex_format and not fixed_string:
            # an escaped character!
            # slicing yields "" rather than raising when the backslash is last
            following = pattern[pointer + 1 : pointer + 2]
            if re.match( r"\\x[0-9A-Fa-f]{2}", pattern[pointer : pointer + 4] ):
                # escaped hex byte
                result.extend(
                    byte_escape( int( pattern[pointer + 2 : pointer + 4], 16 ) )
                )
                pointer += 4
            elif following in simple_escapes:
                # escaped single character
                result.extend( byte_escape( simple_escapes[following] ) )
                pointer += 2
            elif following and following in REGEX_CHARS:
                # escaped character that's also a regex char
                result.extend( byte_escape( ord( following ) ) )
                pointer += 2
            elif not following:
                raise ValueError( "Incomplete escape sequence at end of pattern" )
            else:
                raise ValueError( f"Unknown escape sequence \\{following}" )
        elif char in REGEX_CHARS and not fixed_string:
            # a regex special character! inject it into the output unchanged
            if char == "{":
                repeat_block = True
            elif char == "}":
                repeat_block = False
            result.extend( char.encode( "utf8" ) )
            pointer += 1
        elif repeat_block:
            # inside a repeat block, don't encode anything
            result.extend( char.encode( "utf8" ) )
            pointer += 1
        elif hex_format:
            # we're in hex string mode; treat as raw hexadecimal
            pair = pattern[pointer : pointer + 2]
            if not re.match( r"[0-9A-Fa-f]{2}", pair ):
                raise ValueError( f"Sequence {pair} is not valid hexadecimal" )
            result.extend( byte_escape( int( pair, 16 ) ) )
            pointer += 2
        else:
            # a normal character! encode as bytes, and inject escaped digits into the output
            for byte in char.encode( encoding ):
                result.extend( byte_escape( byte ) )
            pointer += 1
    return bytes( result )
def regex_unknown_encoding_match(
    source: str, char_size: int = 1
) -> tuple[dict[str, int], bytes]:
    """Build a bytes regex that matches a string without knowing its encoding.

    Each distinct character of ``source`` becomes a named group ``pN`` that
    matches ``char_size`` arbitrary regex characters (``.``). Every group after
    the first carries a negative lookbehind so it cannot equal any previously
    captured group, and each repeated character becomes a backreference to
    its group.

    Returns a tuple of (mapping of character to group number, regex pattern).
    A warning is logged when no character of ``source`` repeats.
    """
    group_ids: dict[str, int] = {}
    pieces: list[bytes] = []
    # quantifier appended to "." when a character spans more than one unit
    width = b"" if char_size == 1 else b"{" + f"{char_size}".encode() + b"}"

    for char in source:
        known_id = group_ids.get( char )
        if known_id is not None:
            # seen before: must equal the earlier capture
            pieces.append( f"(?P=p{known_id})".encode() )
            continue

        new_id = len( group_ids )
        capture = f"?P<p{new_id}>.".encode() + width
        if pieces:
            # a new character must differ from every character captured so far
            earlier = b"|".join(
                f"(?P=p{index})".encode() for index in group_ids.values()
            )
            pieces.append( b"(" + capture + b"(?<!" + earlier + b"))" )
        else:
            pieces.append( b"(" + capture + b")" )
        group_ids[char] = new_id

    if len( source ) == len( group_ids ):
        logger.warning(
            "Input has no repeated characters! This can make an enormous number of false matches, and is likely not what you want"
        )
    return group_ids, b"".join( pieces )
# Short name for each raw numeric encoding. Single-byte types map every byte
# order (including None) to the same name.
RAW_TYPE_NAME: dict[NumberEncoding, str] = {
    (int, 1, "signed", "little"): "int8",
    (int, 1, "unsigned", "little"): "uint8",
    (int, 1, "signed", "big"): "int8",
    (int, 1, "unsigned", "big"): "uint8",
    (int, 1, "signed", None): "int8",
    (int, 1, "unsigned", None): "uint8",
    (int, 2, "signed", "little"): "int16_le",
    (int, 3, "signed", "little"): "int24_le",
    (int, 4, "signed", "little"): "int32_le",
    (int, 8, "signed", "little"): "int64_le",
    (int, 2, "unsigned", "little"): "uint16_le",
    (int, 3, "unsigned", "little"): "uint24_le",
    (int, 4, "unsigned", "little"): "uint32_le",
    (int, 8, "unsigned", "little"): "uint64_le",
    (float, 4, "signed", "little"): "float32_le",
    (float, 8, "signed", "little"): "float64_le",
    (int, 2, "signed", "big"): "int16_be",
    (int, 3, "signed", "big"): "int24_be",
    (int, 4, "signed", "big"): "int32_be",
    (int, 8, "signed", "big"): "int64_be",
    (int, 2, "unsigned", "big"): "uint16_be",
    (int, 3, "unsigned", "big"): "uint24_be",
    (int, 4, "unsigned", "big"): "uint32_be",
    (int, 8, "unsigned", "big"): "uint64_be",
    (float, 4, "signed", "big"): "float32_be",
    (float, 8, "signed", "big"): "float64_be",
}
# Name -> encoding lookup. Where a name appears more than once above, the last
# entry wins, so "int8" and "uint8" resolve to the keys with a byte order of None.
RAW_TYPE_NAME_REVERSE = {v: k for k, v in RAW_TYPE_NAME.items()}
# struct module format character for each (type, size, signedness); byte order
# is not part of the key. There are no 3-byte entries.
RAW_TYPE_STRUCT: dict[tuple[NumberType, int, SignedEncoding], str] = {
    (int, 1, "unsigned"): "B",
    (int, 1, "signed"): "b",
    (int, 2, "unsigned"): "H",
    (int, 2, "signed"): "h",
    (int, 4, "unsigned"): "I",
    (int, 4, "signed"): "i",
    (int, 8, "unsigned"): "Q",
    (int, 8, "signed"): "q",
    (float, 4, "signed"): "f",
    (float, 8, "signed"): "d",
}
# Converter registries keyed by encoding. They start out empty and are filled in
# by the module-level loops further down this file.
FROM_RAW_TYPE: dict[NumberEncoding, Callable[[BytesReadType], Number]] = {}
TO_RAW_TYPE: dict[NumberEncoding, Callable[[Number], bytes]] = {}
FROM_RAW_TYPE_ARRAY: dict[NumberEncoding, Callable[[BytesReadType], list[Number]]] = {}
TO_RAW_TYPE_ARRAY: dict[NumberEncoding, Callable[[Sequence[Number]], bytes]] = {}
def get_raw_type_struct(
    format_type: NumberType,
    field_size: int,
    signedness: SignedEncoding,
    endian: EndianEncoding,
    count: int | None = None,
) -> str:
    """Return the struct module format string for a raw numeric encoding.

    The string is a byte-order prefix (">" when ``endian`` is "big", "<" for
    anything else), then ``count`` as a repeat count if one was given, then the
    format character looked up in RAW_TYPE_STRUCT.

    Raises KeyError if (format_type, field_size, signedness) is not in
    RAW_TYPE_STRUCT.
    """
    type_code = RAW_TYPE_STRUCT[(format_type, field_size, signedness)]
    order_prefix = "<"
    if endian == "big":
        order_prefix = ">"
    repeat = "" if count is None else f"{count}"
    return order_prefix + repeat + type_code
def get_raw_type_description(
    format_type: NumberType,
    field_size: int,
    signedness: SignedEncoding,
    endian: EndianEncoding,
) -> tuple[str, str]:
    """Describe a raw numeric encoding in words.

    Returns a tuple of (full description, type name), for example
    ("signed 16-bit integer (little-endian)", "integer"). Signedness is only
    mentioned for integers, and the byte order only for fields larger than
    one byte.

    Raises KeyError if ``format_type`` is neither int nor float.
    """
    type_name = {int: "integer", float: "floating-point number"}[format_type]

    words = []
    if format_type == int:
        words.append( "signed" if signedness == "signed" else "unsigned" )
    words.append( f"{field_size * 8}-bit" )
    words.append( type_name )
    description = " ".join( words )

    if field_size > 1:
        description += f" ({endian}-endian)"
    return description, type_name
def _from_raw_type(
    format_type: NumberType,
    field_size: int,
    signedness: SignedEncoding,
    endian: EndianEncoding,
) -> Callable[[BytesReadType], Number]:
    """Create a function that converts a byte string to a single number.

    The returned function unpacks its argument with the struct format for the
    given encoding and returns the first (only) value. Its docstring is
    generated from get_raw_type_description().
    """
    # The format never changes for a given converter, so compile it once here
    # rather than rebuilding and re-parsing the format string on every call.
    unpacker = struct.Struct(
        get_raw_type_struct( format_type, field_size, signedness, endian )
    )

    def result( buffer: BytesReadType ) -> Number:
        return unpacker.unpack( buffer )[0]

    result.__doc__ = "Convert a {} byte string to a Python {}.".format(
        *get_raw_type_description( format_type, field_size, signedness, endian )
    )
    return result
def _to_raw_type(
    format_type: NumberType,
    field_size: int,
    signedness: SignedEncoding,
    endian: EndianEncoding,
) -> Callable[[Number], bytes]:
    """Create a function that converts a single number to a byte string.

    The returned function packs its argument with the struct format for the
    given encoding. Its docstring is generated from
    get_raw_type_description().
    """
    # The format never changes for a given converter, so compile it once here
    # rather than rebuilding and re-parsing the format string on every call.
    packer = struct.Struct(
        get_raw_type_struct( format_type, field_size, signedness, endian )
    )

    def result( value: Number ) -> bytes:
        return packer.pack( value )

    result.__doc__ = "Convert a Python {1} to a {0} byte string.".format(
        *get_raw_type_description( format_type, field_size, signedness, endian )
    )
    return result
def _from_raw_type_array(
    format_type: NumberType,
    field_size: int,
    signedness: SignedEncoding,
    endian: EndianEncoding,
) -> Callable[[BytesReadType], list[Number]]:
    """Create a function that converts a byte string to a list of numbers.

    The returned function unpacks ``len( buffer ) // field_size`` values from
    its argument in a single struct call. Its docstring is generated from
    get_raw_type_description().
    """

    def result( buffer: BytesReadType ) -> list[Number]:
        item_count = len( buffer ) // field_size
        layout = get_raw_type_struct(
            format_type, field_size, signedness, endian, count=item_count
        )
        return list( struct.unpack( layout, buffer ) )

    description = get_raw_type_description(
        format_type, field_size, signedness, endian
    )
    result.__doc__ = "Convert a {} byte string to a Python list of {}s.".format(
        *description
    )
    return result
def _to_raw_type_array(
    format_type: NumberType,
    field_size: int,
    signedness: SignedEncoding,
    endian: EndianEncoding,
) -> Callable[[Sequence[Number]], bytes]:
    """Create a function that converts a sequence of numbers to a byte string.

    The returned function packs every item of its argument in a single struct
    call, using the sequence length as the repeat count. Its docstring is
    generated from get_raw_type_description().
    """

    def result( value_list: Sequence[Number] ) -> bytes:
        layout = get_raw_type_struct(
            format_type, field_size, signedness, endian, count=len( value_list )
        )
        return struct.pack( layout, *value_list )

    description = get_raw_type_description(
        format_type, field_size, signedness, endian
    )
    result.__doc__ = "Convert a Python list of {1}s to a {0} byte string.".format(
        *description
    )
    return result
def _from_generic_array(
    type_id: NumberEncoding, from_raw: Callable[[BytesReadType], Number]
):
    """Create a byte-string-to-list converter on top of a scalar converter.

    The returned function cuts its argument into consecutive slices of
    ``type_id[1]`` bytes and converts each slice with ``from_raw``. Its
    docstring is generated from get_raw_type_description().
    """

    def result( buffer: BytesReadType ) -> list[Number]:
        item_size = type_id[1]
        values = []
        for start in range( 0, len( buffer ), item_size ):
            values.append( from_raw( buffer[start : start + item_size] ) )
        return values

    description = get_raw_type_description( *type_id )
    result.__doc__ = "Convert a {} byte string to a Python list of {}s.".format(
        *description
    )
    return result
def _to_generic_array( type_id: NumberEncoding, to_raw: Callable[[Number], bytes] ):
    """Create a list-to-byte-string converter on top of a scalar converter.

    The returned function converts each item of its argument with ``to_raw``
    and concatenates the results. Its docstring is generated from
    get_raw_type_description().
    """

    def result( value_list: Sequence[Number] ) -> bytes:
        return b"".join( map( to_raw, value_list ) )

    description = get_raw_type_description( *type_id )
    result.__doc__ = "Convert a Python list of {1}s to a {0} byte string.".format(
        *description
    )
    return result
# autogenerate conversion methods based on struct
# For every (type, size, signedness) that has a struct format character, register
# scalar and array converters under each byte order. Single-byte types are
# additionally registered with a byte order of None.
for format_type, field_size, signedness in RAW_TYPE_STRUCT:
    endian_choices: list[Optional[EndianEncoding]] = (
        [None, "little", "big"] if field_size == 1 else ["little", "big"]
    )
    endian: Optional[EndianEncoding]
    for endian in endian_choices:
        type_id = (format_type, field_size, signedness, endian)
        FROM_RAW_TYPE[type_id] = _from_raw_type( *type_id )
        TO_RAW_TYPE[type_id] = _to_raw_type( *type_id )
        FROM_RAW_TYPE_ARRAY[type_id] = _from_raw_type_array( *type_id )
        TO_RAW_TYPE_ARRAY[type_id] = _to_raw_type_array( *type_id )
# 24-bit types
# Names (values of RAW_TYPE_NAME) of the 3-byte integer encodings. These have no
# entry in RAW_TYPE_STRUCT.
RAW_24 = ["int24_le", "uint24_le", "int24_be", "uint24_be"]
def _from_raw_24( type_id: NumberEncoding ):
    """Create a function that converts a 3-byte string to an integer.

    The returned function widens its argument to 4 bytes by adding one byte at
    the most significant end (0xff when the encoding is signed and the top bit
    of the most significant byte is set, 0x00 otherwise), then delegates to the
    4-byte converter registered in FROM_RAW_TYPE.

    ``type_id`` must describe a 3-byte integer with an explicit byte order.
    """
    signedness: SignedEncoding
    endian: EndianEncoding
    format_type, field_size, signedness, endian = type_id
    assert format_type == int
    assert field_size == 3
    assert endian in ("little", "big")
    assert signedness in ("signed", "unsigned")

    is_signed = signedness == "signed"
    widened_id = (format_type, 4, signedness, endian)

    def result( buffer: BytesReadType ):
        if endian == "little":
            # most significant byte is the last one; extend at the end
            raw = bytes( buffer )
            padding = b"\xff" if (is_signed and buffer[2] >= 0x80) else b"\x00"
            buffer = raw + padding
        elif endian == "big":
            # most significant byte is the first one; extend at the front
            padding = b"\xff" if (is_signed and buffer[0] >= 0x80) else b"\x00"
            buffer = padding + bytes( buffer )
        return FROM_RAW_TYPE[widened_id]( buffer )

    description = get_raw_type_description( *type_id )
    result.__doc__ = "Convert a {} byte string to a Python {}.".format( *description )
    return result
def _to_raw_24( type_id: NumberEncoding ):
    """Create a function that converts an integer to a 3-byte string.

    The returned function packs the value with the 4-byte converter registered
    in TO_RAW_TYPE and drops the most significant byte of the result.

    ``type_id`` must describe a 3-byte integer with an explicit byte order.

    The returned function raises struct.error if the value does not fit in 24
    bits. This is an explicit check rather than an assert, because asserts are
    removed under ``python -O`` and the 4-byte packer would then accept the
    value and silently lose its top byte.
    """
    signedness: SignedEncoding
    endian: EndianEncoding
    format_type, field_size, signedness, endian = type_id
    assert format_type == int
    assert field_size == 3
    assert endian in ("little", "big")
    assert signedness in ("signed", "unsigned")

    if signedness == "signed":
        lowest, highest = -(1 << 23), (1 << 23) - 1
    else:
        lowest, highest = 0, (1 << 24) - 1

    def result( value: Number ):
        if not lowest <= value <= highest:
            raise struct.error(
                f"24-bit {signedness} value {value!r} out of range "
                f"({lowest} <= value <= {highest})"
            )
        output = TO_RAW_TYPE[(format_type, 4, signedness, endian)]( value )
        # drop the most significant byte: last for little-endian, first for big-endian
        if endian == "little":
            output = output[:3]
        elif endian == "big":
            output = output[1:]
        return output

    result.__doc__ = "Convert a Python {1} to a {0} byte string.".format(
        *get_raw_type_description( *type_id )
    )
    return result
# Register the 24-bit converters. The array variants are built on top of the
# scalar converters registered just before them.
for code in RAW_24:
    type_id = RAW_TYPE_NAME_REVERSE[code]
    FROM_RAW_TYPE[type_id] = _from_raw_24( type_id )
    TO_RAW_TYPE[type_id] = _to_raw_24( type_id )
    FROM_RAW_TYPE_ARRAY[type_id] = _from_generic_array(
        type_id, FROM_RAW_TYPE[type_id]
    )
    TO_RAW_TYPE_ARRAY[type_id] = _to_generic_array( type_id, TO_RAW_TYPE[type_id] )
def unpack( type_id: Union[NumberEncoding, str], value: bytes ) -> Number:
    """Convert a byte string to a single number.

    type_id
        The encoding to use: either a NumberEncoding tuple, or a type name
        that is a key of RAW_TYPE_NAME_REVERSE (e.g. "int16_le").

    value
        The byte string to convert.

    Raises KeyError if the type name or encoding is not registered.
    """
    if isinstance( type_id, str ):
        type_id = RAW_TYPE_NAME_REVERSE[type_id]
    return FROM_RAW_TYPE[type_id]( value )
def pack( type_id: Union[NumberEncoding, str], value: Number ) -> bytes:
    """Convert a single number to a byte string.

    type_id
        The encoding to use: either a NumberEncoding tuple, or a type name
        that is a key of RAW_TYPE_NAME_REVERSE (e.g. "int16_le").

    value
        The number to convert.

    Raises KeyError if the type name or encoding is not registered.
    """
    if isinstance( type_id, str ):
        type_id = RAW_TYPE_NAME_REVERSE[type_id]
    return TO_RAW_TYPE[type_id]( value )
def unpack_array( type_id: Union[NumberEncoding, str], values: bytes ) -> list[Number]:
    """Convert a byte string to a list of numbers.

    type_id
        The encoding to use: either a NumberEncoding tuple, or a type name
        that is a key of RAW_TYPE_NAME_REVERSE (e.g. "int16_le").

    values
        The byte string to convert.

    Raises KeyError if the type name or encoding is not registered.
    """
    if isinstance( type_id, str ):
        type_id = RAW_TYPE_NAME_REVERSE[type_id]
    return FROM_RAW_TYPE_ARRAY[type_id]( values )
def pack_array( type_id: Union[NumberEncoding, str], values: list[Number] ) -> bytes:
    """Convert a list of numbers to a byte string.

    type_id
        The encoding to use: either a NumberEncoding tuple, or a type name
        that is a key of RAW_TYPE_NAME_REVERSE (e.g. "int16_le").

    values
        The numbers to convert.

    Raises KeyError if the type name or encoding is not registered.
    """
    if isinstance( type_id, str ):
        type_id = RAW_TYPE_NAME_REVERSE[type_id]
    return TO_RAW_TYPE_ARRAY[type_id]( values )