feat:more encodings

JarbasHiveMind · Jan 3, 2025 · d98b3f7 · d98b3f7
1 parent 3767782
commit d98b3f7
Show file tree

Hide file tree

Showing 13 changed files with 757 additions and 104 deletions.
diff --git a/hivemind_bus_client/encodings/__init__.py b/hivemind_bus_client/encodings/__init__.py
@@ -0,0 +1,4 @@
+from hivemind_bus_client.encodings.z85b import Z85B
+from hivemind_bus_client.encodings.z85p import Z85P
+from hivemind_bus_client.encodings.b91 import B91
+from hivemind_bus_client.encodings.b100p import B100P
diff --git a/hivemind_bus_client/encodings/b100p.py b/hivemind_bus_client/encodings/b100p.py
@@ -0,0 +1,92 @@
+from typing import Union
+
+
class B100P:
    """
    Emoji ("base100"-style) byte encoding with an explicit padding header.

    Encoding zero-pads the payload to a multiple of 4 bytes and turns every byte into one 4-byte group
    ``F0 9F xx yy``. The first byte of the encoded output is the raw count of zero bytes that were appended
    (0-3); decoding reads that header byte and strips the same number of bytes from the end of the result.
    """

    @classmethod
    def encode(cls, data: Union[str, bytes], encoding: str = "utf-8") -> bytes:
        """
        Encode data as emoji byte groups, prefixed with a one-byte padding count.

        Args:
            data (Union[str, bytes]): The payload. A string is first converted to bytes using `encoding`;
                any other bytes-like object is copied, never modified.
            encoding (str): The encoding to use if `data` is provided as a string. Default is 'utf-8'.

        Returns:
            bytes: One header byte (number of zero bytes appended, 0-3) followed by 4 output bytes per
            padded input byte.
        """
        if isinstance(data, str):
            data = data.encode(encoding)

        # Work on an immutable copy: `+=` on a caller-owned bytearray would pad the caller's buffer in place.
        payload = bytes(data)
        padding = (4 - len(payload) % 4) % 4  # zero bytes needed to reach a multiple of 4
        payload += b'\x00' * padding

        encoded = bytearray([padding])
        for byte in payload:
            shifted = byte + 55
            encoded.extend((0xF0, 0x9F, shifted // 64 + 143, shifted % 64 + 128))
        return bytes(encoded)

    @classmethod
    def decode(cls, encoded_data: Union[str, bytes], encoding: str = "utf-8") -> bytes:
        """
        Decode the output of `encode` back into the original bytes.

        Args:
            encoded_data (Union[str, bytes]): The encoded sequence, header byte included. A string is first
                converted to bytes using `encoding`.
            encoding (str): The encoding to use if `encoded_data` is provided as a string. Default is 'utf-8'.

        Returns:
            bytes: The decoded bytes with the number of trailing bytes given by the header removed.
            Empty input yields empty output.

        Raises:
            ValueError: If the length is not a multiple of 4 plus the header byte, if the header is larger
                than 3, or if any 4-byte group is not something `encode` can produce.
        """
        if isinstance(encoded_data, str):
            encoded_data = encoded_data.encode(encoding)

        if len(encoded_data) == 0:
            return b""

        # One header byte followed by whole 4-byte groups
        if len(encoded_data) % 4 != 1:
            raise ValueError('Invalid data length, should be divisible by 4 with 1 extra byte for padding indicator.')

        padding = encoded_data[0]  # header: number of zero bytes appended by encode()
        if padding > 3:
            raise ValueError('Padding size must be between 0 and 3.')

        groups = bytes(encoded_data[1:])
        out = bytearray()
        for start in range(0, len(groups), 4):
            lead0, lead1, high, low = groups[start:start + 4]
            # Inverse of encode(): byte + 55 == (high - 143) * 64 + (low - 128)
            value = (high - 143) * 64 + (low - 128) - 55
            if lead0 != 0xF0 or lead1 != 0x9F or not 128 <= low <= 191 or not 0 <= value <= 255:
                raise ValueError(f'Invalid emoji encoding in 4-byte group starting at offset {start + 1}.')
            out.append(value)

        decoded = bytes(out)
        return decoded[:-padding] if padding else decoded
diff --git a/hivemind_bus_client/encodings/b91.py b/hivemind_bus_client/encodings/b91.py
@@ -0,0 +1,100 @@
+from typing import Union
+
+
class B91:
    """
    Base91 codec: packs bytes into 13- or 14-bit groups, each written as two characters of `ALPHABET`.
    """

    # The 91 output symbols; a symbol's position in this list is its numeric value.
    ALPHABET = [
        'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
        'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
        'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
        'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '!', '#', '$',
        '%', '&', '(', ')', '*', '+', ',', '.', '/', ':', ';', '<', '=',
        '>', '?', '@', '[', ']', '^', '_', '`', '{', '|', '}', '~', '"'
    ]

    # Reverse lookup: symbol -> numeric value
    DECODE_TABLE = {char: idx for idx, char in enumerate(ALPHABET)}

    @classmethod
    def decode(cls, encoded_data: Union[str, bytes], encoding: str = "utf-8") -> bytes:
        """
        Decode Base91 text into the original bytes.

        Args:
            encoded_data (Union[str, bytes]): Base91-encoded input. `bytes`/`bytearray` input is first
                converted to text using `encoding`.
            encoding (str): The text encoding used to convert bytes input to `str`. Default is 'utf-8'.

        Returns:
            bytes: The decoded binary data.

        Raises:
            ValueError: If the input contains a character that is not in `ALPHABET`.
        """
        # bytearray must be converted too: iterating it yields ints, which are never in DECODE_TABLE.
        if isinstance(encoded_data, (bytes, bytearray)):
            encoded_data = encoded_data.decode(encoding)

        table = cls.DECODE_TABLE
        pending = -1   # value of the first symbol of the current pair, or -1 when no pair is open
        acc = 0        # bit accumulator, least-significant bits are emitted first
        nbits = 0      # number of valid bits currently held in `acc`
        out = bytearray()

        for char in encoded_data:
            digit = table.get(char)
            if digit is None:
                raise ValueError(f"Invalid Base91 character: {char}")
            if pending < 0:
                pending = digit
                continue

            # Second symbol of the pair: rebuild the 13/14-bit group and flush whole bytes.
            pending += digit * 91
            acc |= pending << nbits
            nbits += 13 if (pending & 8191) > 88 else 14
            while nbits >= 8:
                out.append(acc & 255)
                acc >>= 8
                nbits -= 8
            pending = -1

        # A single trailing symbol contributes one last byte.
        if pending >= 0:
            out.append((acc | pending << nbits) & 255)

        return bytes(out)

    @classmethod
    def encode(cls, data: Union[bytes, str], encoding: str = "utf-8") -> bytes:
        """
        Encode binary data as Base91.

        Args:
            data (Union[bytes, str]): Input data to encode. A string is first converted to bytes
                using `encoding`.
            encoding (str): The text encoding used for string input and for the returned bytes.
                Default is 'utf-8'.

        Returns:
            bytes: The Base91 text, encoded with `encoding`.
        """
        if isinstance(data, str):
            data = data.encode(encoding)

        alphabet = cls.ALPHABET
        acc = 0      # bit accumulator, new bytes are added above the bits already held
        nbits = 0    # number of valid bits currently held in `acc`
        symbols = []

        for byte in data:
            acc |= byte << nbits
            nbits += 8
            if nbits > 13:
                group = acc & 8191
                if group > 88:
                    # 13 bits are enough for this group
                    acc >>= 13
                    nbits -= 13
                else:
                    # small 13-bit values take a 14th bit
                    group = acc & 16383
                    acc >>= 14
                    nbits -= 14
                symbols.append(alphabet[group % 91])
                symbols.append(alphabet[group // 91])

        # Flush the remaining bits as one or two symbols.
        if nbits:
            symbols.append(alphabet[acc % 91])
            if nbits > 7 or acc > 90:
                symbols.append(alphabet[acc // 91])

        return ''.join(symbols).encode(encoding)
diff --git a/hivemind_bus_client/encodings/z85b.py b/hivemind_bus_client/encodings/z85b.py
@@ -0,0 +1,108 @@
+"""
+Python implementation of Z85b base-85 encoding.
+
+Z85b is a variation of the ZMQ RFC 32 Z85 base-85 encoding with the following differences:
+1. Little-endian encoding (to facilitate alignment with lower byte indices).
+2. No requirement for a multiple of 4/5 length.
+3. `Z85B.decode()` eliminates whitespace from the input.
+4. `Z85B.decode()` raises a clear exception if invalid characters are encountered.
+
+This file is a derivative work of https://gist.github.com/minrk/6357188?permalink_comment_id=2366506#gistcomment-2366506
+
+Copyright (c) 2013 Brian Granger, Min Ragan-Kelley
+Distributed under the terms of the New BSD License.
+"""
+import re
+import struct
+from typing import Union
+
+from hivemind_bus_client.exceptions import Z85DecodeError
+
+
class Z85B:
    """
    Little-endian base-85 codec that accepts input of any length.

    Input is processed as little-endian 32-bit words, each written as 5 symbols, least-significant
    symbol first. Input that is not a multiple of 4 bytes is zero-padded before encoding and the same
    number of trailing symbols is dropped from the output; decoding reverses this from the length alone.
    """

    # Z85CHARS is the base 85 symbol table
    Z85CHARS = bytearray(b"0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.-:+=^!/*?&<>()[]{}@%$#")

    # Z85MAP maps each symbol (as a byte value) in Z85CHARS to its index in [0, 84]
    Z85MAP = {char: idx for idx, char in enumerate(Z85CHARS)}

    # Powers of 85 for encoding/decoding, least-significant first
    _85s = [85 ** i for i in range(5)]

    # Padding lengths for encoding (indexed by len % 4) and decoding (indexed by len % 5)
    _E_PADDING = [0, 3, 2, 1]
    _D_PADDING = [0, 4, 3, 2, 1]

    @classmethod
    def encode(cls, data: Union[str, bytes], encoding: str = "utf-8") -> bytes:
        """
        Encode raw bytes into Z85b format.

        Args:
            data (Union[str, bytes]): Input data to encode. A string is first converted to bytes
                using `encoding`.
            encoding (str): The encoding to use if `data` is provided as a string. Default is 'utf-8'.

        Returns:
            bytes: Z85b-encoded bytes.
        """
        if isinstance(data, str):
            data = data.encode(encoding)
        buffer = bytearray(data)  # private copy, safe to pad in place
        padding = cls._E_PADDING[len(buffer) % 4]
        buffer += b'\x00' * padding

        # Read the padded input as little-endian 32-bit integers
        words = struct.unpack(f'<{len(buffer) // 4}I', buffer)

        symbols = cls.Z85CHARS
        encoded = bytearray()
        for word in words:
            for weight in cls._85s:
                encoded.append(symbols[(word // weight) % 85])

        # Each padding byte added above corresponds to one trailing symbol that can be dropped
        if padding:
            encoded = encoded[:-padding]
        return bytes(encoded)

    @classmethod
    def decode(cls, encoded_data: Union[str, bytes], encoding: str = "utf-8") -> bytes:
        """
        Decode Z85b-encoded bytes into raw bytes.

        Args:
            encoded_data (Union[str, bytes]): Z85b-encoded data; whitespace is ignored. A string is
                first converted to bytes using `encoding`, any other bytes-like object is copied.
            encoding (str): The encoding to use if `encoded_data` is provided as a string. Default is 'utf-8'.

        Returns:
            bytes: Decoded raw bytes.

        Raises:
            Z85DecodeError: If a symbol outside the Z85 alphabet is encountered, or if a 5-symbol group
                encodes a value that does not fit in 32 bits.
        """
        if isinstance(encoded_data, str):
            raw = encoded_data.encode(encoding)
        else:
            # bytes() also accepts bytearray/memoryview, which have no .encode()
            raw = bytes(encoded_data)

        # Normalize input by removing whitespace
        raw = re.sub(rb'\s+', b'', raw)
        padding = cls._D_PADDING[len(raw) % 5]

        words = []
        for start in range(0, len(raw), 5):
            word = 0
            # The last group may be shorter than 5 symbols; zip stops at the end of the slice
            for symbol, weight in zip(raw[start:start + 5], cls._85s):
                try:
                    word += cls.Z85MAP[symbol] * weight
                except KeyError:
                    raise Z85DecodeError(f"Invalid byte code: {symbol!r}") from None
            if word > 0xFFFFFFFF:
                # Five symbols can express values up to 85**5 - 1, which struct cannot pack as 'I'
                raise Z85DecodeError(f"Invalid group at offset {start}: value does not fit in 32 bits")
            words.append(word)

        # Write the values back out as little-endian 32-bit integers
        decoded = struct.pack(f'<{len(words)}I', *words)

        # Drop the bytes that only exist because the final group was short
        if padding:
            decoded = decoded[:-padding]
        return decoded
diff --git a/hivemind_bus_client/encodings/z85p.py b/hivemind_bus_client/encodings/z85p.py
@@ -0,0 +1,88 @@
+from typing import Union
+import struct
+
class Z85P:
    """
    Z85 (base-85) codec with an explicit padding header.

    Input is zero-padded to a multiple of 4 bytes and each big-endian 32-bit word is written as 5 symbols
    from `Z85CHARS`, most-significant symbol first. The first byte of the encoded output is the raw number
    of zero bytes that were appended (0-3); decoding reads that header and strips the same number of bytes
    from the end of the result.
    """
    Z85CHARS = b"0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.-:+=^!/*?&<>()[]{}@%$#"
    # Maps each symbol (as a byte value) to its index in Z85CHARS
    Z85MAP = {c: idx for idx, c in enumerate(Z85CHARS)}

    # Powers of 85, most-significant first
    _85s = [85 ** i for i in range(5)][::-1]

    @classmethod
    def encode(cls, rawbytes: Union[str, bytes], encoding: str = "utf-8") -> bytes:
        """
        Encode raw bytes as Z85 symbols, prefixed with a one-byte padding count.

        Args:
            rawbytes (Union[str, bytes]): The input to encode. A string is first converted to bytes using
                `encoding`; any other bytes-like object is copied, never modified.
            encoding (str): The encoding to use if `rawbytes` is provided as a string. Default is 'utf-8'.

        Returns:
            bytes: One header byte (number of zero bytes appended, 0-3) followed by 5 symbols per
            4 bytes of padded input.
        """
        if isinstance(rawbytes, str):
            rawbytes = rawbytes.encode(encoding)

        # Work on an immutable copy: `+=` on a caller-owned bytearray would pad the caller's buffer in place.
        payload = bytes(rawbytes)
        padding = (4 - len(payload) % 4) % 4  # zero bytes needed to reach a multiple of 4
        payload += b'\x00' * padding

        words = struct.unpack('>%dI' % (len(payload) // 4), payload)

        symbols = cls.Z85CHARS
        encoded = bytearray([padding])  # header byte first
        for word in words:
            for weight in cls._85s:
                encoded.append(symbols[(word // weight) % 85])

        return bytes(encoded)

    @classmethod
    def decode(cls, z85bytes: Union[str, bytes], encoding: str = "utf-8") -> bytes:
        """
        Decode the output of `encode` back into raw bytes.

        Args:
            z85bytes (Union[str, bytes]): The encoded sequence, header byte included. A string is first
                converted to bytes using `encoding`.
            encoding (str): The encoding to use if `z85bytes` is provided as a string. Default is 'utf-8'.

        Returns:
            bytes: The decoded bytes with the number of trailing bytes given by the header removed.
            Empty input yields empty output.

        Raises:
            ValueError: If the length is not a multiple of 5 plus the header byte, if the header is larger
                than 4, if a symbol is outside the Z85 alphabet, or if a 5-symbol group encodes a value
                that does not fit in 32 bits.
        """
        if isinstance(z85bytes, str):
            z85bytes = z85bytes.encode(encoding)

        if len(z85bytes) == 0:
            return b""

        if len(z85bytes) % 5 != 1:
            raise ValueError('Invalid data length, should be divisible by 5 with 1 extra byte for padding indicator.')

        padding = z85bytes[0]  # header: number of zero bytes appended by encode()
        if padding > 4:
            raise ValueError('Padding size must be between 0 and 4.')

        body = bytes(z85bytes[1:])  # symbols only, header removed

        words = []
        for start in range(0, len(body), 5):
            word = 0
            for symbol, weight in zip(body[start:start + 5], cls._85s):
                digit = cls.Z85MAP.get(symbol)
                if digit is None:
                    raise ValueError(f'Invalid Z85 byte code: {symbol!r}')
                word += digit * weight
            if word > 0xFFFFFFFF:
                # Five symbols can express values up to 85**5 - 1, which struct cannot pack as 'I'
                raise ValueError(f'Invalid Z85 group at offset {start + 1}: value does not fit in 32 bits.')
            words.append(word)

        decoded = struct.pack('>%dI' % len(words), *words)
        return decoded[:-padding] if padding else decoded