Skip to content

Commit

Permalink
feat(handler): add geom handler for uzip, lzma and zstd compression
Browse files Browse the repository at this point in the history
Geom_uzip is a FreeBSD feature for creating compressed disk images
(usually containing UFS). The compression is done in blocks, and
the resulting .uzip file can be mounted via the GEOM framework on
FreeBSD.

The mkuzip header includes a table with block counts and sizes.
The header declares the block size (size of decompressed blocks)
and total number of blocks. Block size must be a multiple of 512
and defaults to 16384 in mkuzip.
It has the following structure:
> Magic, which is a shebang & compression identifier stored on 16 bytes.
> Format, which is a shell command that provides some general information.
> Block size, stored on 4 bytes.
> Block count, stored on 4 bytes.
> Table of contents (TOC), whose size depends on the file length.
The TOC is a list of uint64_t offsets into the file for each block.
To determine the length of a given block, read the next TOC entry
and subtract the current offset from the next offset (this is why
there is an extra TOC entry at the end). Each block is compressed
using zlib. A standard zlib decompressor will decode them to a block
of size block_size.

Unblob parses the TOC to determine the start and end offsets of the compressed
file. It detects the compression method (zlib, lzma or zstd). Finally,
the chunks are decompressed to recover the initial file. Empty chunks are
ignored, which is why the file decompressed with unblob can be slightly
smaller than the original one.

[Sources]
https://github.com/mikeryan/unuzip
https://www.baeldung.com/linux/filesystem-in-a-file
https://docs.python.org/3/library/zlib.html
https://github.com/freebsd/freebsd-src/blob/master/sys/geom/uzip/g_uzip.c
https://parchive.sourceforge.net/docs/specifications/parity-volume-spec/article-spec.html
https://www.mail-archive.com/dev-commits-src-main@freebsd.org/msg34955.html
  • Loading branch information
rxpha3l committed Mar 4, 2025
1 parent 90dd7fa commit ea64c85
Show file tree
Hide file tree
Showing 10 changed files with 202 additions and 0 deletions.
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ dependencies = [
"pyfatfs>=1.0.5",
"pyperscan>=0.3.0",
"python-magic>=0.4.27",
"pyzstd",
"rarfile>=4.1",
"rich>=13.3.5",
"structlog>=24.1.0",
Expand Down
2 changes: 2 additions & 0 deletions python/unblob/handlers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from .compression import (
bzip2,
compress,
geom,
gzip,
lz4,
lzh,
Expand Down Expand Up @@ -116,6 +117,7 @@
zlib.ZlibHandler,
engenius.EngeniusHandler,
ecc.AutelECCHandler,
geom.GEOMHandler,
)

BUILTIN_DIR_HANDLERS: DirectoryHandlers = (
Expand Down
89 changes: 89 additions & 0 deletions python/unblob/handlers/compression/geom.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
import io
import lzma
import zlib
from pathlib import Path
from typing import Optional

import pyzstd

from unblob.file_utils import (
Endian,
FileSystem,
StructParser,
iterate_file,
read_until_past,
)
from unblob.models import (
Extractor,
ExtractResult,
File,
Regex,
StructHandler,
ValidChunk,
)

# C layout of the uzip header, handed to StructParser (extractor) and to the
# StructHandler (handler) below: a 16-byte magic, a 112-byte format string,
# two 32-bit integers (block size, block count) and a table of 64-bit block
# offsets (TOC). Both users parse it as big-endian.
# NOTE(review): `toc[block_count]` is sized by the preceding field, while the
# extractor derives block lengths from consecutive TOC entries — confirm
# whether the on-disk TOC carries one extra trailing entry.
C_DEFINITIONS = r"""
typedef struct geom_header{
char magic[16]; /* 16 bytes */
char format[112]; /* 112 bytes */
uint32_t block_size;
uint32_t block_count;
uint64_t toc[block_count]; /* table of content */
} geom_header_t;
"""

# Name of the struct inside C_DEFINITIONS that describes the header.
HEADER_STRUCT = "geom_header_t"

# Full 16-byte magic values. The extractor compares header.magic against
# these to pick the decompressor: zlib, lzma or zstd respectively.
VERSION_ZLIB = b"#!/bin/sh\x0a#V2.0\x20"
VERSION_LZMA = b"#!/bin/sh\x0a#L3.0\x0a"
VERSION_ZSTD = b"#!/bin/sh\x0a#Z4.0\x20"


class GEOMExtractor(Extractor):
    """Decompress a GEOM uzip image, block by block, into a single file."""

    def extract(self, inpath: Path, outdir: Path):
        """Decompress every non-empty block listed in the TOC of *inpath*.

        The header is parsed as big-endian. Consecutive TOC entries give the
        start offset and (by subtraction) the compressed length of each
        block. The decompressed data is written, in TOC order, to a file
        inside *outdir* named after the stem of *inpath*.

        Returns:
            An ExtractResult carrying the problems recorded by the
            FileSystem wrapper.

        Raises:
            ValueError: if the 16-byte header magic is none of the supported
                zlib / lzma / zstd variants.
        """
        infile = File.from_path(inpath)
        parser = StructParser(C_DEFINITIONS)
        header = parser.parse(HEADER_STRUCT, infile, Endian.BIG)

        # The codec is a property of the whole image, so pick it once and
        # fail early with a clear message. Previously an unrecognised magic
        # left `decompressor` unbound and surfaced as an UnboundLocalError
        # in the middle of the loop, after the output file was created.
        if header.magic == VERSION_ZLIB:
            new_decompressor = zlib.decompressobj
        elif header.magic == VERSION_LZMA:
            new_decompressor = lzma.LZMADecompressor
        elif header.magic == VERSION_ZSTD:
            new_decompressor = pyzstd.ZstdDecompressor
        else:
            raise ValueError(f"Unsupported GEOM uzip magic: {header.magic!r}")

        fs = FileSystem(outdir)
        outpath = Path(inpath.stem)
        with fs.open(outpath, "wb+") as outfile:
            for current_offset, next_offset in zip(header.toc[:-1], header.toc[1:]):
                compressed_len = next_offset - current_offset
                # Empty blocks carry no data and are skipped.
                if compressed_len == 0:
                    continue
                # A fresh decompressor per block, as each block is fed to
                # its own decompressor instance.
                decompressor = new_decompressor()
                for chunk in iterate_file(infile, current_offset, compressed_len):
                    outfile.write(decompressor.decompress(chunk))
        return ExtractResult(reports=fs.problems)


class GEOMHandler(StructHandler):
    """Handler for GEOM uzip images compressed with zlib, lzma or zstd."""

    NAME = "geom"
    # The dots of the version numbers are escaped so that they only match a
    # literal "." (an unescaped "." would accept any character there).
    PATTERNS = [
        Regex(r"^#!/bin/sh\x0A#V2\.0"),
        Regex(r"^#!/bin/sh\x0A#L3\.0"),
        Regex(r"^#!/bin/sh\x0A#Z4\.0"),
    ]

    HEADER_STRUCT = HEADER_STRUCT
    C_DEFINITIONS = C_DEFINITIONS
    EXTRACTOR = GEOMExtractor()

    def calculate_chunk(self, file: File, start_offset: int) -> Optional[ValidChunk]:
        """Locate the end of the uzip image that starts at *start_offset*.

        The header is parsed as big-endian. When it declares at least one
        block, the file is positioned at the last TOC offset (relative to
        *start_offset*); the chunk end is then whatever offset
        `read_until_past(file, b"\\x00")` returns from there.

        Returns:
            A ValidChunk spanning *start_offset* to the computed end offset.
        """
        header = self.parse_header(file, Endian.BIG)
        # take the last TOC block offset, end of file is that block offset +
        # null byte padding (if present), starting from the start offset
        if header.block_count > 0:
            file.seek(start_offset + header.toc[-1], io.SEEK_SET)
        # if file doesn't contain compressed blocks, the scan starts from the
        # current position instead
        # NOTE(review): presumably that is right after the parsed header — confirm
        end_offset = read_until_past(file, b"\x00")
        return ValidChunk(
            start_offset=start_offset,
            end_offset=end_offset,
        )
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
92 changes: 92 additions & 0 deletions uv.lock

Large diffs are not rendered by default.

0 comments on commit ea64c85

Please sign in to comment.