-
Notifications
You must be signed in to change notification settings - Fork 82
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(handler): add geom handler for uzip, lzma and zstd compression
Geom_uzip is a FreeBSD feature for creating compressed disk images (usually containing UFS). The compression is done in blocks, and the resulting .uzip file can be mounted via the GEOM framework on FreeBSD. The mkuzip header includes a table with block counts and sizes. The header declares the block size (size of decompressed blocks) and total number of blocks. Block size must be a multiple of 512 and defaults to 16384 in mkuzip. It has the following structure: > Magic, which is a shebang & compression identifier stored in 16 bytes. > Format, which is a shell command that provides some general information. > Block size, stored in 4 bytes. > Block count, stored in 4 bytes. > Table of contents (TOC), whose size depends on the file length. The TOC is a list of uint64_t offsets into the file, one for each block. To determine the length of a given block, read the next TOC entry and subtract the current offset from the next offset (this is why there is an extra TOC entry at the end). Each block is compressed independently, using zlib in the original format (LZMA or Zstandard in the newer variants). A standard decompressor for the matching algorithm will decode each of them to a block of size block_size. Unblob parses the TOC to determine the start and end offsets of the compressed file. It detects the compression method (zlib, lzma or zstd) from the magic. Finally, the chunks are decompressed to recover the initial file. Empty chunks are ignored, which is why the file decompressed with unblob can be slightly smaller than the original one. [Sources] https://github.com/mikeryan/unuzip https://www.baeldung.com/linux/filesystem-in-a-file https://docs.python.org/3/library/zlib.html https://github.com/freebsd/freebsd-src/blob/master/sys/geom/uzip/g_uzip.c https://parchive.sourceforge.net/docs/specifications/parity-volume-spec/article-spec.html https://www.mail-archive.com/dev-commits-src-main@freebsd.org/msg34955.html
- Loading branch information
Showing
10 changed files
with
202 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
import io | ||
import lzma | ||
import zlib | ||
from pathlib import Path | ||
from typing import Optional | ||
|
||
import pyzstd | ||
|
||
from unblob.file_utils import ( | ||
Endian, | ||
FileSystem, | ||
StructParser, | ||
iterate_file, | ||
read_until_past, | ||
) | ||
from unblob.models import ( | ||
Extractor, | ||
ExtractResult, | ||
File, | ||
Regex, | ||
StructHandler, | ||
ValidChunk, | ||
) | ||
|
||
# C-style definition of the on-disk header, handed to StructParser by the
# extractor and to StructHandler by the handler; both parse it big-endian.
# The TOC is a list of 64-bit offsets, one per compressed block.
# NOTE(review): the format description accompanying this change says the TOC
# carries one extra trailing entry (block_count + 1 offsets in total) so the
# length of the last block can be computed; this definition reads only
# block_count entries -- confirm against mkuzip / g_uzip.c.
C_DEFINITIONS = r"""
typedef struct geom_header{
char magic[16]; /* 16 bytes */
char format[112]; /* 112 bytes */
uint32_t block_size;
uint32_t block_count;
uint64_t toc[block_count]; /* table of content */
} geom_header_t;
"""

# Name of the struct inside C_DEFINITIONS that describes the header.
HEADER_STRUCT = "geom_header_t"

# Full 16-byte values of the `magic` header field: a shebang line followed by
# a version tag and one terminating byte. The extractor compares the parsed
# magic against these to choose the codec (V2.0 -> zlib, L3.0 -> LZMA,
# Z4.0 -> Zstandard).
VERSION_ZLIB = b"#!/bin/sh\x0a#V2.0\x20"
VERSION_LZMA = b"#!/bin/sh\x0a#L3.0\x0a"
VERSION_ZSTD = b"#!/bin/sh\x0a#Z4.0\x20"
|
||
|
||
class GEOMExtractor(Extractor):
    """Decompress a geom_uzip image (zlib, LZMA or Zstandard) block by block."""

    def extract(self, inpath: Path, outdir: Path):
        """Rebuild the uncompressed image described by the header of *inpath*.

        The header is parsed big-endian; consecutive TOC offsets delimit the
        compressed blocks. Every non-empty block is decompressed with a fresh
        decompressor and appended to a single output file inside *outdir*,
        named after *inpath* without its last suffix.

        Zero-length blocks are skipped and nothing is written for them, so
        the output can be smaller than the image that was compressed.

        Args:
            inpath: path of the carved geom_uzip file.
            outdir: directory the decompressed image is written into.

        Returns:
            An ExtractResult carrying the problems recorded by the
            FileSystem wrapper while writing.

        Raises:
            ValueError: if the 16-byte magic is none of the supported ones.
        """
        # Codec constructors keyed by magic. A constructor (not an instance)
        # is stored because each block gets its own decompressor.
        decompressor_factories = (
            (VERSION_ZLIB, zlib.decompressobj),
            (VERSION_LZMA, lzma.LZMADecompressor),
            (VERSION_ZSTD, pyzstd.ZstdDecompressor),
        )

        # The context manager guarantees the input file is released even when
        # parsing or decompression fails.
        with File.from_path(inpath) as infile:
            parser = StructParser(C_DEFINITIONS)
            header = parser.parse(HEADER_STRUCT, infile, Endian.BIG)

            # The codec is the same for the whole file, so resolve it once and
            # fail with a clear message instead of hitting an unbound
            # variable in the middle of the extraction.
            new_decompressor = next(
                (
                    factory
                    for magic, factory in decompressor_factories
                    if header.magic == magic
                ),
                None,
            )
            if new_decompressor is None:
                raise ValueError(
                    f"unsupported geom_uzip compression magic: {header.magic!r}"
                )

            fs = FileSystem(outdir)
            outpath = Path(inpath.stem)
            with fs.open(outpath, "wb+") as outfile:
                # Pair each TOC offset with its successor; their difference
                # is the compressed length of the block.
                for current_offset, next_offset in zip(
                    header.toc[:-1], header.toc[1:]
                ):
                    compressed_len = next_offset - current_offset
                    if compressed_len == 0:
                        continue
                    decompressor = new_decompressor()
                    for chunk in iterate_file(infile, current_offset, compressed_len):
                        outfile.write(decompressor.decompress(chunk))
            return ExtractResult(reports=fs.problems)
|
||
|
||
class GEOMHandler(StructHandler):
    """Recognise geom_uzip images by their shebang magic and size the chunk."""

    NAME = "geom"
    # One pattern per supported version tag (V2.0, L3.0, Z4.0).
    PATTERNS = [
        Regex(r"^#!/bin/sh\x0A#V2.0"),
        Regex(r"^#!/bin/sh\x0A#L3.0"),
        Regex(r"^#!/bin/sh\x0A#Z4.0"),
    ]

    HEADER_STRUCT = HEADER_STRUCT
    C_DEFINITIONS = C_DEFINITIONS
    EXTRACTOR = GEOMExtractor()

    def calculate_chunk(self, file: File, start_offset: int) -> Optional[ValidChunk]:
        """Return the extent of the geom_uzip image starting at *start_offset*.

        The header is parsed big-endian. When the image has blocks, the chunk
        ends at the last TOC offset (relative to *start_offset*) plus any NUL
        padding that follows it.

        Returns:
            The chunk boundaries, or None when the block size is zero or not
            a multiple of 512, which the format does not allow.
        """
        header = self.parse_header(file, Endian.BIG)

        # The block size must be a non-zero multiple of 512; anything else
        # means the magic matched data that is not a real header.
        if header.block_size == 0 or header.block_size % 512 != 0:
            return None

        if header.block_count > 0:
            # TOC offsets are relative to the beginning of the image.
            file.seek(start_offset + header.toc[-1], io.SEEK_SET)
        # With an empty TOC no seek happens, so the padding scan starts
        # wherever parse_header left the file position (presumably right
        # after the header -- TODO confirm).
        end_offset = read_until_past(file, b"\x00")

        return ValidChunk(
            start_offset=start_offset,
            end_offset=end_offset,
        )
3 changes: 3 additions & 0 deletions
3
tests/integration/compression/geom/ulzma/__input__/myfs.img.ulzma
Git LFS file not shown
3 changes: 3 additions & 0 deletions
3
tests/integration/compression/geom/ulzma/__output__/myfs.img.ulzma_extract/myfs.img
Git LFS file not shown
3 changes: 3 additions & 0 deletions
3
tests/integration/compression/geom/uzip/__input__/myfs.img.uzip
Git LFS file not shown
3 changes: 3 additions & 0 deletions
3
tests/integration/compression/geom/uzip/__output__/myfs.img.uzip_extract/myfs.img
Git LFS file not shown
3 changes: 3 additions & 0 deletions
3
tests/integration/compression/geom/uzst/__input__/myfs.img.uzst
Git LFS file not shown
3 changes: 3 additions & 0 deletions
3
tests/integration/compression/geom/uzst/__output__/myfs.img.uzst_extract/myfs.img
Git LFS file not shown