Skip to content

Commit

Permalink
replace miniz with hand crafted parser
Browse files Browse the repository at this point in the history
  • Loading branch information
Hugoberry committed Apr 12, 2024
1 parent 334104d commit 25b837f
Show file tree
Hide file tree
Showing 19 changed files with 2,358 additions and 6 deletions.
4 changes: 3 additions & 1 deletion .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,8 @@
"*.tcc": "cpp",
"memory_resource": "cpp",
"string_view": "cpp",
"shared_mutex": "cpp"
"shared_mutex": "cpp",
"any": "cpp",
"valarray": "cpp"
}
}
7 changes: 6 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@ set(XPRESS9_SOURCES
third_party/xpress9/Xpress9Misc.c
third_party/xpress9/Xpress9Wrapper.cpp)

set(KAITAI_SOURCES
third_party/kaitai/kaitaistream.cpp)

SET(ABF_SOURCES
src/abf/AbfParser.cpp
src/abf/BackupFile.cpp
Expand All @@ -45,7 +48,8 @@ SET(ABF_SOURCES
src/abf/FileList.cpp
src/abf/Languages.cpp
src/abf/LogBackupFile.cpp
src/abf/VirtualDirectory.cpp)
src/abf/VirtualDirectory.cpp
src/abf/pbix.cpp)

set(EXTENSION_SOURCES
src/pbix_extension.cpp
Expand All @@ -57,6 +61,7 @@ set(EXTENSION_SOURCES
${SQLITE_SOURCES}
${TINYXML2_SOURCES}
${XPRESS9_SOURCES}
${KAITAI_SOURCES}
${ABF_SOURCES})

# build_static_extension(${TARGET_NAME} ${EXTENSION_SOURCES})
Expand Down
91 changes: 91 additions & 0 deletions ksy/zip_central_dir.ksy
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
# Kaitai Struct spec for a PBIX file, parsed as a stream of ZIP "PK" sections.
# Used to walk the ZIP central directory without a full ZIP library (replaces miniz).
meta:
  id: pbix
  title: PBIX archive file
  file-extension: pbix
  endian: le
  bit-endian: le
seq:
  # The whole file is read as back-to-back PK-prefixed sections until EOF.
  - id: sections
    type: pk_section
    repeat: eos
types:
  pk_section:
    seq:
      # Every ZIP record starts with the two-byte magic "PK".
      - id: magic
        contents: 'PK'
      # Two bytes following "PK", read little-endian; e.g. the local file
      # header signature 50 4b 03 04 yields 0x0403 here.
      - id: section_type
        type: u2
      - id: body
        type:
          switch-on: section_type
          cases:
            0x0201: central_dir_entry   # central directory file header (PK\x01\x02)
            0x0403: local_file          # local file header (PK\x03\x04)
            0x0605: end_of_central_dir  # end of central directory (PK\x05\x06)
            0x0807: data_descriptor     # data descriptor (PK\x07\x08)
  data_descriptor:
    seq:
      # crc32 + compressed size + uncompressed size, kept opaque (12 bytes).
      - id: data_descriptor_obs
        size: 12
  local_file:
    seq:
      - id: header
        type: local_file_header
      # Compressed payload; its length comes from the header.
      - id: body
        size: header.len_body_compressed
  local_file_header:
    seq:
      # version/flags/method/time/date/crc32 fields, not needed here (14 bytes).
      - id: header_trimmed
        size: 14
      - id: len_body_compressed
        type: u4
      - id: len_body_uncompressed
        type: u4
      - id: len_file_name
        type: u2
      - id: len_extra
        type: u2
      - id: file_name
        size: len_file_name
      - id: extra
        size: len_extra
  central_dir_entry:
    seq:
      # version made by / version needed / flags / method / time / date (12 bytes).
      - id: header_obs
        size: 12
      - id: crc32
        type: u4
      - id: len_body_compressed
        type: u4
      - id: len_body_uncompressed
        type: u4
      - id: len_file_name
        type: u2
      - id: len_extra
        type: u2
      - id: len_comment
        type: u2
      - id: disk_number_start
        type: u2
      - id: int_file_attr
        type: u2
      - id: ext_file_attr
        type: u4
      # Offset of the entry's local file header from the start of the archive.
      - id: ofs_local_header
        type: s4
      - id: file_name
        type: str
        size: len_file_name
        encoding: UTF-8
      - id: extra
        size: len_extra
      - id: comment
        size: len_comment
  end_of_central_dir:
    seq:
      # disk numbers / entry counts / central dir size and offset (16 bytes).
      - id: header_obs
        size: 16
      - id: len_comment
        type: u2
      - id: comment
        size: len_comment
63 changes: 63 additions & 0 deletions src/abf/AbfParser.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#include "AbfParser.h"


using namespace tinyxml2;

std::vector<uint8_t> AbfParser::read_buffer_bytes(const std::vector<uint8_t> &buffer, uint64_t offset, int size)
Expand Down Expand Up @@ -109,6 +110,28 @@ std::pair<uint64_t, uint64_t> AbfParser::initialize_zip_and_locate_datamodel(con
return {file_stat.m_local_header_ofs, file_stat.m_comp_size};
}

// Locate the compressed "DataModel" entry inside a PBIX (ZIP) archive by
// walking the ZIP central directory with the generated Kaitai parser,
// reading through DuckDB's virtual file system.
//
// @param context  client context used to resolve the file system.
// @param path     path to the .pbix file.
// @return {offset of the entry's local file header, compressed size in bytes}.
// @throws std::runtime_error when the archive has no "DataModel" entry.
std::pair<uint64_t, uint64_t> AbfParser::locate_datamodel(duckdb::ClientContext &context, const std::string &path) {
    // ZIP central directory file header: bytes "PK\x01\x02", i.e. the two
    // bytes after 'PK' read as u2le give 0x0201 (previously the magic 513).
    constexpr uint16_t CENTRAL_DIR_SECTION = 0x0201;

    auto &fs = duckdb::FileSystem::GetFileSystem(context);
    auto file_handle = fs.OpenFile(path, duckdb::FileFlags::FILE_FLAGS_READ);
    duckdb::FileHandleStream file_stream(file_handle.get());

    kaitai::kstream ks(&file_stream);
    pbix_t pbix(&ks);

    for (auto &section : *pbix.sections()) {
        if (section->section_type() != CENTRAL_DIR_SECTION) {
            continue;
        }
        auto central_directory = static_cast<pbix_t::central_dir_entry_t *>(section->body());
        // The DataModel part holds the XPress9-compressed backup we need.
        if (central_directory->file_name() == "DataModel") {
            return {central_directory->ofs_local_header(), central_directory->len_body_compressed()};
        }
    }
    throw std::runtime_error("DataModel not found in the zip file.");
}

void AbfParser::read_compressed_datamodel_header(std::ifstream &entryStream, uint64_t &datamodel_ofs) {
// Read compressed DataModel header to adjust offset
entryStream.seekg(datamodel_ofs+ZIP_LOCAL_FILE_HEADER_FIXED);
Expand Down Expand Up @@ -228,6 +251,46 @@ std::vector<uint8_t> AbfParser::get_sqlite(const std::string &path, const int tr
// Prefix all_decompressed_buffer with initial_decompressed_buffer in case we have only one block
all_decompressed_buffer.insert(all_decompressed_buffer.begin(), initial_decompressed_buffer.begin(), initial_decompressed_buffer.end());

if (skip_offset + all_decompressed_buffer.size() < virtual_directory_offset + virtual_directory_size)
{
throw std::runtime_error("Could not parse the entire DataModel.");
}
// Finally, extract the SQLite buffer from the decompressed data
return extract_sqlite_buffer(all_decompressed_buffer, skip_offset, virtual_directory_offset, virtual_directory_size);
}
std::vector<uint8_t> AbfParser::get_sqlite_v2(duckdb::ClientContext &context, const std::string &path, const int trailing_blocks=15)
{
// Initialize zip and locate DataModel
auto [datamodel_ofs, datamodel_size] = locate_datamodel(context,path);

// Open file stream
std::ifstream entryStream(path, std::ios::binary);
if (!entryStream.is_open()) {
throw std::runtime_error("Could not open pbix file for reading compressed DataModel header.");
}

// Read compressed DataModel header to adjust offset
read_compressed_datamodel_header(entryStream, datamodel_ofs);

XPress9Wrapper xpress9_wrapper;
if (!xpress9_wrapper.Initialize())
{
throw std::runtime_error("Failed to initialize XPress9Wrapper");
}

// Decompress initial block to get the virtual directory info
auto initial_decompressed_buffer = decompress_initial_block(entryStream, datamodel_ofs, xpress9_wrapper);

// Process backup log header to get virtual directory offset and size
auto [virtual_directory_offset, virtual_directory_size] = process_backup_log_header(initial_decompressed_buffer);

uint64_t skip_offset = 0; //optimization for skipping blocks
// Iterate through the remaining blocks and decompress them
auto all_decompressed_buffer = iterate_and_decompress_blocks(entryStream, datamodel_ofs, datamodel_size, xpress9_wrapper, virtual_directory_offset, virtual_directory_size, trailing_blocks, skip_offset);

// Prefix all_decompressed_buffer with initial_decompressed_buffer in case we have only one block
all_decompressed_buffer.insert(all_decompressed_buffer.begin(), initial_decompressed_buffer.begin(), initial_decompressed_buffer.end());

if (skip_offset + all_decompressed_buffer.size() < virtual_directory_offset + virtual_directory_size)
{
throw std::runtime_error("Could not parse the entire DataModel.");
Expand Down
8 changes: 8 additions & 0 deletions src/abf/AbfParser.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,12 @@

#include "Xpress9Wrapper.h"
#include "Crc32.h"
#include "duckdb.hpp"
#include "pbix.h"
#include "FileHandleStream.h"
#include <sstream>
#include "kaitai/kaitaistream.h"


// Constants related to ZIP file parsing
constexpr unsigned char ZIP_LOCAL_FILE_HEADER_FIXED = 26;
Expand All @@ -28,13 +34,15 @@ constexpr unsigned short ABF_BACKUP_LOG_HEADER_SIZE = 0x1000 - ABF_BACKUP_LOG_HE
class AbfParser {
public:
static std::vector<uint8_t> get_sqlite(const std::string &path, const int trailing_chunks);
static std::vector<uint8_t> get_sqlite_v2(duckdb::ClientContext &context,const std::string &path, const int trailing_chunks);
private:
static void patch_header_of_compressed_buffer(std::vector<uint8_t> &compressed_buffer, uint32_t& block_index_iterator);
static std::vector<uint8_t> read_buffer_bytes(const std::vector<uint8_t>& buffer, uint64_t offset, int size);
static std::vector<uint8_t> trim_buffer(const std::vector<uint8_t>& buffer);
static std::tuple<uint64_t,int> process_backup_log_header(const std::vector<uint8_t> &buffer);
static std::vector<uint8_t> extract_sqlite_buffer(const std::vector<uint8_t> &buffer, uint64_t skip_offset, uint64_t virtual_directory_offset, int virtual_directory_size);
static std::pair<uint64_t, uint64_t> initialize_zip_and_locate_datamodel(const std::string &path);
static std::pair<uint64_t, uint64_t> locate_datamodel(duckdb::ClientContext &context, const std::string &path);
static void read_compressed_datamodel_header(std::ifstream &entryStream, uint64_t &datamodel_ofs);
static std::vector<uint8_t> decompress_initial_block(std::ifstream &entryStream, uint64_t datamodel_ofs, XPress9Wrapper &xpress9_wrapper);
static std::vector<uint8_t> iterate_and_decompress_blocks(std::ifstream &entryStream, uint64_t datamodel_ofs, uint64_t datamodel_size, XPress9Wrapper &xpress9_wrapper, uint64_t virtual_directory_offset, int virtual_directory_size, const int trailing_blocks, uint64_t &skip_offset);
Expand Down
54 changes: 54 additions & 0 deletions src/abf/FileHandleStream.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#include "FileHandleStream.h"

#include <iostream>
#include <streambuf>

#include "duckdb/common/file_system.hpp"

class FileHandleStreamBuf : public std::streambuf {
public:
FileHandleStreamBuf(duckdb::FileHandle* handle) : handle(handle) {}

protected:
// Override underflow() to handle reading from the custom file system
int underflow() override {
if (gptr() < egptr()) {
return traits_type::to_int_type(*gptr());
}
char* base = &buffer[0];
setg(base, base, base);
std::size_t n = handle->Read(base, buffer.size());
if (n == 0) return traits_type::eof();
setg(base, base, base + n);
return traits_type::to_int_type(*gptr());
}

// Override overflow() if you need write capability
int overflow(int c = traits_type::eof()) override {
return traits_type::eof(); // Indicate always full buffer (no writing)
}

// Override seekoff to handle seeking
std::streampos seekoff(std::streamoff off, std::ios_base::seekdir dir,
std::ios_base::openmode which = std::ios_base::in | std::ios_base::out) override {
if (dir == std::ios_base::cur) {
off += handle->SeekPosition();
} else if (dir == std::ios_base::end) {
off += handle->GetFileSize();
}
handle->Seek(off);
return handle->SeekPosition();
}

private:
duckdb::FileHandle* handle;
std::vector<char> buffer = std::vector<char>(1024);
};

class FileHandleStream : public std::istream {
public:
FileHandleStream(duckdb::FileHandle* handle)
: std::istream(nullptr), buf(handle) {
rdbuf(&buf);
}

private:
FileHandleStreamBuf buf;
};
38 changes: 38 additions & 0 deletions src/abf/FileHandleStream.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#pragma once

#include <istream>
#include <streambuf>
#include <vector>
#include "duckdb/common/file_system.hpp"

namespace duckdb {

class FileHandleStreamBuf : public std::streambuf {
public:
explicit FileHandleStreamBuf(FileHandle* handle);

protected:
// Override underflow() to handle reading from the custom file system
int underflow() override;

// Override overflow() if you need write capability
int overflow(int c = traits_type::eof()) override;

// Override seekoff to handle seeking
std::streampos seekoff(std::streamoff off, std::ios_base::seekdir dir,
std::ios_base::openmode which = std::ios_base::in | std::ios_base::out) override;

private:
FileHandle* handle;
std::vector<char> buffer;
};

class FileHandleStream : public std::istream {
public:
explicit FileHandleStream(FileHandle* handle);

private:
FileHandleStreamBuf buf;
};

} // namespace duckdb
56 changes: 56 additions & 0 deletions src/abf/ZipUtils.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#include "ZipUtils.h"
#include <iostream>
#include <fstream>
#include <vector>
#include <algorithm>
#include <cstring>

class ZipUtils {
public:
static bool findEndOfCentralDirectory(std::istream& stream, EndOfCentralDirectoryRecord& eocd) {
const uint32_t signatureEOCD = 0x06054b50;
std::vector<char> buffer(4096);
stream.seekg(0, std::ios::end);
std::streampos fileSize = stream.tellg();
int64_t searchOffset = std::min(static_cast<int64_t>(fileSize), static_cast<int64_t>(buffer.size()));

stream.seekg(-searchOffset, std::ios::end);
stream.read(buffer.data(), searchOffset);
auto foundPos = std::search(buffer.rbegin(), buffer.rend(), reinterpret_cast<const char*>(&signatureEOCD), reinterpret_cast<const char*>(&signatureEOCD) + sizeof(signatureEOCD));

if (foundPos != buffer.rend()) {
size_t offset = std::distance(buffer.begin(), foundPos.base()) - sizeof(signatureEOCD);
stream.seekg(-searchOffset + offset, std::ios::end);
stream.read(reinterpret_cast<char*>(&eocd), sizeof(eocd));
return true;
}
return false;
}

static std::pair<uint32_t, uint32_t> findDataModel(std::istream& zipStream) {
EndOfCentralDirectoryRecord eocd;
if (!findEndOfCentralDirectory(zipStream, eocd)) {
throw std::runtime_error("End of central directory not found.");
}

zipStream.seekg(eocd.centralDirectoryOffset, std::ios::beg);
CentralDirectoryFileHeader cdHeader;

for (int i = 0; i < eocd.numEntries; ++i) {
zipStream.read(reinterpret_cast<char*>(&cdHeader), sizeof(cdHeader));
if (cdHeader.signature != 0x02014b50) {
throw std::runtime_error("Invalid central directory file header signature.");
}

std::vector<char> filename(cdHeader.fileNameLength);
zipStream.read(filename.data(), cdHeader.fileNameLength);
zipStream.ignore(cdHeader.extraFieldLength + cdHeader.fileCommentLength);

if (std::string(filename.begin(), filename.end()) == "DataModel") {
return {cdHeader.localHeaderOffset, cdHeader.compressedSize};
}
}

throw std::runtime_error("DataModel not found in the zip file.");
}
};
Loading

0 comments on commit 25b837f

Please sign in to comment.