From 013ffb6402813317f26233405cd5331f78b7fc31 Mon Sep 17 00:00:00 2001
From: Jess <jess@xeraxera.me>
Date: Tue, 14 May 2024 12:56:58 +1200
Subject: [PATCH] Parse pack idx files for resolving truncated oids

Can now parse pack .idx files (somewhat) and use them to resolve
short oids to full length ones. Yay!
---
 Cargo.lock        |   1 +
 Cargo.toml        |   1 +
 src/store/mod.rs  |   3 +-
 src/store/pack.rs | 169 ++++++++++++++++++++++++++++++++++++++++++++++
 src/store/util.rs |  75 ++++++++++++++++++-
 5 files changed, 246 insertions(+), 3 deletions(-)
 create mode 100644 src/store/pack.rs

diff --git a/Cargo.lock b/Cargo.lock
index c91831e..e2fbec9 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -189,6 +189,7 @@ dependencies = [
 name = "gitty"
 version = "0.1.0"
 dependencies = [
+ "byteorder",
  "clap",
  "compress",
  "hex",
diff --git a/Cargo.toml b/Cargo.toml
index 1cab1dd..6dae1b2 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -6,6 +6,7 @@ edition = "2021"
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
 [dependencies]
+byteorder = "1.5.0"
 clap = { version = "4.4.11", features = ["derive"] }
 compress = "0.2.1"
 hex = "0.4.3"
diff --git a/src/store/mod.rs b/src/store/mod.rs
index e5edd77..78cb595 100644
--- a/src/store/mod.rs
+++ b/src/store/mod.rs
@@ -1,4 +1,5 @@
 mod loose;
+mod pack;
 pub mod util;
 
 use std::fmt::Display;
@@ -16,7 +17,7 @@ pub enum StoreBackend {
     Loose
 }
 
-#[derive(Debug, PartialEq, Copy, Clone)]
+#[derive(Eq, PartialEq, Hash, Copy, Clone)]
 pub struct ObjectId([u8; SHA1_HASH_SIZE]);
 
 #[derive(Debug)]
diff --git a/src/store/pack.rs b/src/store/pack.rs
new file mode 100644
index 0000000..76ba135
--- /dev/null
+++ b/src/store/pack.rs
@@ -0,0 +1,169 @@
+use std::fs::File;
+use std::collections::HashMap;
+use crate::store::ObjectId;
+use std::io::{BufReader, Read};
+use byteorder::{BigEndian, ReadBytesExt};
+
+// A 4-byte magic number \377tOc
+const PACK_IDX_MAGIC: u32 = 0xff744f63;
+
+// Set on a v2 4-byte offset entry whose low 31 bits are an index into the
+// 8-byte offset table rather than an offset into the packfile
+const LARGE_OFFSET_FLAG: u32 = 1 << 31;
+
+/// The parsed contents of a pack `.idx` file.
+pub struct GitPackIdx {
+    /// A map of ObjectIds to object offsets within a packfile
+    pub locations: HashMap<ObjectId, usize>
+}
+
+/// Parses a pack `.idx` file, choosing the format from its first word.
+///
+/// v2 files start with a magic number. v1 files have no header, so their
+/// first word is already entry 0 of the fan-out table.
+///
+/// Returns `None` if the file is truncated, unreadable or of an unsupported
+/// version.
+pub fn parse_pack_idx(idx_file_stream: File) -> Option<GitPackIdx> {
+    let mut idx_reader = BufReader::new(idx_file_stream);
+
+    let first_word = idx_reader.read_u32::<BigEndian>().ok()?;
+
+    match first_word {
+        PACK_IDX_MAGIC => parse_pack_idx_modern(idx_reader),
+        _ => parse_pack_idx_legacy(idx_reader, first_word)
+    }
+}
+
+/// Parses the body of a v1 pack idx.
+///
+/// `idx_reader` must be positioned just after the first fan-out entry, which
+/// the caller has already read (`_fanout_zero`).
+// I haven't found any v1 idx files to test with :(
+// hopefully works first time!
+pub fn parse_pack_idx_legacy(mut idx_reader: BufReader<File>, _fanout_zero: u32) -> Option<GitPackIdx> {
+
+    let mut locations = HashMap::new();
+    let mut oid = [0u8; 20];
+
+    // The header consists of 256 4-byte network byte order integers. N-th entry
+    // of this table records the number of objects in the corresponding pack, the
+    // first byte of whose object name is less than or equal to N. This is called
+    // the first-level fan-out table.
+    // TODO: actually use this for binary searches etc
+    idx_reader.seek_relative(254 * 4).ok()?; // 256 - 2 words for the magic check + final entry
+
+    let oid_entry_count = idx_reader.read_u32::<BigEndian>().ok()?;
+
+    // The header is followed by sorted 24-byte entries, one entry per object in
+    // the pack. Each entry is:
+    for _ in 0..oid_entry_count {
+        // 4-byte network byte order integer, recording where the
+        // object is stored in the packfile as the offset from the
+        // beginning.
+        let offset = idx_reader.read_u32::<BigEndian>().ok()?;
+
+        // one object name of the appropriate size.
+        idx_reader.read_exact(&mut oid).ok()?;
+        let oid: ObjectId = oid.into();
+
+        locations.insert(oid, offset as usize);
+    }
+
+    Some(GitPackIdx {
+        locations
+    })
+}
+
+/// Parses a pack idx that starts with the magic number, dispatching on the
+/// version word that follows it.
+pub fn parse_pack_idx_modern(mut idx_reader: BufReader<File>) -> Option<GitPackIdx> {
+    // A 4-byte version number
+    let version_number = idx_reader.read_u32::<BigEndian>().ok()?;
+
+    match version_number {
+        2 => parse_pack_idx_v2(idx_reader),
+        _ => {
+            eprintln!("Gitty currently supports only pack idx formats of v{{1,2}}");
+            None
+        }
+    }
+}
+
+/// Parses the body of a v2 pack idx.
+///
+/// `idx_reader` must be positioned at the start of the fan-out table, i.e.
+/// just after the magic number and version words.
+pub fn parse_pack_idx_v2(mut idx_reader: BufReader<File>) -> Option<GitPackIdx> {
+    // A 256-entry fan-out table just like v1.
+    // TODO: actually use this for binary searches etc
+    idx_reader.seek_relative(255 * 4).ok()?;
+
+    let oid_entry_count = idx_reader.read_u32::<BigEndian>().ok()?;
+
+    let mut locations = HashMap::new();
+    let mut oids = Vec::new();
+    let mut oid = [0u8; 20];
+
+    // (index in 8-byte table, oid) for each object whose offset lives there
+    let mut offsets_to_patch: Vec<(u32, ObjectId)> = Vec::new();
+
+    // A table of sorted object names. These are packed together without offset
+    // values to reduce the cache footprint of the binary search for a specific
+    // object name.
+    for _ in 0..oid_entry_count {
+        idx_reader.read_exact(&mut oid).ok()?;
+        let oid: ObjectId = oid.into();
+
+        oids.push(oid);
+    }
+
+    // A table of 4-byte CRC32 values of the packed object data. This is new in
+    // v2 so compressed data can be copied directly from pack to pack during
+    // repacking without undetected data corruption.
+    // TODO: implement validating these or something?
+    // Widen before multiplying: the table's byte length no longer fits in a
+    // u32 once the pack holds 2^30 objects.
+    idx_reader.seek_relative(i64::from(oid_entry_count) * 4).ok()?;
+
+    // A table of 4-byte offset values (in network byte order). These are usually
+    // 31-bit pack file offsets, but large offsets are encoded as an index into
+    // the next table with the msbit set.
+    for oid in oids {
+        let offset = idx_reader.read_u32::<BigEndian>().ok()?;
+
+        if (offset & LARGE_OFFSET_FLAG) != 0 {
+            // Mask off the msb to get the index into the 8-byte offset
+            // table, and defer until we parse that table
+            offsets_to_patch.push((offset & !LARGE_OFFSET_FLAG, oid));
+        } else {
+            locations.insert(oid, offset as usize);
+        }
+    }
+
+    // A table of 8-byte offset entries (empty for pack files less than 2 GiB).
+    // Pack files are organized with heavily used objects toward the front, so
+    // most object references should not need to refer to this table.
+    // Visit the low indices first so the table is read in one forward pass
+    offsets_to_patch.sort_unstable_by_key(|&(table_idx, _)| table_idx);
+
+    // How many 8-byte entries have been consumed, and the last one read
+    let mut entries_read: u32 = 0;
+    let mut offset = 0u64;
+
+    for (table_idx, oid) in offsets_to_patch {
+        // Read up to and including our entry. Counting the entry itself as
+        // consumed keeps the cursor in step with the reader, and leaves
+        // `offset` valid should another object point at the same entry.
+        while entries_read <= table_idx {
+            offset = idx_reader.read_u64::<BigEndian>().ok()?;
+            entries_read += 1;
+        }
+
+        locations.insert(oid, usize::try_from(offset).ok()?);
+    }
+
+    Some(GitPackIdx {
+        locations
+    })
+}
diff --git a/src/store/util.rs b/src/store/util.rs
index 1e90499..c514417 100644
--- a/src/store/util.rs
+++ b/src/store/util.rs
@@ -1,12 +1,14 @@
 use crate::MIN_USER_HASH_LEN;
-use std::{fs, fmt};
+use std::{fs::{self, File}, fmt};
 use crate::store::{
     StoreBackend,
-    ObjectId
+    ObjectId,
+    pack::parse_pack_idx
 };
 use crate::SHA1_HASH_SIZE;
 use std::array::TryFromSliceError;
 
+// Resolves an arbitrary length hex encoded string to an oid
 pub fn resolve_id(id_str: &str) -> Option<ObjectId> {
     let id_len = id_str.len();
 
@@ -18,6 +20,7 @@ pub fn resolve_id(id_str: &str) -> Option<ObjectId> {
     let mut candidates = Vec::new();
 
     candidates.append(&mut resolve_id_loose(id_str));
+    candidates.append(&mut resolve_id_packed(id_str));
 
     if candidates.len() == 0 {
         eprintln!("Can't find object");
@@ -72,6 +75,62 @@ fn match_loose_ids(matches: &mut Vec<ObjectId>, target_id: &str) -> Option<()> {
     Some(())
 }
 
+fn resolve_id_packed(id_str: &str) -> Vec<ObjectId> {
+    let mut matches = Vec::new();
+
+    match_pack_idx_ids(&mut matches, id_str);
+
+    matches
+}
+
+// Appends to `matches` every oid, from any pack idx under
+// .git/objects/pack/, whose hex form starts with `target_id`
+fn match_pack_idx_ids(matches: &mut Vec<ObjectId>, target_id: &str) -> Option<()> {
+    // Compare as hex rather than bytes: an odd-length prefix (such as git's
+    // usual 7-character abbreviation) ends halfway through a byte, so it
+    // cannot be hex-decoded
+    let target_id = target_id.to_ascii_lowercase();
+
+    let idx_files = fs::read_dir(".git/objects/pack/").ok()?;
+
+    for entry in idx_files {
+        // Skip whatever can't be read, so that one bad entry doesn't hide
+        // the objects in the remaining packs
+        let entry = match entry {
+            Ok(entry) => entry,
+            Err(_) => continue
+        };
+
+        let is_idxfile = entry
+            .file_name()
+            .to_string_lossy()
+            .to_lowercase()
+            .ends_with(".idx");
+
+        if !is_idxfile {
+            continue;
+        }
+
+        // TODO: fix: we disregard offsets, and therefore do unnecessary work here :(
+        let pack_idx = File::open(entry.path())
+            .ok()
+            .and_then(parse_pack_idx);
+
+        let pack_idx = match pack_idx {
+            Some(pack_idx) => pack_idx,
+            None => continue
+        };
+
+        let matching_oids = pack_idx.locations
+            .into_keys()
+            .filter(|oid| hex::encode(oid.0).starts_with(&target_id));
+
+        matches.extend(matching_oids);
+    }
+
+    Some(())
+}
+
 // TODO: implement this
 pub fn find_backend(_id: ObjectId) -> Option<StoreBackend> {
     Some(StoreBackend::Loose)
@@ -98,8 +157,20 @@ impl TryFrom<&[u8]> for ObjectId {
     }
 }
 
+impl From<[u8; 20]> for ObjectId {
+    fn from(value: [u8; 20]) -> ObjectId {
+        ObjectId(value)
+    }
+}
+
 impl fmt::Display for ObjectId {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         write!(f, "{}", hex::encode(self.0))
     }
 }
+
+impl fmt::Debug for ObjectId {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "{}", self)
+    }
+}