Parse pack idx files for resolving truncated oids
Can now parse pack .idx files (somewhat) and use them to resolve short oids to full-length ones. Yay!
ttrssreal committed May 14, 2024
1 parent 0b05e5c commit 013ffb6
Showing 5 changed files with 224 additions and 3 deletions.
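
For orientation, the v2 .idx layout the new parser walks is summarised below as an illustrative Rust struct. This is not part of the diff; the field names are invented here, while the on-disk facts come from git's pack-format documentation. Every integer on disk is big-endian ("network byte order").

// Illustrative only: the on-disk layout of a version-2 pack .idx file.
struct PackIdxV2Layout {
    magic: u32,             // 0xff744f63, the bytes "\377tOc"
    version: u32,           // 2
    fanout: [u32; 256],     // fanout[b] = number of objects whose first oid byte is <= b
    oids: Vec<[u8; 20]>,    // fanout[255] object names, sorted
    crc32s: Vec<u32>,       // CRC32 of each object's packed data
    offsets: Vec<u32>,      // 31-bit pack offsets; msb set = index into big_offsets
    big_offsets: Vec<u64>,  // 8-byte offsets, present only for packs of 2 GiB and larger
    trailer: [u8; 40],      // packfile SHA-1 checksum + idx SHA-1 checksum
}
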
1 change: 1 addition & 0 deletions Cargo.lock

1 change: 1 addition & 0 deletions Cargo.toml
@@ -6,6 +6,7 @@ edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
byteorder = "1.5.0"
clap = { version = "4.4.11", features = ["derive"] }
compress = "0.2.1"
hex = "0.4.3"
3 changes: 2 additions & 1 deletion src/store/mod.rs
@@ -1,4 +1,5 @@
mod loose;
mod pack;
pub mod util;

use std::fmt::Display;
@@ -16,7 +17,7 @@ pub enum StoreBackend {
    Loose
}

#[derive(Debug, PartialEq, Copy, Clone)]
#[derive(Eq, PartialEq, Hash, Copy, Clone)]
pub struct ObjectId([u8; SHA1_HASH_SIZE]);

#[derive(Debug)]
154 changes: 154 additions & 0 deletions src/store/pack.rs
@@ -0,0 +1,154 @@
use std::fs::File;
use std::collections::HashMap;
use crate::store::ObjectId;
use std::io::{BufReader, Read};
use byteorder::{BigEndian, ReadBytesExt};

// A 4-byte magic number \377tOc
const PACK_IDX_MAGIC: u32 = 0xff744f63;

pub struct GitPackIdx {
    // A map of ObjectId's to object offsets within a packfile
    pub locations: HashMap<ObjectId, usize>
}

pub fn parse_pack_idx(idx_file_stream: File) -> Option<GitPackIdx> {
    let mut idx_reader = BufReader::new(idx_file_stream);

    let first_word = idx_reader.read_u32::<BigEndian>().ok()?;

    match first_word {
        PACK_IDX_MAGIC => parse_pack_idx_modern(idx_reader),
        _ => parse_pack_idx_legacy(idx_reader, first_word)
    }
}

// Pack idx v1
// I haven't found any v1 idx files to test with :(
// hopefully works first time!
pub fn parse_pack_idx_legacy(mut idx_reader: BufReader<File>, _fanout_zero: u32) -> Option<GitPackIdx> {

    let mut locations = HashMap::new();
    let mut oid = [0u8; 20];

    // The header consists of 256 4-byte network byte order integers. N-th entry
    // of this table records the number of objects in the corresponding pack, the
    // first byte of whose object name is less than or equal to N. This is called
    // the first-level fan-out table.
    // TODO: actually use this for binary searches etc
    // Skip 254 entries: one was already consumed as first_word by the magic
    // check, and the final entry (the total object count) is read next.
    idx_reader.seek_relative(254 * 4).ok()?;

    let oid_entry_count = idx_reader.read_u32::<BigEndian>().ok()?;

    // The header is followed by sorted 24-byte entries, one entry per object in
    // the pack. Each entry is:
    for _ in 0..oid_entry_count {
        // 4-byte network byte order integer, recording where the
        // object is stored in the packfile as the offset from the
        // beginning.
        let offset = idx_reader.read_u32::<BigEndian>().ok()?;

        // one object name of the appropriate size.
        idx_reader.read_exact(&mut oid).ok()?;
        let oid: ObjectId = oid.into();

        locations.insert(oid, offset as usize);
    }

    Some(GitPackIdx {
        locations
    })
}

pub fn parse_pack_idx_modern(mut idx_reader: BufReader<File>) -> Option<GitPackIdx> {
    // A 4-byte version number
    let version_number = idx_reader.read_u32::<BigEndian>().ok()?;

    match version_number {
        2 => parse_pack_idx_v2(idx_reader),
        _ => {
            eprintln!("Gitty currently supports only pack idx formats of v{{1,2}}");
            None
        }
    }
}

// Pack idx v2
pub fn parse_pack_idx_v2(mut idx_reader: BufReader<File>) -> Option<GitPackIdx> {
    // A 256-entry fan-out table just like v1.
    // TODO: actually use this for binary searches etc
    idx_reader.seek_relative(255 * 4).ok()?;

    let oid_entry_count = idx_reader.read_u32::<BigEndian>().ok()?;

    let mut locations = HashMap::new();
    let mut oids = Vec::new();
    let mut oid = [0u8; 20];

    // map from index in 8-byte table -> oid
    let mut offsets_to_patch = HashMap::new();

    // A table of sorted object names. These are packed together without offset
    // values to reduce the cache footprint of the binary search for a specific
    // object name.
    for _ in 0..oid_entry_count {
        idx_reader.read_exact(&mut oid).ok()?;
        let oid: ObjectId = oid.into();

        oids.push(oid);
    }

    // A table of 4-byte CRC32 values of the packed object data. This is new in
    // v2 so compressed data can be copied directly from pack to pack during
    // repacking without undetected data corruption.
    // TODO: implement validating these or something?
    idx_reader.seek_relative((oid_entry_count * 4) as i64).ok()?;

    // A table of 4-byte offset values (in network byte order). These are usually
    // 31-bit pack file offsets, but large offsets are encoded as an index into
    // the next table with the msbit set.
    for table_index in 0..oid_entry_count {
        let offset = idx_reader.read_i32::<BigEndian>().ok()?;

        let table_index = table_index as usize;

        // Equivalent to checking the msb
        if offset.is_negative() {
            // The index into the 8-byte offset table (mask off the msb)
            let offset = (offset & !(1 << 31)) as u32;

            // Defer until we parse the 8-byte table
            offsets_to_patch.insert(offset, oids[table_index]);
        } else {
            locations.insert(oids[table_index], offset as usize);
        }
    }

    // A table of 8-byte offset entries (empty for pack files less than 2 GiB).
    // Pack files are organized with heavily used objects toward the front, so
    // most object references should not need to refer to this table.
    if !offsets_to_patch.is_empty() {
        let mut patchable_indices: Vec<&u32> = offsets_to_patch.keys().collect();
        let mut curr_table_idx = 0;

        // Visit the low indices first
        patchable_indices.sort();

        for idx_to_patch in patchable_indices {
            // Consume entries until we hit our one
            while *idx_to_patch != curr_table_idx {
                idx_reader.read_u64::<BigEndian>().ok()?;
                curr_table_idx += 1;
            }

            let oid = offsets_to_patch.get(idx_to_patch)?;
            let offset = idx_reader.read_u64::<BigEndian>().ok()?;
            // The read above also consumed this table entry, so account for it
            curr_table_idx += 1;

            locations.insert(*oid, offset as usize);
        }
    }

    Some(GitPackIdx {
        locations
    })
}
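
Both fan-out TODOs above point at the intended lookup path: fanout[b] counts the objects whose first oid byte is <= b, so the names starting with byte b occupy indices fanout[b-1]..fanout[b] of the sorted name table. Below is a minimal sketch of that lookup; it is not part of this commit and assumes the parser kept the raw fanout table and sorted oid list around instead of flattening them into the locations HashMap.

// Sketch only: narrow a lookup with the fan-out table, then binary search
// the slice of sorted 20-byte object names it selects.
fn lookup_in_idx(fanout: &[u32; 256], sorted_oids: &[[u8; 20]], target: &[u8; 20]) -> Option<usize> {
    let b = target[0] as usize;
    let lo = if b == 0 { 0 } else { fanout[b - 1] as usize };
    let hi = fanout[b] as usize;

    // Return the index into the full table on a hit, so it can be paired
    // with the corresponding entry of the offset tables.
    sorted_oids[lo..hi]
        .binary_search(target)
        .ok()
        .map(|i| lo + i)
}
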
68 changes: 66 additions & 2 deletions src/store/util.rs
@@ -1,12 +1,14 @@
use crate::MIN_USER_HASH_LEN;
use std::{fs, fmt};
use std::{fs::{self, File}, fmt};
use crate::store::{
    StoreBackend,
    ObjectId
    ObjectId,
    pack::parse_pack_idx
};
use crate::SHA1_HASH_SIZE;
use std::array::TryFromSliceError;

// Resolves an arbitrary-length hex-encoded string to an oid
pub fn resolve_id(id_str: &str) -> Option<ObjectId> {
let id_len = id_str.len();

@@ -18,6 +20,7 @@ pub fn resolve_id(id_str: &str) -> Option<ObjectId> {
    let mut candidates = Vec::new();

    candidates.append(&mut resolve_id_loose(id_str));
    candidates.append(&mut resolve_id_packed(id_str));

    if candidates.len() == 0 {
        eprintln!("Can't find object");
@@ -72,6 +75,55 @@ fn match_loose_ids(matches: &mut Vec<ObjectId>, target_id: &str) -> Option<()> {
    Some(())
}

fn resolve_id_packed(id_str: &str) -> Vec<ObjectId> {
    let mut matches = Vec::new();

    match_pack_idx_ids(&mut matches, id_str);

    matches
}

fn match_pack_idx_ids(matches: &mut Vec<ObjectId>, target_id: &str) -> Option<()> {
    let idx_files = fs::read_dir(".git/objects/pack/").ok()?;

    for entry in idx_files {
        let entry = entry.ok()?;

        let filename = entry
            .file_name()
            .into_string()
            .ok()?;

        let is_idxfile = filename
            .to_lowercase().ends_with(".idx");

        if !is_idxfile {
            continue;
        }

        let filename = format!(".git/objects/pack/{}", filename);

        let file_stream = File::open(filename).ok()?;

        // TODO: fix: we disregard offsets, and therefore do unnecessary work here :(
        let pack_idx = parse_pack_idx(file_stream)?;

        let objectids: Vec<ObjectId> = pack_idx.locations
            .into_keys()
            .collect();

        let target_id = hex::decode(target_id).ok()?;

        for oid in objectids {
            if oid.0.starts_with(&target_id) {
                matches.push(oid);
            }
        }
    }

    Some(())
}

// TODO: implement this
pub fn find_backend(_id: ObjectId) -> Option<StoreBackend> {
Some(StoreBackend::Loose)
@@ -98,8 +150,20 @@ impl TryFrom<&[u8]> for ObjectId {
    }
}

impl From<[u8; 20]> for ObjectId {
    fn from(value: [u8; 20]) -> ObjectId {
        ObjectId(value)
    }
}

impl fmt::Display for ObjectId {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{}", hex::encode(self.0))
    }
}

impl fmt::Debug for ObjectId {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{}", self)
    }
}
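
Taken together, a caller exercises the new packed path roughly like this. This is a sketch, not code from the repository; it assumes it runs inside the same crate (so resolve_id is reachable as crate::store::util::resolve_id) against a repository whose objects live in a packfile.

// Sketch of resolving an abbreviated oid via the new packed path.
// Note: match_pack_idx_ids hex-decodes the prefix, so the abbreviation
// needs an even number of hex digits to match packed objects.
fn demo() -> Option<()> {
    let short = "0b05e5c8"; // hypothetical abbreviated object id

    let oid = crate::store::util::resolve_id(short)?;
    println!("{} -> {}", short, oid); // Display prints the full 40-char hex oid

    Some(())
}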
