From 013ffb6402813317f26233405cd5331f78b7fc31 Mon Sep 17 00:00:00 2001
From: Jess <jess@xeraxera.me>
Date: Tue, 14 May 2024 12:56:58 +1200
Subject: [PATCH] Parse pack idx files for resolving truncated oids

Can now parse pack .idx files (somewhat) and use them to resolve short
oids to full-length ones. Yay!
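
A rough sketch of the new resolution path (the prefix below is made up
for illustration; resolve_id is the existing entry point):

    // Somewhere inside the crate (e.g. a command handler):
    use crate::store::util::resolve_id;

    // Now consults .git/objects/pack/*.idx as well as loose objects
    if let Some(oid) = resolve_id("f00dca") {
        println!("{}", oid); // prints the full 40-character oid, if unambiguous
    }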
---
 Cargo.lock        |   1 +
 Cargo.toml        |   1 +
 src/store/mod.rs  |   3 +-
 src/store/pack.rs | 154 ++++++++++++++++++++++++++++++++++++++++++++++
 src/store/util.rs |  68 +++++++++++++++++++-
 5 files changed, 224 insertions(+), 3 deletions(-)
 create mode 100644 src/store/pack.rs

diff --git a/Cargo.lock b/Cargo.lock
index c91831e..e2fbec9 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -189,6 +189,7 @@ dependencies = [
 name = "gitty"
 version = "0.1.0"
 dependencies = [
+ "byteorder",
  "clap",
  "compress",
  "hex",
diff --git a/Cargo.toml b/Cargo.toml
index 1cab1dd..6dae1b2 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -6,6 +6,7 @@ edition = "2021"
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
 [dependencies]
+byteorder = "1.5.0"
 clap = { version = "4.4.11", features = ["derive"] }
 compress = "0.2.1"
 hex = "0.4.3"
diff --git a/src/store/mod.rs b/src/store/mod.rs
index e5edd77..78cb595 100644
--- a/src/store/mod.rs
+++ b/src/store/mod.rs
@@ -1,4 +1,5 @@
 mod loose;
+mod pack;
 pub mod util;
 
 use std::fmt::Display;
@@ -16,7 +17,7 @@ pub enum StoreBackend {
     Loose
 }
 
-#[derive(Debug, PartialEq, Copy, Clone)]
+#[derive(Eq, PartialEq, Hash, Copy, Clone)]
 pub struct ObjectId([u8; SHA1_HASH_SIZE]);
 
 #[derive(Debug)]
diff --git a/src/store/pack.rs b/src/store/pack.rs
new file mode 100644
index 0000000..76ba135
--- /dev/null
+++ b/src/store/pack.rs
@@ -0,0 +1,154 @@
+use std::fs::File;
+use std::collections::HashMap;
+use crate::store::ObjectId;
+use std::io::{BufReader, Read};
+use byteorder::{BigEndian, ReadBytesExt};
+
+// A 4-byte magic number \377tOc
+const PACK_IDX_MAGIC: u32 = 0xff744f63;
+
+pub struct GitPackIdx {
+    // A map of ObjectId's to object offsets within a packfile
+    pub locations: HashMap<ObjectId, usize>
+}
+
+pub fn parse_pack_idx(idx_file_stream: File) -> Option<GitPackIdx> {
+    let mut idx_reader = BufReader::new(idx_file_stream);
+
+    let first_word = idx_reader.read_u32::<BigEndian>().ok()?;
+
+    match first_word {
+        PACK_IDX_MAGIC => parse_pack_idx_modern(idx_reader),
+        _ => parse_pack_idx_legacy(idx_reader, first_word)
+    }
+}
+
+// Pack idx v1
+// I haven't found any v1 idx files to test with :(
+// hopefully works first time!
+pub fn parse_pack_idx_legacy(mut idx_reader: BufReader<File>, _fanout_zero: u32) -> Option<GitPackIdx> {
+
+    let mut locations = HashMap::new();
+    let mut oid = [0u8; 20];
+
+    // The header consists of 256 4-byte network byte order integers. N-th entry
+    // of this table records the number of objects in the corresponding pack, the
+    // first byte of whose object name is less than or equal to N. This is called
+    // the first-level fan-out table.
+    // TODO: actually use this for binary searches etc
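+    // e.g. fanout[0x2a] holds the number of objects whose oid starts with a
+    // byte <= 0x2a; fanout[255] is therefore the total object count.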
+    // Skip 254 entries: the first was already consumed by the magic check
+    // above (in v1 it is really fanout[0]), and the last is read next.
+    idx_reader.seek_relative(254 * 4).ok()?;
+
+    let oid_entry_count = idx_reader.read_u32::<BigEndian>().ok()?;
+
+    // The header is followed by sorted 24-byte entries, one entry per object in
+    // the pack. Each entry is:
+    for _ in 0..oid_entry_count {
+        // 4-byte network byte order integer, recording where the
+        // object is stored in the packfile as the offset from the
+        // beginning.
+        let offset = idx_reader.read_u32::<BigEndian>().ok()?;
+
+        // one object name of the appropriate size.
+        idx_reader.read_exact(&mut oid).ok()?;
+        let oid: ObjectId = oid.into();
+
+        locations.insert(oid, offset as usize);
+    }
+
+    Some(GitPackIdx {
+        locations
+    })
+}
+
+pub fn parse_pack_idx_modern(mut idx_reader: BufReader<File>) -> Option<GitPackIdx> {
+    // A 4-byte version number
+    let version_number = idx_reader.read_u32::<BigEndian>().ok()?;
+
+    match version_number {
+        2 => parse_pack_idx_v2(idx_reader),
+        _ => {
+            eprintln!("Gitty currently supports only pack idx formats of v{{1,2}}");
+            None
+        }
+    }
+}
+
+// Pack idx v2
+pub fn parse_pack_idx_v2(mut idx_reader: BufReader<File>) -> Option<GitPackIdx> {
+    // A 256-entry fan-out table just like v1.
+    // TODO: actually use this for binary searches etc
+    idx_reader.seek_relative(255 * 4).ok()?; // skip to the final fan-out entry, the total object count
+
+    let oid_entry_count = idx_reader.read_u32::<BigEndian>().ok()?;
+
+    let mut locations = HashMap::new();
+    let mut oids = Vec::new();
+    let mut oid = [0u8; 20];
+
+    // map from index in 8-byte table -> oid
+    let mut offsets_to_patch = HashMap::new();
+
+    // A table of sorted object names. These are packed together without offset
+    // values to reduce the cache footprint of the binary search for a specific
+    // object name.
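+    // e.g. for 3 objects this table is just 3 * 20 = 60 contiguous bytes of
+    // raw (binary, not hex) oids in ascending order.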
+    for _ in 0..oid_entry_count {
+        idx_reader.read_exact(&mut oid).ok()?;
+        let oid: ObjectId = oid.into();
+
+        oids.push(oid);
+    }
+
+    // A table of 4-byte CRC32 values of the packed object data. This is new in
+    // v2 so compressed data can be copied directly from pack to pack during
+    // repacking without undetected data corruption.
+    // TODO: implement validating these or something?
+    idx_reader.seek_relative(oid_entry_count as i64 * 4).ok()?;
+
+    // A table of 4-byte offset values (in network byte order). These are usually
+    // 31-bit pack file offsets, but large offsets are encoded as an index into
+    // the next table with the msbit set.
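+    // e.g. 0x0000beef is a plain offset into the packfile, while 0x80000002
+    // (msbit set) means "entry 2 of the 8-byte offset table below".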
+    for table_index in 0..oid_entry_count {
+        let offset = idx_reader.read_i32::<BigEndian>().ok()?;
+
+        let table_index = table_index as usize;
+
+        // Equivalent to checking the msb
+        if offset.is_negative() {
+            // The index into the 8-byte offset table (mask off the msb)
+            let offset = (offset & !(1 << 31)) as u32;
+
+            // Defer until we parse the 8-byte table
+            offsets_to_patch.insert(offset, oids[table_index]);
+        } else {
+            locations.insert(oids[table_index], offset as usize);
+        }
+    }
+
+    // A table of 8-byte offset entries (empty for pack files less than 2 GiB).
+    // Pack files are organized with heavily used objects toward the front, so
+    // most object references should not need to refer to this table.
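+    // The 8-byte table is consumed sequentially in ascending index order
+    // rather than seeking per entry; curr_table_idx counts how many 8-byte
+    // entries have been read so far.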
+    if !offsets_to_patch.is_empty() {
+        let mut patchable_indices: Vec<&u32> = offsets_to_patch.keys().collect();
+        let mut curr_table_idx = 0;
+
+        // Visit the low indices first
+        patchable_indices.sort();
+
+        for idx_to_patch in patchable_indices {
+            // Consume entries until we hit our one
+            while *idx_to_patch != curr_table_idx {
+                idx_reader.read_u64::<BigEndian>().ok()?;
+                curr_table_idx += 1;
+            }
+
+            let oid = offsets_to_patch.get(idx_to_patch)?;
+            let offset = idx_reader.read_u64::<BigEndian>().ok()?;
+            curr_table_idx += 1;
+
+            locations.insert(*oid, offset as usize);
+        }
+    }
+
+    Some(GitPackIdx {
+        locations
+    })
+}
diff --git a/src/store/util.rs b/src/store/util.rs
index 1e90499..c514417 100644
--- a/src/store/util.rs
+++ b/src/store/util.rs
@@ -1,12 +1,14 @@
 use crate::MIN_USER_HASH_LEN;
-use std::{fs, fmt};
+use std::{fs::{self, File}, fmt};
 use crate::store::{
     StoreBackend,
-    ObjectId
+    ObjectId,
+    pack::parse_pack_idx
 };
 use crate::SHA1_HASH_SIZE;
 use std::array::TryFromSliceError;
 
+// Resolves an arbitrary-length hex-encoded prefix to a full oid
 pub fn resolve_id(id_str: &str) -> Option<ObjectId> {
     let id_len = id_str.len();
 
@@ -18,6 +20,7 @@ pub fn resolve_id(id_str: &str) -> Option<ObjectId> {
     let mut candidates = Vec::new();
 
     candidates.append(&mut resolve_id_loose(id_str));
+    candidates.append(&mut resolve_id_packed(id_str));
 
     if candidates.len() == 0 {
         eprintln!("Can't find object");
@@ -72,6 +75,55 @@ fn match_loose_ids(matches: &mut Vec<ObjectId>, target_id: &str) -> Option<()> {
     Some(())
 }
 
+fn resolve_id_packed(id_str: &str) -> Vec<ObjectId> {
+    let mut matches = Vec::new();
+
+    match_pack_idx_ids(&mut matches, id_str);
+
+    matches
+}
+
+fn match_pack_idx_ids(matches: &mut Vec<ObjectId>, target_id: &str) -> Option<()> {
+    let idx_files = fs::read_dir(".git/objects/pack/").ok()?;
+
+    // Decode the hex prefix once up front; hex::decode fails on an odd-length
+    // string, so only whole-byte prefixes can match against pack idx entries.
+    let target_id = hex::decode(target_id).ok()?;
+
+    for entry in idx_files {
+        let entry = entry.ok()?;
+
+        let filename = entry
+            .file_name()
+            .into_string()
+            .ok()?;
+
+        let is_idxfile = filename
+            .to_lowercase().ends_with(".idx");
+
+        if !is_idxfile {
+            continue;
+        }
+
+        let filename = format!(".git/objects/pack/{}", filename);
+
+        let file_stream = File::open(filename).ok()?;
+
+        // TODO: fix: we disregard offsets, and therefore do unnecessary work here :(
+        let pack_idx = parse_pack_idx(file_stream)?;
+
+        let objectids: Vec<ObjectId> = pack_idx.locations
+            .into_keys()
+            .collect();
+
+        for oid in objectids {
+            if oid.0.starts_with(&target_id) {
+                matches.push(oid);
+            }
+        }
+    }
+
+    Some(())
+}
+
 // TODO: implement this
 pub fn find_backend(_id: ObjectId) -> Option<StoreBackend> {
     Some(StoreBackend::Loose)
@@ -98,8 +150,20 @@ impl TryFrom<&[u8]> for ObjectId {
     }
 }
 
+impl From<[u8; 20]> for ObjectId {
+    fn from(value: [u8; 20]) -> ObjectId {
+        ObjectId(value)
+    }
+}
+
 impl fmt::Display for ObjectId {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         write!(f, "{}", hex::encode(self.0))
     }
 }
+
+impl fmt::Debug for ObjectId {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "{}", self)
+    }
+}