From 846b5373728e41ed92a3e31110ab862b7cefbac8 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Fri, 6 Sep 2024 14:16:13 -0400 Subject: [PATCH 1/9] revision: create mark_trees_uninteresting_dense() The sparse tree walk algorithm was created in d5d2e93577e (revision: implement sparse algorithm, 2019-01-16) and involves using the mark_trees_uninteresting_sparse() method. This method takes a repository and an oidset of tree IDs, some of which have the UNINTERESTING flag and some of which do not. Create a method that has an equivalent set of preconditions but uses a "dense" walk (recursively visits all reachable trees, as long as they have not previously been marked UNINTERESTING). This is an important difference from mark_tree_uninteresting(), which short-circuits if the given tree has the UNINTERESTING flag. A use of this method will be added in a later change, with a condition set whether the sparse or dense approach should be used. Signed-off-by: Derrick Stolee --- revision.c | 15 +++++++++++++++ revision.h | 1 + 2 files changed, 16 insertions(+) diff --git a/revision.c b/revision.c index 474fa1e767c8d8..32d949b36feb80 100644 --- a/revision.c +++ b/revision.c @@ -220,6 +220,21 @@ static void add_children_by_path(struct repository *r, free_tree_buffer(tree); } +void mark_trees_uninteresting_dense(struct repository *r, + struct oidset *trees) +{ + struct object_id *oid; + struct oidset_iter iter; + + oidset_iter_init(trees, &iter); + while ((oid = oidset_iter_next(&iter))) { + struct tree *tree = lookup_tree(r, oid); + + if (tree->object.flags & UNINTERESTING) + mark_tree_contents_uninteresting(r, tree); + } +} + void mark_trees_uninteresting_sparse(struct repository *r, struct oidset *trees) { diff --git a/revision.h b/revision.h index 71e984c452b8d7..8938b2db112e2a 100644 --- a/revision.h +++ b/revision.h @@ -487,6 +487,7 @@ void put_revision_mark(const struct rev_info *revs, void mark_parents_uninteresting(struct rev_info *revs, struct commit *commit); void mark_tree_uninteresting(struct repository *r, struct tree *tree); +void mark_trees_uninteresting_dense(struct repository *r, struct oidset *trees); void mark_trees_uninteresting_sparse(struct repository *r, struct oidset *trees); void show_object_with_name(FILE *, struct object *, const char *); From 9971d269592cd1a39a06b7065f9fccbb1ccfda6c Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Tue, 24 Sep 2024 13:45:19 -0400 Subject: [PATCH 2/9] pack-objects: extract should_attempt_deltas() This will be helpful in a future change. Signed-off-by: Derrick Stolee --- builtin/pack-objects.c | 53 +++++++++++++++++++++++------------------- 1 file changed, 29 insertions(+), 24 deletions(-) diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index f80152d55d1f3c..58f8a45935822c 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -3167,6 +3167,33 @@ static int add_ref_tag(const char *tag UNUSED, const char *referent UNUSED, cons return 0; } +static int should_attempt_deltas(struct object_entry *entry) +{ + if (DELTA(entry)) + return 0; + + if (!entry->type_valid || + oe_size_less_than(&to_pack, entry, 50)) + return 0; + + if (entry->no_try_delta) + return 0; + + if (!entry->preferred_base) { + if (oe_type(entry) < 0) + die(_("unable to get type of object %s"), + oid_to_hex(&entry->idx.oid)); + } else if (oe_type(entry) < 0) { + /* + * This object is not found, but we + * don't have to include it anyway. + */ + return 0; + } + + return 1; +} + static void prepare_pack(int window, int depth) { struct object_entry **delta_list; @@ -3197,33 +3224,11 @@ static void prepare_pack(int window, int depth) for (i = 0; i < to_pack.nr_objects; i++) { struct object_entry *entry = to_pack.objects + i; - if (DELTA(entry)) - /* This happens if we decided to reuse existing - * delta from a pack. "reuse_delta &&" is implied. - */ - continue; - - if (!entry->type_valid || - oe_size_less_than(&to_pack, entry, 50)) + if (!should_attempt_deltas(entry)) continue; - if (entry->no_try_delta) - continue; - - if (!entry->preferred_base) { + if (!entry->preferred_base) nr_deltas++; - if (oe_type(entry) < 0) - die(_("unable to get type of object %s"), - oid_to_hex(&entry->idx.oid)); - } else { - if (oe_type(entry) < 0) { - /* - * This object is not found, but we - * don't have to include it anyway. - */ - continue; - } - } delta_list[n++] = entry; } From 3d0a07d31a6e8741953c8003e81a5388c4d4e11b Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Thu, 5 Sep 2024 10:04:51 -0400 Subject: [PATCH 3/9] pack-objects: add --path-walk option In order to more easily compute delta bases among objects that appear at the exact same path, add a --path-walk option to 'git pack-objects'. This option will use the path-walk API instead of the object walk given by the revision machinery. Since objects will be provided in batches representing a common path, those objects can be tested for delta bases immediately instead of waiting for a sort of the full object list by name-hash. This has multiple benefits, including avoiding collisions by name-hash. The objects marked as UNINTERESTING are included in these batches, so we are guaranteeing some locality to find good delta bases. After the individual passes are done on a per-path basis, the default name-hash is used to find other opportunistic delta bases that did not match exactly by the full path name. RFC TODO: It is important to note that this option is inherently incompatible with using a bitmap index. This walk probably also does not work with other advanced features, such as delta islands. Getting ahead of myself, this option compares well with --full-name-hash when the packfile is large enough, but also performs at least as well as the default in all cases that I've seen. RFC TODO: this should probably be recording the batch locations to another list so they could be processed in a second phase using threads. RFC TODO: list some examples of how this outperforms previous pack-objects strategies. (This is coming in later commits that include performance test changes.) Signed-off-by: Derrick Stolee --- Documentation/git-pack-objects.txt | 12 +- Documentation/technical/api-path-walk.txt | 3 +- builtin/pack-objects.c | 147 ++++++++++++++++++++-- t/t5300-pack-object.sh | 17 +++ 4 files changed, 168 insertions(+), 11 deletions(-) diff --git a/Documentation/git-pack-objects.txt b/Documentation/git-pack-objects.txt index 93861d9f85b3b1..1a3023e2067f28 100644 --- a/Documentation/git-pack-objects.txt +++ b/Documentation/git-pack-objects.txt @@ -16,7 +16,7 @@ SYNOPSIS [--cruft] [--cruft-expiration=