From fb787025aeb452f1d2314fa79abe7d2c997afef0 Mon Sep 17 00:00:00 2001
From: Junio C Hamano <gitster@pobox.com>
Date: Mon, 5 Aug 2024 17:35:36 -0700
Subject: [PATCH 01/69] refs: avoid "too many arguments"

Running "git refs migrate master main" would fail and say "too many
arguments".  By reading that message, you cannot tell if you just
should have given a single ref and made it "git refs migrate
master", or the command refuses to take any arguments.

Instead, report that "git refs migrate" takes no arguments, which is
far easier for the user to understand.

    $ git refs migrate master main
    fatal: 'git refs migrate' takes no arguments

The other side of the coin this change covers is removing doubt in
new users' minds about what we mean by "git refs migrate": whether
it is the "git" command running with two arguments "refs migrate",
the "git refs" command running with one argument "migrate", or the
"git refs migrate" command running with no arguments.

In the same spirit, reword the existing "missing --ref-format=<format>"
message and say

    $ git refs migrate
    fatal: 'git refs migrate' needs '--ref-format=<format>'

Note that we are turning two usage() calls to die() calls.  The
former should signal that the message given is a command line that
shows the usage help of the command.  If we are giving a fatal error
message, we should not hesitate to use die().

Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 builtin/refs.c          |  4 ++--
 t/t1460-refs-migrate.sh | 10 ++--------
 2 files changed, 4 insertions(+), 10 deletions(-)
diff --git a/builtin/refs.c b/builtin/refs.c
index 46dcd150d4e279..a51602f84be58a 100644
--- a/builtin/refs.c
+++ b/builtin/refs.c
@@ -30,9 +30,9 @@ static int cmd_refs_migrate(int argc, const char **argv, const char *prefix)
 
 	argc = parse_options(argc, argv, prefix, options, migrate_usage, 0);
 	if (argc)
-		usage(_("too many arguments"));
+		die(_("'git refs migrate' takes no arguments"));
 	if (!format_str)
-		usage(_("missing --ref-format=<format>"));
+		die(_("'git refs migrate' needs '--ref-format=<format>'"));
 
 	format = ref_storage_format_by_name(format_str);
 	if (format == REF_STORAGE_FORMAT_UNKNOWN) {
diff --git a/t/t1460-refs-migrate.sh b/t/t1460-refs-migrate.sh
index f7c0783d30ccd6..e063a98b11039f 100755
--- a/t/t1460-refs-migrate.sh
+++ b/t/t1460-refs-migrate.sh
@@ -31,20 +31,14 @@ test_expect_success "superfluous arguments" '
 	test_when_finished "rm -rf repo" &&
 	git init repo &&
 	test_must_fail git -C repo refs migrate foo 2>err &&
-	cat >expect <<-EOF &&
-	usage: too many arguments
-	EOF
-	test_cmp expect err
+	test_grep "takes no arguments" err
 '
 
 test_expect_success "missing ref storage format" '
 	test_when_finished "rm -rf repo" &&
 	git init repo &&
 	test_must_fail git -C repo refs migrate 2>err &&
-	cat >expect <<-EOF &&
-	usage: missing --ref-format=<format>
-	EOF
-	test_cmp expect err
+	test_grep "needs ${SQ}--ref-format=<format>${SQ}" err
 '
 
 test_expect_success "unknown ref storage format" '

From becacd210dcfd473c61af3ba2fb481de8fcfb3e1 Mon Sep 17 00:00:00 2001
From: Junio C Hamano <gitster@pobox.com>
Date: Mon, 5 Aug 2024 17:35:37 -0700
Subject: [PATCH 02/69] cat-file: avoid "too many arguments"

Running "git cat-file -e a b c d e f g" would fail and say "too many
arguments".  By reading that message, you cannot tell if the command
could have worked if you limited the list of objects to 5 items
instead of 7, or the command is prepared to take only a single item.

Let's report that "b" is an unexpected argument instead.

Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 builtin/cat-file.c  | 5 ++++-
 t/t1006-cat-file.sh | 8 +++++---
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/builtin/cat-file.c b/builtin/cat-file.c
index 18fe58d6b8b043..8537cc1fa793e2 100644
--- a/builtin/cat-file.c
+++ b/builtin/cat-file.c
@@ -1071,10 +1071,13 @@ int cmd_cat_file(int argc, const char **argv, const char *prefix)
 		else if (!argc && opt_epts)
 			usage_msg_optf(_("<object> required with '-%c'"),
 				       usage, options, opt);
+		else if (!argc)
+			BUG("argc==0 with opt=%c", opt);
 		else if (argc == 1)
 			obj_name = argv[0];
 		else
-			usage_msg_opt(_("too many arguments"), usage, options);
+			usage_msg_optf(_("unexpected argument: '%s'"),
+				       usage, options, argv[1]);
 	} else if (!argc) {
 		usage_with_options(usage, options);
 	} else if (argc != 2) {
diff --git a/t/t1006-cat-file.sh b/t/t1006-cat-file.sh
index ff9bf213aa2c73..eef8fa663eddee 100755
--- a/t/t1006-cat-file.sh
+++ b/t/t1006-cat-file.sh
@@ -66,16 +66,18 @@ do
 	done
 done
 
-test_too_many_arguments () {
+test_unexpected_arg () {
+	unexpected=$1
+	shift
 	test_expect_code 129 "$@" 2>err &&
-	grep -E "^fatal: too many arguments$" err
+	grep -E "^fatal: unexpected argument: '$unexpected'" err
 }
 
 for opt in $short_modes $cw_modes
 do
 	args="one two three"
 	test_expect_success "usage: too many arguments: $opt $args" '
-		test_too_many_arguments git cat-file $opt $args
+		test_unexpected_arg two git cat-file $opt $args
 	'
 
 	for opt2 in --buffer --follow-symlinks

From 2446e8f4bec9d4f645097807f81a4db7d702b5c2 Mon Sep 17 00:00:00 2001
From: Junio C Hamano <gitster@pobox.com>
Date: Mon, 5 Aug 2024 17:35:38 -0700
Subject: [PATCH 03/69] notes: avoid "too many arguments"

Imagine seeing your command failing with "too many arguments" when
you run "git cmd foo bar baz".  Can you tell it will work if you
said "git cmd foo bar"?  Or is that trimming your command line too
much?  Too little?

Instead, if the command reports "unexpected argument: 'bar'", you'd
know that "bar" and everything after it is unwanted.

Let's make it so for "git notes".

Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 builtin/notes.c  | 18 +++++++++---------
 t/t3301-notes.sh |  2 +-
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/builtin/notes.c b/builtin/notes.c
index d9c356e3543fec..d62134565a5aaa 100644
--- a/builtin/notes.c
+++ b/builtin/notes.c
@@ -447,7 +447,7 @@ static int list(int argc, const char **argv, const char *prefix)
 				     git_notes_list_usage, 0);
 
 	if (1 < argc) {
-		error(_("too many arguments"));
+		error(_("unexpected argument: '%s'"), argv[1]);
 		usage_with_options(git_notes_list_usage, options);
 	}
 
@@ -509,7 +509,7 @@ static int add(int argc, const char **argv, const char *prefix)
 			     PARSE_OPT_KEEP_ARGV0);
 
 	if (2 < argc) {
-		error(_("too many arguments"));
+		error(_("unexpected argument: '%s'"), argv[2]);
 		usage_with_options(git_notes_add_usage, options);
 	}
 
@@ -591,7 +591,7 @@ static int copy(int argc, const char **argv, const char *prefix)
 
 	if (from_stdin || rewrite_cmd) {
 		if (argc) {
-			error(_("too many arguments"));
+			error(_("unexpected argument: '%s'"), argv[0]);
 			usage_with_options(git_notes_copy_usage, options);
 		} else {
 			return notes_copy_from_stdin(force, rewrite_cmd);
@@ -603,7 +603,7 @@ static int copy(int argc, const char **argv, const char *prefix)
 		usage_with_options(git_notes_copy_usage, options);
 	}
 	if (2 < argc) {
-		error(_("too many arguments"));
+		error(_("unexpected argument: '%s'"), argv[2]);
 		usage_with_options(git_notes_copy_usage, options);
 	}
 
@@ -686,7 +686,7 @@ static int append_edit(int argc, const char **argv, const char *prefix)
 			     PARSE_OPT_KEEP_ARGV0);
 
 	if (2 < argc) {
-		error(_("too many arguments"));
+		error(_("unexpected argument: '%s'"), argv[2]);
 		usage_with_options(usage, options);
 	}
 
@@ -762,7 +762,7 @@ static int show(int argc, const char **argv, const char *prefix)
 			     0);
 
 	if (1 < argc) {
-		error(_("too many arguments"));
+		error(_("unexpected argument: '%s'"), argv[1]);
 		usage_with_options(git_notes_show_usage, options);
 	}
 
@@ -915,7 +915,7 @@ static int merge(int argc, const char **argv, const char *prefix)
 		error(_("must specify a notes ref to merge"));
 		usage_with_options(git_notes_merge_usage, options);
 	} else if (!do_merge && argc) {
-		error(_("too many arguments"));
+		error(_("unexpected argument: '%s'"), argv[0]);
 		usage_with_options(git_notes_merge_usage, options);
 	}
 
@@ -1069,7 +1069,7 @@ static int prune(int argc, const char **argv, const char *prefix)
 			     0);
 
 	if (argc) {
-		error(_("too many arguments"));
+		error(_("unexpected argument: '%s'"), argv[0]);
 		usage_with_options(git_notes_prune_usage, options);
 	}
 
@@ -1091,7 +1091,7 @@ static int get_ref(int argc, const char **argv, const char *prefix)
 			     git_notes_get_ref_usage, 0);
 
 	if (argc) {
-		error(_("too many arguments"));
+		error(_("unexpected argument: '%s'"), argv[0]);
 		usage_with_options(git_notes_get_ref_usage, options);
 	}
 
diff --git a/t/t3301-notes.sh b/t/t3301-notes.sh
index 536bd11ff4769f..28df9ac63c231b 100755
--- a/t/t3301-notes.sh
+++ b/t/t3301-notes.sh
@@ -1472,7 +1472,7 @@ test_expect_success 'git notes copy diagnoses too many or too few arguments' '
 	test_must_fail git notes copy 2>error &&
 	test_grep "too few arguments" error &&
 	test_must_fail git notes copy one two three 2>error &&
-	test_grep "too many arguments" error
+	test_grep "unexpected argument: ${SQ}three${SQ}" error
 '
 
 test_expect_success 'git notes get-ref expands refs/heads/main to refs/notes/refs/heads/main' '

From 6c8ee88aff15f00751a32bf297e0d7f47f91d75f Mon Sep 17 00:00:00 2001
From: Junio C Hamano <gitster@pobox.com>
Date: Mon, 5 Aug 2024 17:35:39 -0700
Subject: [PATCH 04/69] miscellaneous: avoid "too many arguments"

Imagine seeing your command failing with "too many arguments" when
you run "git cmd foo bar baz".  Can you tell it will work if you
said "git cmd foo bar"?  Or is that trimming your command line too
much?  Too little?

Instead, if the command reports "unexpected argument: 'bar'", you'd
know that "bar" and everything after it is unwanted.

Let's make it so for a few remaining commands.

Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 builtin/prune-packed.c | 4 +---
 builtin/receive-pack.c | 3 ++-
 builtin/tag.c          | 2 +-
 3 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/builtin/prune-packed.c b/builtin/prune-packed.c
index ca3578e1588401..bed59a08a8518c 100644
--- a/builtin/prune-packed.c
+++ b/builtin/prune-packed.c
@@ -23,9 +23,7 @@ int cmd_prune_packed(int argc, const char **argv, const char *prefix)
 			     prune_packed_usage, 0);
 
 	if (argc > 0)
-		usage_msg_opt(_("too many arguments"),
-			      prune_packed_usage,
-			      prune_packed_options);
+		die(_("'git prune-packed' takes no arguments"));
 
 	prune_packed_objects(opts);
 	return 0;
diff --git a/builtin/receive-pack.c b/builtin/receive-pack.c
index 339524ae2a891c..e522ad10a84ce6 100644
--- a/builtin/receive-pack.c
+++ b/builtin/receive-pack.c
@@ -2503,7 +2503,8 @@ int cmd_receive_pack(int argc, const char **argv, const char *prefix)
 	argc = parse_options(argc, argv, prefix, options, receive_pack_usage, 0);
 
 	if (argc > 1)
-		usage_msg_opt(_("too many arguments"), receive_pack_usage, options);
+		usage_msg_optf(_("unexpected argument: '%s'"),
+			       receive_pack_usage, options, argv[1]);
 	if (argc == 0)
 		usage_msg_opt(_("you must specify a directory"), receive_pack_usage, options);
 
diff --git a/builtin/tag.c b/builtin/tag.c
index a1fb218512cc1a..bbc1dffcb83dcf 100644
--- a/builtin/tag.c
+++ b/builtin/tag.c
@@ -641,7 +641,7 @@ int cmd_tag(int argc, const char **argv, const char *prefix)
 
 	object_ref = argc == 2 ? argv[1] : "HEAD";
 	if (argc > 2)
-		die(_("too many arguments"));
+		die(_("unexpected argument: '%s'"), argv[2]);
 
 	if (repo_get_oid(the_repository, object_ref, &object))
 		die(_("Failed to resolve '%s' as a valid ref."), object_ref);

From 1e77bf09953572c7ad2f8e7a474738979786d522 Mon Sep 17 00:00:00 2001
From: Jeff King <peff@peff.net>
Date: Fri, 23 Aug 2024 22:46:21 +0000
Subject: [PATCH 05/69] packfile: move sizep computation

Moving the sizep computation now makes the next commit, which avoids
redundant object info lookups, easier to understand.  There is
no user-visible change here.

[ew: commit message]

Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Eric Wong <e@80x24.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 packfile.c | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/packfile.c b/packfile.c
index 813584646f762a..40287639478df1 100644
--- a/packfile.c
+++ b/packfile.c
@@ -1536,24 +1536,24 @@ int packed_object_info(struct repository *r, struct packed_git *p,
 			type = OBJ_BAD;
 	} else {
 		type = unpack_object_header(p, &w_curs, &curpos, &size);
-	}
 
-	if (!oi->contentp && oi->sizep) {
-		if (type == OBJ_OFS_DELTA || type == OBJ_REF_DELTA) {
-			off_t tmp_pos = curpos;
-			off_t base_offset = get_delta_base(p, &w_curs, &tmp_pos,
-							   type, obj_offset);
-			if (!base_offset) {
-				type = OBJ_BAD;
-				goto out;
+		if (oi->sizep) {
+			if (type == OBJ_OFS_DELTA || type == OBJ_REF_DELTA) {
+				off_t tmp_pos = curpos;
+				off_t base_offset = get_delta_base(p, &w_curs, &tmp_pos,
+								   type, obj_offset);
+				if (!base_offset) {
+					type = OBJ_BAD;
+					goto out;
+				}
+				*oi->sizep = get_size_from_delta(p, &w_curs, tmp_pos);
+				if (*oi->sizep == 0) {
+					type = OBJ_BAD;
+					goto out;
+				}
+			} else {
+				*oi->sizep = size;
 			}
-			*oi->sizep = get_size_from_delta(p, &w_curs, tmp_pos);
-			if (*oi->sizep == 0) {
-				type = OBJ_BAD;
-				goto out;
-			}
-		} else {
-			*oi->sizep = size;
 		}
 	}
 

From 117addcb5e58ce223b3ef6bac4dffd9454309e04 Mon Sep 17 00:00:00 2001
From: Jeff King <peff@peff.net>
Date: Fri, 23 Aug 2024 22:46:22 +0000
Subject: [PATCH 06/69] packfile: allow content-limit for cat-file

Avoid unnecessary round trips to the object store to speed
up cat-file contents retrievals.  The majority of packed objects
don't benefit from the streaming interface at all and we end up
having to load them in core anyways to satisfy our streaming
API.

This drops the runtime of
`git cat-file --batch-all-objects --unordered --batch' on
git.git from ~7.1s to ~6.1s on Jeff's machine.

[ew: commit message]

Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Eric Wong <e@80x24.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 builtin/cat-file.c | 17 +++++++++++++++--
 object-file.c      |  6 ++++++
 object-store-ll.h  |  1 +
 packfile.c         | 13 ++++++++++++-
 4 files changed, 34 insertions(+), 3 deletions(-)

diff --git a/builtin/cat-file.c b/builtin/cat-file.c
index 18fe58d6b8b043..bc4bb8961096b8 100644
--- a/builtin/cat-file.c
+++ b/builtin/cat-file.c
@@ -280,6 +280,7 @@ struct expand_data {
 	off_t disk_size;
 	const char *rest;
 	struct object_id delta_base_oid;
+	void *content;
 
 	/*
 	 * If mark_query is true, we do not expand anything, but rather
@@ -383,7 +384,10 @@ static void print_object_or_die(struct batch_options *opt, struct expand_data *d
 
 	assert(data->info.typep);
 
-	if (data->type == OBJ_BLOB) {
+	if (data->content) {
+		batch_write(opt, data->content, data->size);
+		FREE_AND_NULL(data->content);
+	} else if (data->type == OBJ_BLOB) {
 		if (opt->buffer_output)
 			fflush(stdout);
 		if (opt->transform_mode) {
@@ -801,9 +805,18 @@ static int batch_objects(struct batch_options *opt)
 	/*
 	 * If we are printing out the object, then always fill in the type,
 	 * since we will want to decide whether or not to stream.
+	 *
+	 * Likewise, grab the content in the initial request if it's small
+	 * and we're not planning to filter it.
 	 */
-	if (opt->batch_mode == BATCH_MODE_CONTENTS)
+	if (opt->batch_mode == BATCH_MODE_CONTENTS) {
 		data.info.typep = &data.type;
+		if (!opt->transform_mode) {
+			data.info.sizep = &data.size;
+			data.info.contentp = &data.content;
+			data.info.content_limit = big_file_threshold;
+		}
+	}
 
 	if (opt->all_objects) {
 		struct object_cb_data cb;
diff --git a/object-file.c b/object-file.c
index 065103be3ea923..1cc29c3c584264 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1492,6 +1492,12 @@ static int loose_object_info(struct repository *r,
 
 		if (!oi->contentp)
 			break;
+		if (oi->content_limit && *oi->sizep > oi->content_limit) {
+			git_inflate_end(&stream);
+			oi->contentp = NULL;
+			goto cleanup;
+		}
+
 		*oi->contentp = unpack_loose_rest(&stream, hdr, *oi->sizep, oid);
 		if (*oi->contentp)
 			goto cleanup;
diff --git a/object-store-ll.h b/object-store-ll.h
index c5f2bb2fc2fe6e..b71a15f590683a 100644
--- a/object-store-ll.h
+++ b/object-store-ll.h
@@ -289,6 +289,7 @@ struct object_info {
 	struct object_id *delta_base_oid;
 	struct strbuf *type_name;
 	void **contentp;
+	size_t content_limit;
 
 	/* Response */
 	enum {
diff --git a/packfile.c b/packfile.c
index 40287639478df1..c12a0515b3f8f2 100644
--- a/packfile.c
+++ b/packfile.c
@@ -1529,7 +1529,7 @@ int packed_object_info(struct repository *r, struct packed_git *p,
 	 * We always get the representation type, but only convert it to
 	 * a "real" type later if the caller is interested.
 	 */
-	if (oi->contentp) {
+	if (oi->contentp && !oi->content_limit) {
 		*oi->contentp = cache_or_unpack_entry(r, p, obj_offset, oi->sizep,
 						      &type);
 		if (!*oi->contentp)
@@ -1555,6 +1555,17 @@ int packed_object_info(struct repository *r, struct packed_git *p,
 				*oi->sizep = size;
 			}
 		}
+
+		if (oi->contentp) {
+			if (oi->sizep && *oi->sizep < oi->content_limit) {
+				*oi->contentp = cache_or_unpack_entry(r, p, obj_offset,
+								      oi->sizep, &type);
+				if (!*oi->contentp)
+					type = OBJ_BAD;
+			} else {
+				*oi->contentp = NULL;
+			}
+		}
 	}
 
 	if (oi->disk_sizep) {

From b2df0c0cfa8427396e0645f7cdaf9b99fc859865 Mon Sep 17 00:00:00 2001
From: Eric Wong <e@80x24.org>
Date: Fri, 23 Aug 2024 22:46:23 +0000
Subject: [PATCH 07/69] packfile: fix off-by-one in content_limit comparison

object-file.c::loose_object_info() accepts objects whose size
matches content_limit exactly, so make the packfile handling
consistent with the loose object handling and also slurp objects
whose size matches content_limit exactly.

This change is merely for consistency with the majority of
existing code and there is no user-visible change in nearly all
cases.  The only exception is the corner case where the object
size matches content_limit exactly, in which users will see a
speedup from avoiding an extra lookup.

Signed-off-by: Eric Wong <e@80x24.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 packfile.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/packfile.c b/packfile.c
index c12a0515b3f8f2..8ec86d2d6937d4 100644
--- a/packfile.c
+++ b/packfile.c
@@ -1557,7 +1557,7 @@ int packed_object_info(struct repository *r, struct packed_git *p,
 		}
 
 		if (oi->contentp) {
-			if (oi->sizep && *oi->sizep < oi->content_limit) {
+			if (oi->sizep && *oi->sizep <= oi->content_limit) {
 				*oi->contentp = cache_or_unpack_entry(r, p, obj_offset,
 								      oi->sizep, &type);
 				if (!*oi->contentp)

From a5f683f93da7efabe663b728d403cf8d6bd758e4 Mon Sep 17 00:00:00 2001
From: Eric Wong <e@80x24.org>
Date: Fri, 23 Aug 2024 22:46:24 +0000
Subject: [PATCH 08/69] packfile: inline cache_or_unpack_entry

We need to check delta_base_cache anyways to fill in the
`whence' field in `struct object_info'.  Inlining (and getting
rid of) cache_or_unpack_entry() makes it easier to only do the
hashmap lookup once and avoid a redundant lookup later on.

This code reorganization will also make an optimization to
use the cache entry directly easier to implement in the next
commit.

Signed-off-by: Eric Wong <e@80x24.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 packfile.c | 48 +++++++++++++++++++++---------------------------
 1 file changed, 21 insertions(+), 27 deletions(-)

diff --git a/packfile.c b/packfile.c
index 8ec86d2d6937d4..0a90a5ed6712b8 100644
--- a/packfile.c
+++ b/packfile.c
@@ -1444,23 +1444,6 @@ static void detach_delta_base_cache_entry(struct delta_base_cache_entry *ent)
 	free(ent);
 }
 
-static void *cache_or_unpack_entry(struct repository *r, struct packed_git *p,
-				   off_t base_offset, unsigned long *base_size,
-				   enum object_type *type)
-{
-	struct delta_base_cache_entry *ent;
-
-	ent = get_delta_base_cache_entry(p, base_offset);
-	if (!ent)
-		return unpack_entry(r, p, base_offset, type, base_size);
-
-	if (type)
-		*type = ent->type;
-	if (base_size)
-		*base_size = ent->size;
-	return xmemdupz(ent->data, ent->size);
-}
-
 static inline void release_delta_base_cache(struct delta_base_cache_entry *ent)
 {
 	free(ent->data);
@@ -1521,20 +1504,35 @@ int packed_object_info(struct repository *r, struct packed_git *p,
 		       off_t obj_offset, struct object_info *oi)
 {
 	struct pack_window *w_curs = NULL;
-	unsigned long size;
 	off_t curpos = obj_offset;
 	enum object_type type;
+	struct delta_base_cache_entry *ent;
 
 	/*
 	 * We always get the representation type, but only convert it to
 	 * a "real" type later if the caller is interested.
 	 */
-	if (oi->contentp && !oi->content_limit) {
-		*oi->contentp = cache_or_unpack_entry(r, p, obj_offset, oi->sizep,
-						      &type);
+	oi->whence = OI_PACKED;
+	ent = get_delta_base_cache_entry(p, obj_offset);
+	if (ent) {
+		oi->whence = OI_DBCACHED;
+		type = ent->type;
+		if (oi->sizep)
+			*oi->sizep = ent->size;
+		if (oi->contentp) {
+			if (!oi->content_limit ||
+					ent->size <= oi->content_limit)
+				*oi->contentp = xmemdupz(ent->data, ent->size);
+			else
+				*oi->contentp = NULL; /* caller must stream */
+		}
+	} else if (oi->contentp && !oi->content_limit) {
+		*oi->contentp = unpack_entry(r, p, obj_offset, &type,
+						oi->sizep);
 		if (!*oi->contentp)
 			type = OBJ_BAD;
 	} else {
+		unsigned long size;
 		type = unpack_object_header(p, &w_curs, &curpos, &size);
 
 		if (oi->sizep) {
@@ -1558,8 +1556,8 @@ int packed_object_info(struct repository *r, struct packed_git *p,
 
 		if (oi->contentp) {
 			if (oi->sizep && *oi->sizep <= oi->content_limit) {
-				*oi->contentp = cache_or_unpack_entry(r, p, obj_offset,
-								      oi->sizep, &type);
+				*oi->contentp = unpack_entry(r, p, obj_offset,
+							&type, oi->sizep);
 				if (!*oi->contentp)
 					type = OBJ_BAD;
 			} else {
@@ -1608,10 +1606,6 @@ int packed_object_info(struct repository *r, struct packed_git *p,
 		} else
 			oidclr(oi->delta_base_oid, the_repository->hash_algo);
 	}
-
-	oi->whence = in_delta_base_cache(p, obj_offset) ? OI_DBCACHED :
-							  OI_PACKED;
-
 out:
 	unuse_pack(&w_curs);
 	return type;

From 98521d6f04ca9216be1e91653389fdaaaaee95ed Mon Sep 17 00:00:00 2001
From: Eric Wong <e@80x24.org>
Date: Fri, 23 Aug 2024 22:46:25 +0000
Subject: [PATCH 09/69] cat-file: use delta_base_cache entries directly

For objects already in the delta_base_cache, we can safely use
one entry at-a-time directly to avoid the malloc+memcpy+free
overhead.  For a 1MB delta base object, this eliminates the
speed penalty of duplicating large objects into memory and
speeds up those 1MB delta base cached content retrievals by
roughly 30%.

While only 2-7% of objects are delta bases in repos I've looked
at, this avoids up to 96MB of duplicated memory in the worst
case with the default git config.

The new delta_base_cache_lock is a simple single-threaded
assertion to ensure cat-file (and similar) is the exclusive user
of the delta_base_cache.  In other words, we cannot have diff
or similar commands using two or more entries directly from the
delta base cache.  The new lock has nothing to do with parallel
access via multiple threads at the moment.

Signed-off-by: Eric Wong <e@80x24.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 builtin/cat-file.c | 16 +++++++++++++++-
 object-file.c      |  5 +++++
 object-store-ll.h  |  8 ++++++++
 packfile.c         | 33 ++++++++++++++++++++++++++++++---
 packfile.h         |  4 ++++
 5 files changed, 62 insertions(+), 4 deletions(-)

diff --git a/builtin/cat-file.c b/builtin/cat-file.c
index bc4bb8961096b8..8debcdca3e9884 100644
--- a/builtin/cat-file.c
+++ b/builtin/cat-file.c
@@ -386,7 +386,20 @@ static void print_object_or_die(struct batch_options *opt, struct expand_data *d
 
 	if (data->content) {
 		batch_write(opt, data->content, data->size);
-		FREE_AND_NULL(data->content);
+		switch (data->info.whence) {
+		case OI_CACHED:
+			/*
+			 * only blame uses OI_CACHED atm, so it's unlikely
+			 * we'll ever hit this path
+			 */
+			BUG("TODO OI_CACHED support not done");
+		case OI_LOOSE:
+		case OI_PACKED:
+			FREE_AND_NULL(data->content);
+			break;
+		case OI_DBCACHED:
+			unlock_delta_base_cache();
+		}
 	} else if (data->type == OBJ_BLOB) {
 		if (opt->buffer_output)
 			fflush(stdout);
@@ -815,6 +828,7 @@ static int batch_objects(struct batch_options *opt)
 			data.info.sizep = &data.size;
 			data.info.contentp = &data.content;
 			data.info.content_limit = big_file_threshold;
+			data.info.direct_cache = 1;
 		}
 	}
 
diff --git a/object-file.c b/object-file.c
index 1cc29c3c584264..19100e823daaeb 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1586,6 +1586,11 @@ static int do_oid_object_info_extended(struct repository *r,
 			oidclr(oi->delta_base_oid, the_repository->hash_algo);
 		if (oi->type_name)
 			strbuf_addstr(oi->type_name, type_name(co->type));
+		/*
+		 * Currently `blame' is the only command which creates
+		 * OI_CACHED, and direct_cache is only used by `cat-file'.
+		 */
+		assert(!oi->direct_cache);
 		if (oi->contentp)
 			*oi->contentp = xmemdupz(co->buf, co->size);
 		oi->whence = OI_CACHED;
diff --git a/object-store-ll.h b/object-store-ll.h
index b71a15f590683a..669bb93784e12d 100644
--- a/object-store-ll.h
+++ b/object-store-ll.h
@@ -298,6 +298,14 @@ struct object_info {
 		OI_PACKED,
 		OI_DBCACHED
 	} whence;
+
+	/*
+	 * Set if caller is able to use OI_DBCACHED entries without copying.
+	 * This only applies to OI_DBCACHED entries at the moment,
+	 * not OI_CACHED or any other type of entry.
+	 */
+	unsigned direct_cache:1;
+
 	union {
 		/*
 		 * struct {
diff --git a/packfile.c b/packfile.c
index 0a90a5ed6712b8..40c6c2e3870242 100644
--- a/packfile.c
+++ b/packfile.c
@@ -1362,6 +1362,14 @@ static enum object_type packed_to_object_type(struct repository *r,
 static struct hashmap delta_base_cache;
 static size_t delta_base_cached;
 
+/*
+ * Ensures only a single object is used at-a-time via oi->direct_cache.
+ * Using two objects directly at once (e.g. diff) would cause corruption
+ * since populating the cache may invalidate existing entries.
+ * This lock has nothing to do with parallelism at the moment.
+ */
+static int delta_base_cache_lock;
+
 static LIST_HEAD(delta_base_cache_lru);
 
 struct delta_base_cache_key {
@@ -1444,6 +1452,18 @@ static void detach_delta_base_cache_entry(struct delta_base_cache_entry *ent)
 	free(ent);
 }
 
+static void lock_delta_base_cache(void)
+{
+	delta_base_cache_lock++;
+	assert(delta_base_cache_lock == 1);
+}
+
+void unlock_delta_base_cache(void)
+{
+	delta_base_cache_lock--;
+	assert(delta_base_cache_lock == 0);
+}
+
 static inline void release_delta_base_cache(struct delta_base_cache_entry *ent)
 {
 	free(ent->data);
@@ -1453,6 +1473,7 @@ static inline void release_delta_base_cache(struct delta_base_cache_entry *ent)
 void clear_delta_base_cache(void)
 {
 	struct list_head *lru, *tmp;
+	assert(!delta_base_cache_lock);
 	list_for_each_safe(lru, tmp, &delta_base_cache_lru) {
 		struct delta_base_cache_entry *entry =
 			list_entry(lru, struct delta_base_cache_entry, lru);
@@ -1466,6 +1487,7 @@ static void add_delta_base_cache(struct packed_git *p, off_t base_offset,
 	struct delta_base_cache_entry *ent;
 	struct list_head *lru, *tmp;
 
+	assert(!delta_base_cache_lock);
 	/*
 	 * Check required to avoid redundant entries when more than one thread
 	 * is unpacking the same object, in unpack_entry() (since its phases I
@@ -1520,11 +1542,16 @@ int packed_object_info(struct repository *r, struct packed_git *p,
 		if (oi->sizep)
 			*oi->sizep = ent->size;
 		if (oi->contentp) {
-			if (!oi->content_limit ||
-					ent->size <= oi->content_limit)
+			/* ignore content_limit if avoiding copy from cache */
+			if (oi->direct_cache) {
+				lock_delta_base_cache();
+				*oi->contentp = ent->data;
+			} else if (!oi->content_limit ||
+					ent->size <= oi->content_limit) {
 				*oi->contentp = xmemdupz(ent->data, ent->size);
-			else
+			} else {
 				*oi->contentp = NULL; /* caller must stream */
+			}
 		}
 	} else if (oi->contentp && !oi->content_limit) {
 		*oi->contentp = unpack_entry(r, p, obj_offset, &type,
diff --git a/packfile.h b/packfile.h
index eb18ec15dbf3bc..94941bbe80b620 100644
--- a/packfile.h
+++ b/packfile.h
@@ -210,4 +210,8 @@ int is_promisor_object(const struct object_id *oid);
 int load_idx(const char *path, const unsigned int hashsz, void *idx_map,
 	     size_t idx_size, struct packed_git *p);
 
+/*
+ * release lock acquired via oi->direct_cache
+ */
+void unlock_delta_base_cache(void);
 #endif

From 28402bcedd727adbfdf4c5aebb63703fbb207400 Mon Sep 17 00:00:00 2001
From: Eric Wong <e@80x24.org>
Date: Fri, 23 Aug 2024 22:46:26 +0000
Subject: [PATCH 10/69] packfile: packed_object_info avoids
 packed_to_object_type

For entries in the delta base cache, packed_to_object_type calls
can be omitted.  This prepares us to bypass content_limit for
non-blob types in the following commit.

Signed-off-by: Eric Wong <e@80x24.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 packfile.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/packfile.c b/packfile.c
index 40c6c2e3870242..94d20034e4bac0 100644
--- a/packfile.c
+++ b/packfile.c
@@ -1527,7 +1527,7 @@ int packed_object_info(struct repository *r, struct packed_git *p,
 {
 	struct pack_window *w_curs = NULL;
 	off_t curpos = obj_offset;
-	enum object_type type;
+	enum object_type type, final_type = OBJ_BAD;
 	struct delta_base_cache_entry *ent;
 
 	/*
@@ -1538,7 +1538,7 @@ int packed_object_info(struct repository *r, struct packed_git *p,
 	ent = get_delta_base_cache_entry(p, obj_offset);
 	if (ent) {
 		oi->whence = OI_DBCACHED;
-		type = ent->type;
+		final_type = type = ent->type;
 		if (oi->sizep)
 			*oi->sizep = ent->size;
 		if (oi->contentp) {
@@ -1556,6 +1556,7 @@ int packed_object_info(struct repository *r, struct packed_git *p,
 	} else if (oi->contentp && !oi->content_limit) {
 		*oi->contentp = unpack_entry(r, p, obj_offset, &type,
 						oi->sizep);
+		final_type = type;
 		if (!*oi->contentp)
 			type = OBJ_BAD;
 	} else {
@@ -1585,6 +1586,7 @@ int packed_object_info(struct repository *r, struct packed_git *p,
 			if (oi->sizep && *oi->sizep <= oi->content_limit) {
 				*oi->contentp = unpack_entry(r, p, obj_offset,
 							&type, oi->sizep);
+				final_type = type;
 				if (!*oi->contentp)
 					type = OBJ_BAD;
 			} else {
@@ -1606,17 +1608,17 @@ int packed_object_info(struct repository *r, struct packed_git *p,
 	}
 
 	if (oi->typep || oi->type_name) {
-		enum object_type ptot;
-		ptot = packed_to_object_type(r, p, obj_offset,
-					     type, &w_curs, curpos);
+		if (final_type < 0)
+			final_type = packed_to_object_type(r, p, obj_offset,
+						     type, &w_curs, curpos);
 		if (oi->typep)
-			*oi->typep = ptot;
+			*oi->typep = final_type;
 		if (oi->type_name) {
-			const char *tn = type_name(ptot);
+			const char *tn = type_name(final_type);
 			if (tn)
 				strbuf_addstr(oi->type_name, tn);
 		}
-		if (ptot < 0) {
+		if (final_type < 0) {
 			type = OBJ_BAD;
 			goto out;
 		}

From 7d4e4d5dc88ed70b73ae10d4d3283f661110c99c Mon Sep 17 00:00:00 2001
From: Eric Wong <e@80x24.org>
Date: Fri, 23 Aug 2024 22:46:27 +0000
Subject: [PATCH 11/69] object_info: content_limit only applies to blobs

Streaming is only supported for blobs, so we'd end up having to
slurp all the other object types into memory regardless.  So
slurp all the non-blob types up front when requesting content
since we always handle them in-core, anyways.

Signed-off-by: Eric Wong <e@80x24.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 builtin/cat-file.c  | 21 +++++++++++++++++++--
 object-file.c       |  3 ++-
 packfile.c          |  8 +++++---
 t/t1006-cat-file.sh | 19 ++++++++++++++++---
 4 files changed, 42 insertions(+), 9 deletions(-)

diff --git a/builtin/cat-file.c b/builtin/cat-file.c
index 8debcdca3e9884..2aedd6232474b3 100644
--- a/builtin/cat-file.c
+++ b/builtin/cat-file.c
@@ -385,7 +385,24 @@ static void print_object_or_die(struct batch_options *opt, struct expand_data *d
 	assert(data->info.typep);
 
 	if (data->content) {
-		batch_write(opt, data->content, data->size);
+		void *content = data->content;
+		unsigned long size = data->size;
+
+		data->content = NULL;
+		if (use_mailmap && (data->type == OBJ_COMMIT ||
+					data->type == OBJ_TAG)) {
+			size_t s = size;
+
+			if (data->info.whence == OI_DBCACHED) {
+				content = xmemdupz(content, s);
+				data->info.whence = OI_PACKED;
+			}
+
+			content = replace_idents_using_mailmap(content, &s);
+			size = cast_size_t_to_ulong(s);
+		}
+
+		batch_write(opt, content, size);
 		switch (data->info.whence) {
 		case OI_CACHED:
 			/*
@@ -395,7 +412,7 @@ static void print_object_or_die(struct batch_options *opt, struct expand_data *d
 			BUG("TODO OI_CACHED support not done");
 		case OI_LOOSE:
 		case OI_PACKED:
-			FREE_AND_NULL(data->content);
+			free(content);
 			break;
 		case OI_DBCACHED:
 			unlock_delta_base_cache();
diff --git a/object-file.c b/object-file.c
index 19100e823daaeb..59842cfe1b2a23 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1492,7 +1492,8 @@ static int loose_object_info(struct repository *r,
 
 		if (!oi->contentp)
 			break;
-		if (oi->content_limit && *oi->sizep > oi->content_limit) {
+		if (oi->content_limit && *oi->typep == OBJ_BLOB &&
+				*oi->sizep > oi->content_limit) {
 			git_inflate_end(&stream);
 			oi->contentp = NULL;
 			goto cleanup;
diff --git a/packfile.c b/packfile.c
index 94d20034e4bac0..a592e0b32c4e1e 100644
--- a/packfile.c
+++ b/packfile.c
@@ -1546,7 +1546,7 @@ int packed_object_info(struct repository *r, struct packed_git *p,
 			if (oi->direct_cache) {
 				lock_delta_base_cache();
 				*oi->contentp = ent->data;
-			} else if (!oi->content_limit ||
+			} else if (type != OBJ_BLOB || !oi->content_limit ||
 					ent->size <= oi->content_limit) {
 				*oi->contentp = xmemdupz(ent->data, ent->size);
 			} else {
@@ -1583,10 +1583,12 @@ int packed_object_info(struct repository *r, struct packed_git *p,
 		}
 
 		if (oi->contentp) {
-			if (oi->sizep && *oi->sizep <= oi->content_limit) {
+			final_type = packed_to_object_type(r, p, obj_offset,
+						     type, &w_curs, curpos);
+			if (final_type != OBJ_BLOB || (oi->sizep &&
+					*oi->sizep <= oi->content_limit)) {
 				*oi->contentp = unpack_entry(r, p, obj_offset,
 							&type, oi->sizep);
-				final_type = type;
 				if (!*oi->contentp)
 					type = OBJ_BAD;
 			} else {
diff --git a/t/t1006-cat-file.sh b/t/t1006-cat-file.sh
index ff9bf213aa2c73..841e8567e975d0 100755
--- a/t/t1006-cat-file.sh
+++ b/t/t1006-cat-file.sh
@@ -622,20 +622,33 @@ test_expect_success 'confirm that neither loose blob is a delta' '
 	test_cmp expect actual
 '
 
+test_expect_success 'setup delta base tests' '
+	foo="$(git rev-parse HEAD:foo)" &&
+	foo_plus="$(git rev-parse HEAD:foo-plus)" &&
+	git repack -ad
+'
+
 # To avoid relying too much on the current delta heuristics,
 # we will check only that one of the two objects is a delta
 # against the other, but not the order. We can do so by just
 # asking for the base of both, and checking whether either
 # oid appears in the output.
 test_expect_success '%(deltabase) reports packed delta bases' '
-	git repack -ad &&
 	git cat-file --batch-check="%(deltabase)" <blobs >actual &&
 	{
-		grep "$(git rev-parse HEAD:foo)" actual ||
-		grep "$(git rev-parse HEAD:foo-plus)" actual
+		grep "$foo" actual || grep "$foo_plus" actual
 	}
 '
 
+test_expect_success 'delta base direct cache use succeeds w/o asserting' '
+	commands="info $foo
+info $foo_plus
+contents $foo_plus
+contents $foo" &&
+	echo "$commands" >in &&
+	git cat-file --batch-command <in >out
+'
+
 test_expect_success 'setup bogus data' '
 	bogus_short_type="bogus" &&
 	bogus_short_content="bogus" &&

From 489810f78bf3d435b3b29b40951492b7e225870c Mon Sep 17 00:00:00 2001
From: Eric Wong <e@80x24.org>
Date: Fri, 23 Aug 2024 22:46:28 +0000
Subject: [PATCH 12/69] cat-file: batch-command uses content_limit

As with the normal `--batch' mode, we can use the content_limit
round trip optimization to avoid a redundant lookup.  The only
tricky thing here is we need to enable/disable setting the
object_info.contentp field depending on whether we hit an `info'
or `contents' command.

t1006 is updated to ensure we can switch back and forth between
`info' and `contents' commands without problems.

Signed-off-by: Eric Wong <e@80x24.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 builtin/cat-file.c | 32 ++++++--------------------------
 1 file changed, 6 insertions(+), 26 deletions(-)

diff --git a/builtin/cat-file.c b/builtin/cat-file.c
index 2aedd6232474b3..067cdbdbf9e7bb 100644
--- a/builtin/cat-file.c
+++ b/builtin/cat-file.c
@@ -417,7 +417,8 @@ static void print_object_or_die(struct batch_options *opt, struct expand_data *d
 		case OI_DBCACHED:
 			unlock_delta_base_cache();
 		}
-	} else if (data->type == OBJ_BLOB) {
+	} else {
+		assert(data->type == OBJ_BLOB);
 		if (opt->buffer_output)
 			fflush(stdout);
 		if (opt->transform_mode) {
@@ -452,30 +453,6 @@ static void print_object_or_die(struct batch_options *opt, struct expand_data *d
 			stream_blob(oid);
 		}
 	}
-	else {
-		enum object_type type;
-		unsigned long size;
-		void *contents;
-
-		contents = repo_read_object_file(the_repository, oid, &type,
-						 &size);
-		if (!contents)
-			die("object %s disappeared", oid_to_hex(oid));
-
-		if (use_mailmap) {
-			size_t s = size;
-			contents = replace_idents_using_mailmap(contents, &s);
-			size = cast_size_t_to_ulong(s);
-		}
-
-		if (type != data->type)
-			die("object %s changed type!?", oid_to_hex(oid));
-		if (data->info.sizep && size != data->size && !use_mailmap)
-			die("object %s changed size!?", oid_to_hex(oid));
-
-		batch_write(opt, contents, size);
-		free(contents);
-	}
 }
 
 static void print_default_format(struct strbuf *scratch, struct expand_data *data,
@@ -689,6 +666,7 @@ static void parse_cmd_contents(struct batch_options *opt,
 			     struct expand_data *data)
 {
 	opt->batch_mode = BATCH_MODE_CONTENTS;
+	data->info.contentp = &data->content;
 	batch_one_object(line, output, opt, data);
 }
 
@@ -698,6 +676,7 @@ static void parse_cmd_info(struct batch_options *opt,
 			   struct expand_data *data)
 {
 	opt->batch_mode = BATCH_MODE_INFO;
+	data->info.contentp = NULL;
 	batch_one_object(line, output, opt, data);
 }
 
@@ -839,7 +818,8 @@ static int batch_objects(struct batch_options *opt)
 	 * Likewise, grab the content in the initial request if it's small
 	 * and we're not planning to filter it.
 	 */
-	if (opt->batch_mode == BATCH_MODE_CONTENTS) {
+	if ((opt->batch_mode == BATCH_MODE_CONTENTS) ||
+			(opt->batch_mode == BATCH_MODE_QUEUE_AND_DISPATCH)) {
 		data.info.typep = &data.type;
 		if (!opt->transform_mode) {
 			data.info.sizep = &data.size;

From 1732ddaa0ff3e3109845580b1fbd5e6971a8bb52 Mon Sep 17 00:00:00 2001
From: Eric Wong <e@80x24.org>
Date: Fri, 23 Aug 2024 22:46:29 +0000
Subject: [PATCH 13/69] cat-file: batch_write: use size_t for length

fwrite(3) and write(2), and all of our wrappers for them use
size_t while object size is `unsigned long', so there's no
excuse to use a potentially smaller representation.

Signed-off-by: Eric Wong <e@80x24.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 builtin/cat-file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/builtin/cat-file.c b/builtin/cat-file.c
index 067cdbdbf9e7bb..bf810546623cfd 100644
--- a/builtin/cat-file.c
+++ b/builtin/cat-file.c
@@ -369,7 +369,7 @@ static void expand_format(struct strbuf *sb, const char *start,
 	}
 }
 
-static void batch_write(struct batch_options *opt, const void *data, int len)
+static void batch_write(struct batch_options *opt, const void *data, size_t len)
 {
 	if (opt->buffer_output) {
 		if (fwrite(data, 1, len, stdout) != len)

From f043683713705a2b5b7b7172f5959a98aec0f6e7 Mon Sep 17 00:00:00 2001
From: Eric Wong <e@80x24.org>
Date: Fri, 23 Aug 2024 22:46:30 +0000
Subject: [PATCH 14/69] cat-file: use writev(2) if available

Using writev here is 20-40% faster than three write syscalls in
succession for smaller (1-10k) objects in the delta base cache.
This advantage decreases as object sizes approach pipe size (64k
on Linux).

writev reduces wakeups and syscalls on the read side as well:
each write(2) syscall may trigger one or more corresponding
read(2) syscalls in the reader.  Attempting atomicity in the
writer via writev also reduces the likelihood of non-blocking
readers failing with EAGAIN and having to call poll||select
before attempting to read again.

Unfortunately, this turns into a small (1-3%) slowdown for
gigantic objects of a megabyte or more, even after
increasing pipe size to 1MB via the F_SETPIPE_SZ fcntl(2) op.
This slowdown is acceptable to me since the vast majority of
objects are 64K or less for projects I've looked at.

Relying on stdio buffering and fflush(3) after each response was
considered for users without --buffer, but historically cat-file
defaults to being compatible with non-blocking stdout and able
to poll(2) after hitting EAGAIN on write(2).  Using stdio on
files with the O_NONBLOCK flag is (AFAIK) unspecified and likely
subject to portability problems and thus avoided.

Signed-off-by: Eric Wong <e@80x24.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 Makefile           |  3 +++
 builtin/cat-file.c | 62 ++++++++++++++++++++++++++++++-------------
 config.mak.uname   |  5 ++++
 git-compat-util.h  | 10 +++++++
 wrapper.c          | 18 +++++++++++++
 wrapper.h          |  1 +
 write-or-die.c     | 66 ++++++++++++++++++++++++++++++++++++++++++++++
 write-or-die.h     |  2 ++
 8 files changed, 149 insertions(+), 18 deletions(-)

diff --git a/Makefile b/Makefile
index 3eab701b10179f..c7a062de002bc3 100644
--- a/Makefile
+++ b/Makefile
@@ -1844,6 +1844,9 @@ ifdef NO_PREAD
 	COMPAT_CFLAGS += -DNO_PREAD
 	COMPAT_OBJS += compat/pread.o
 endif
+ifdef HAVE_WRITEV
+	COMPAT_CFLAGS += -DHAVE_WRITEV
+endif
 ifdef NO_FAST_WORKING_DIRECTORY
 	BASIC_CFLAGS += -DNO_FAST_WORKING_DIRECTORY
 endif
diff --git a/builtin/cat-file.c b/builtin/cat-file.c
index bf810546623cfd..016b7d26a72154 100644
--- a/builtin/cat-file.c
+++ b/builtin/cat-file.c
@@ -280,7 +280,7 @@ struct expand_data {
 	off_t disk_size;
 	const char *rest;
 	struct object_id delta_base_oid;
-	void *content;
+	struct git_iovec iov[3];
 
 	/*
 	 * If mark_query is true, we do not expand anything, but rather
@@ -378,17 +378,42 @@ static void batch_write(struct batch_options *opt, const void *data, size_t len)
 		write_or_die(1, data, len);
 }
 
-static void print_object_or_die(struct batch_options *opt, struct expand_data *data)
+static void batch_writev(struct batch_options *opt, struct expand_data *data,
+			const struct strbuf *hdr, size_t size)
+{
+	data->iov[0].iov_base = hdr->buf;
+	data->iov[0].iov_len = hdr->len;
+	data->iov[1].iov_len = size;
+
+	/*
+	 * Copying a (8|16)-byte iovec for a single byte is gross, but my
+	 * attempt to stuff output_delim into the trailing NUL byte of
+	 * iov[1].iov_base (and restoring it after writev(2) for the
+	 * OI_DBCACHED case) to drop iovcnt from 3->2 wasn't faster.
+	 */
+	data->iov[2].iov_base = &opt->output_delim;
+	data->iov[2].iov_len = 1;
+
+	if (opt->buffer_output)
+		fwritev_or_die(stdout, data->iov, 3);
+	else
+		writev_or_die(1, data->iov, 3);
+
+	/* writev_or_die may move iov[1].iov_base, so it's invalid */
+	data->iov[1].iov_base = NULL;
+}
+
+static void print_object_or_die(struct batch_options *opt,
+				struct expand_data *data, struct strbuf *hdr)
 {
 	const struct object_id *oid = &data->oid;
 
 	assert(data->info.typep);
 
-	if (data->content) {
-		void *content = data->content;
+	if (data->iov[1].iov_base) {
+		void *content = data->iov[1].iov_base;
 		unsigned long size = data->size;
 
-		data->content = NULL;
 		if (use_mailmap && (data->type == OBJ_COMMIT ||
 					data->type == OBJ_TAG)) {
 			size_t s = size;
@@ -399,10 +424,10 @@ static void print_object_or_die(struct batch_options *opt, struct expand_data *d
 			}
 
 			content = replace_idents_using_mailmap(content, &s);
+			data->iov[1].iov_base = content;
 			size = cast_size_t_to_ulong(s);
 		}
-
-		batch_write(opt, content, size);
+		batch_writev(opt, data, hdr, size);
 		switch (data->info.whence) {
 		case OI_CACHED:
 			/*
@@ -419,8 +444,6 @@ static void print_object_or_die(struct batch_options *opt, struct expand_data *d
 		}
 	} else {
 		assert(data->type == OBJ_BLOB);
-		if (opt->buffer_output)
-			fflush(stdout);
 		if (opt->transform_mode) {
 			char *contents;
 			unsigned long size;
@@ -447,10 +470,15 @@ static void print_object_or_die(struct batch_options *opt, struct expand_data *d
 					    oid_to_hex(oid), data->rest);
 			} else
 				BUG("invalid transform_mode: %c", opt->transform_mode);
-			batch_write(opt, contents, size);
+			data->iov[1].iov_base = contents;
+			batch_writev(opt, data, hdr, size);
 			free(contents);
 		} else {
+			batch_write(opt, hdr->buf, hdr->len);
+			if (opt->buffer_output)
+				fflush(stdout);
 			stream_blob(oid);
+			batch_write(opt, &opt->output_delim, 1);
 		}
 	}
 }
@@ -519,12 +547,10 @@ static void batch_object_write(const char *obj_name,
 		strbuf_addch(scratch, opt->output_delim);
 	}
 
-	batch_write(opt, scratch->buf, scratch->len);
-
-	if (opt->batch_mode == BATCH_MODE_CONTENTS) {
-		print_object_or_die(opt, data);
-		batch_write(opt, &opt->output_delim, 1);
-	}
+	if (opt->batch_mode == BATCH_MODE_CONTENTS)
+		print_object_or_die(opt, data, scratch);
+	else
+		batch_write(opt, scratch->buf, scratch->len);
 }
 
 static void batch_one_object(const char *obj_name,
@@ -666,7 +692,7 @@ static void parse_cmd_contents(struct batch_options *opt,
 			     struct expand_data *data)
 {
 	opt->batch_mode = BATCH_MODE_CONTENTS;
-	data->info.contentp = &data->content;
+	data->info.contentp = &data->iov[1].iov_base;
 	batch_one_object(line, output, opt, data);
 }
 
@@ -823,7 +849,7 @@ static int batch_objects(struct batch_options *opt)
 		data.info.typep = &data.type;
 		if (!opt->transform_mode) {
 			data.info.sizep = &data.size;
-			data.info.contentp = &data.content;
+			data.info.contentp = &data.iov[1].iov_base;
 			data.info.content_limit = big_file_threshold;
 			data.info.direct_cache = 1;
 		}
diff --git a/config.mak.uname b/config.mak.uname
index 85d63821ec95f6..8ce8776657b2cc 100644
--- a/config.mak.uname
+++ b/config.mak.uname
@@ -69,6 +69,7 @@ ifeq ($(uname_S),Linux)
 		BASIC_CFLAGS += -std=c99
         endif
 	LINK_FUZZ_PROGRAMS = YesPlease
+	HAVE_WRITEV = YesPlease
 endif
 ifeq ($(uname_S),GNU/kFreeBSD)
 	HAVE_ALLOCA_H = YesPlease
@@ -77,6 +78,7 @@ ifeq ($(uname_S),GNU/kFreeBSD)
 	DIR_HAS_BSD_GROUP_SEMANTICS = YesPlease
 	LIBC_CONTAINS_LIBINTL = YesPlease
 	FREAD_READS_DIRECTORIES = UnfortunatelyYes
+	HAVE_WRITEV = YesPlease
 endif
 ifeq ($(uname_S),UnixWare)
 	CC = cc
@@ -292,6 +294,7 @@ ifeq ($(uname_S),FreeBSD)
 	PAGER_ENV = LESS=FRX LV=-c MORE=FRX
 	FREAD_READS_DIRECTORIES = UnfortunatelyYes
 	FILENO_IS_A_MACRO = UnfortunatelyYes
+	HAVE_WRITEV = YesPlease
 endif
 ifeq ($(uname_S),OpenBSD)
 	NO_STRCASESTR = YesPlease
@@ -307,6 +310,7 @@ ifeq ($(uname_S),OpenBSD)
 	PROCFS_EXECUTABLE_PATH = /proc/curproc/file
 	FREAD_READS_DIRECTORIES = UnfortunatelyYes
 	FILENO_IS_A_MACRO = UnfortunatelyYes
+	HAVE_WRITEV = YesPlease
 endif
 ifeq ($(uname_S),MirBSD)
 	NO_STRCASESTR = YesPlease
@@ -329,6 +333,7 @@ ifeq ($(uname_S),NetBSD)
 	HAVE_BSD_KERN_PROC_SYSCTL = YesPlease
 	CSPRNG_METHOD = arc4random
 	PROCFS_EXECUTABLE_PATH = /proc/curproc/exe
+	HAVE_WRITEV = YesPlease
 endif
 ifeq ($(uname_S),AIX)
 	DEFAULT_PAGER = more
diff --git a/git-compat-util.h b/git-compat-util.h
index ca7678a379dcbc..afde8abc99100e 100644
--- a/git-compat-util.h
+++ b/git-compat-util.h
@@ -388,6 +388,16 @@ static inline int git_setitimer(int which UNUSED,
 #define setitimer(which,value,ovalue) git_setitimer(which,value,ovalue)
 #endif
 
+#ifdef HAVE_WRITEV
+#include <sys/uio.h>
+#define git_iovec iovec
+#else /* !HAVE_WRITEV */
+struct git_iovec {
+	void *iov_base;
+	size_t iov_len;
+};
+#endif /* !HAVE_WRITEV */
+
 #ifndef NO_LIBGEN_H
 #include <libgen.h>
 #else
diff --git a/wrapper.c b/wrapper.c
index f87d90bf5794a5..066c7721450694 100644
--- a/wrapper.c
+++ b/wrapper.c
@@ -262,6 +262,24 @@ ssize_t xwrite(int fd, const void *buf, size_t len)
 	}
 }
 
+#ifdef HAVE_WRITEV
+ssize_t xwritev(int fd, const struct iovec *iov, int iovcnt)
+{
+	while (1) {
+		ssize_t nr = writev(fd, iov, iovcnt);
+
+		if (nr < 0) {
+			if (errno == EINTR)
+				continue;
+			if (handle_nonblock(fd, POLLOUT, errno))
+				continue;
+		}
+
+		return nr;
+	}
+}
+#endif /* HAVE_WRITEV */
+
 /*
  * xpread() is the same as pread(), but it automatically restarts pread()
  * operations with a recoverable error (EAGAIN and EINTR). xpread() DOES
diff --git a/wrapper.h b/wrapper.h
index 1b2b047ea06927..3d33c63d4f9482 100644
--- a/wrapper.h
+++ b/wrapper.h
@@ -16,6 +16,7 @@ void *xmmap_gently(void *start, size_t length, int prot, int flags, int fd, off_
 int xopen(const char *path, int flags, ...);
 ssize_t xread(int fd, void *buf, size_t len);
 ssize_t xwrite(int fd, const void *buf, size_t len);
+ssize_t xwritev(int fd, const struct git_iovec *, int iovcnt);
 ssize_t xpread(int fd, void *buf, size_t len, off_t offset);
 int xdup(int fd);
 FILE *xfopen(const char *path, const char *mode);
diff --git a/write-or-die.c b/write-or-die.c
index 01a9a51fa2fcd7..227b05116581a7 100644
--- a/write-or-die.c
+++ b/write-or-die.c
@@ -107,3 +107,69 @@ void fflush_or_die(FILE *f)
 	if (fflush(f))
 		die_errno("fflush error");
 }
+
+void fwritev_or_die(FILE *fp, const struct git_iovec *iov, int iovcnt)
+{
+	int i;
+
+	for (i = 0; i < iovcnt; i++) {
+		size_t n = iov[i].iov_len;
+
+		if (fwrite(iov[i].iov_base, 1, n, fp) != n)
+			die_errno("unable to write to FD=%d", fileno(fp));
+	}
+}
+
+/*
+ * note: we don't care about atomicity from writev(2) right now.
+ * The goal is to avoid allocations+copies in the writer and
+ * reduce wakeups+syscalls in the reader.
+ * n.b. @iov is not const since we modify it to avoid allocating
+ * on partial write.
+ */
+#ifdef HAVE_WRITEV
+void writev_or_die(int fd, struct git_iovec *iov, int iovcnt)
+{
+	int i;
+
+	while (iovcnt > 0) {
+		ssize_t n = xwritev(fd, iov, iovcnt);
+
+		/* EINVAL happens when sum of iov_len exceeds SSIZE_MAX */
+		if (n < 0 && errno == EINVAL)
+			n = xwrite(fd, iov[0].iov_base, iov[0].iov_len);
+		if (n < 0) {
+			check_pipe(errno);
+			die_errno("writev error");
+		} else if (!n) {
+			errno = ENOSPC;
+			die_errno("writev_error");
+		}
+		/* skip fully written iovs, retry from the first partial iov */
+		for (i = 0; i < iovcnt; i++) {
+			if (n >= iov[i].iov_len) {
+				n -= iov[i].iov_len;
+			} else {
+				iov[i].iov_len -= n;
+				iov[i].iov_base = (char *)iov[i].iov_base + n;
+				break;
+			}
+		}
+		iovcnt -= i;
+		iov += i;
+	}
+}
+#else /* !HAVE_WRITEV */
+
+/*
+ * n.b. don't use stdio fwrite here even if it's faster, @fd may be
+ * non-blocking and stdio isn't equipped for EAGAIN
+ */
+void writev_or_die(int fd, struct git_iovec *iov, int iovcnt)
+{
+	int i;
+
+	for (i = 0; i < iovcnt; i++)
+		write_or_die(fd, iov[i].iov_base, iov[i].iov_len);
+}
+#endif /* !HAVE_WRITEV */
diff --git a/write-or-die.h b/write-or-die.h
index 65a5c42a47ac86..20abec211c72f8 100644
--- a/write-or-die.h
+++ b/write-or-die.h
@@ -7,6 +7,8 @@ void fprintf_or_die(FILE *, const char *fmt, ...);
 void fwrite_or_die(FILE *f, const void *buf, size_t count);
 void fflush_or_die(FILE *f);
 void write_or_die(int fd, const void *buf, size_t count);
+void writev_or_die(int fd, struct git_iovec *, int iovcnt);
+void fwritev_or_die(FILE *, const struct git_iovec *, int iovcnt);
 
 /*
  * These values are used to help identify parts of a repository to fsync.

From 31c5b4b16f885e2398bdc4abe1b271315e9c1f41 Mon Sep 17 00:00:00 2001
From: Christian Couder <christian.couder@gmail.com>
Date: Tue, 10 Sep 2024 18:29:57 +0200
Subject: [PATCH 15/69] version: refactor strbuf_sanitize()

The git_user_agent_sanitized() function performs some sanitizing to
avoid special characters being sent over the line and possibly messing
up with the protocol or with the parsing on the other side.

Let's extract this sanitizing into a new strbuf_sanitize() function, as
we will want to reuse it in a following patch, and let's put it into
strbuf.{c,h}.

While at it, let's also make a few small improvements:
  - use 'size_t' for 'i' instead of 'int',
  - move the declaration of 'i' inside the 'for ( ... )',
  - use strbuf_detach() to explicitly detach the string contained by
    the 'sb' strbuf.

Helped-by: Eric Sunshine <sunshine@sunshineco.com>
Signed-off-by: Christian Couder <chriscool@tuxfamily.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 strbuf.c  | 9 +++++++++
 strbuf.h  | 7 +++++++
 version.c | 9 ++-------
 3 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/strbuf.c b/strbuf.c
index 3d2189a7f648dc..cccfdec0e360af 100644
--- a/strbuf.c
+++ b/strbuf.c
@@ -1082,3 +1082,12 @@ void strbuf_strip_file_from_path(struct strbuf *sb)
 	char *path_sep = find_last_dir_sep(sb->buf);
 	strbuf_setlen(sb, path_sep ? path_sep - sb->buf + 1 : 0);
 }
+
+void strbuf_sanitize(struct strbuf *sb)
+{
+	strbuf_trim(sb);
+	for (size_t i = 0; i < sb->len; i++) {
+		if (sb->buf[i] <= 32 || sb->buf[i] >= 127)
+			sb->buf[i] = '.';
+	}
+}
diff --git a/strbuf.h b/strbuf.h
index 003f880ff7d61c..884157873ec9a7 100644
--- a/strbuf.h
+++ b/strbuf.h
@@ -664,6 +664,13 @@ typedef int (*char_predicate)(char ch);
 void strbuf_addstr_urlencode(struct strbuf *sb, const char *name,
 			     char_predicate allow_unencoded_fn);
 
+/*
+ * Trim, then replace each character whose ASCII code is 32 or below,
+ * or 127 or above, with a dot '.' character. Useful for sending
+ * capabilities.
+ */
+void strbuf_sanitize(struct strbuf *sb);
+
 __attribute__((format (printf,1,2)))
 int printf_ln(const char *fmt, ...);
 __attribute__((format (printf,2,3)))
diff --git a/version.c b/version.c
index 41b718c29e1b9f..951e6dca74eb4b 100644
--- a/version.c
+++ b/version.c
@@ -24,15 +24,10 @@ const char *git_user_agent_sanitized(void)
 
 	if (!agent) {
 		struct strbuf buf = STRBUF_INIT;
-		int i;
 
 		strbuf_addstr(&buf, git_user_agent());
-		strbuf_trim(&buf);
-		for (i = 0; i < buf.len; i++) {
-			if (buf.buf[i] <= 32 || buf.buf[i] >= 127)
-				buf.buf[i] = '.';
-		}
-		agent = buf.buf;
+		strbuf_sanitize(&buf);
+		agent = strbuf_detach(&buf, NULL);
 	}
 
 	return agent;

From 8afd0ee95a58e8997bcebbdd0e0a07956dc08a63 Mon Sep 17 00:00:00 2001
From: Christian Couder <christian.couder@gmail.com>
Date: Tue, 10 Sep 2024 18:29:58 +0200
Subject: [PATCH 16/69] strbuf: refactor strbuf_trim_trailing_ch()

We often have to split strings at some specified terminator character.
The strbuf_split*() functions, that we can use for this purpose,
return substrings that include the terminator character, so we often
need to remove that character.

When it is a whitespace, newline or directory separator, the
terminator character can easily be removed using an existing trimming
function like strbuf_rtrim(), strbuf_trim_trailing_newline() or
strbuf_trim_trailing_dir_sep(). There is no function to remove that
character when it's not one of those characters though.

Let's introduce a new strbuf_trim_trailing_ch() function that can be
used to remove any trailing character, and let's refactor existing code
that manually removed trailing characters using this new function.

We are also going to use this new function in a following commit.

Signed-off-by: Christian Couder <chriscool@tuxfamily.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 strbuf.c         |  7 +++++++
 strbuf.h         |  3 +++
 trace2/tr2_cfg.c | 10 ++--------
 3 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/strbuf.c b/strbuf.c
index cccfdec0e360af..c986ec28f4d2b5 100644
--- a/strbuf.c
+++ b/strbuf.c
@@ -134,6 +134,13 @@ void strbuf_trim_trailing_dir_sep(struct strbuf *sb)
 	sb->buf[sb->len] = '\0';
 }
 
+void strbuf_trim_trailing_ch(struct strbuf *sb, int c)
+{
+	while (sb->len > 0 && sb->buf[sb->len - 1] == c)
+		sb->len--;
+	sb->buf[sb->len] = '\0';
+}
+
 void strbuf_trim_trailing_newline(struct strbuf *sb)
 {
 	if (sb->len > 0 && sb->buf[sb->len - 1] == '\n') {
diff --git a/strbuf.h b/strbuf.h
index 884157873ec9a7..5e389ab06575a3 100644
--- a/strbuf.h
+++ b/strbuf.h
@@ -197,6 +197,9 @@ void strbuf_trim_trailing_dir_sep(struct strbuf *sb);
 /* Strip trailing LF or CR/LF */
 void strbuf_trim_trailing_newline(struct strbuf *sb);
 
+/* Strip trailing character c */
+void strbuf_trim_trailing_ch(struct strbuf *sb, int c);
+
 /**
  * Replace the contents of the strbuf with a reencoded form.  Returns -1
  * on error, 0 on success.
diff --git a/trace2/tr2_cfg.c b/trace2/tr2_cfg.c
index d96d908bb9df6f..356fcd38f48f7a 100644
--- a/trace2/tr2_cfg.c
+++ b/trace2/tr2_cfg.c
@@ -33,10 +33,7 @@ static int tr2_cfg_load_patterns(void)
 
 	tr2_cfg_patterns = strbuf_split_buf(envvar, strlen(envvar), ',', -1);
 	for (s = tr2_cfg_patterns; *s; s++) {
-		struct strbuf *buf = *s;
-
-		if (buf->len && buf->buf[buf->len - 1] == ',')
-			strbuf_setlen(buf, buf->len - 1);
+		strbuf_trim_trailing_ch(*s, ',');
 		strbuf_trim_trailing_newline(*s);
 		strbuf_trim(*s);
 	}
@@ -72,10 +69,7 @@ static int tr2_load_env_vars(void)
 
 	tr2_cfg_env_vars = strbuf_split_buf(varlist, strlen(varlist), ',', -1);
 	for (s = tr2_cfg_env_vars; *s; s++) {
-		struct strbuf *buf = *s;
-
-		if (buf->len && buf->buf[buf->len - 1] == ',')
-			strbuf_setlen(buf, buf->len - 1);
+		strbuf_trim_trailing_ch(*s, ',');
 		strbuf_trim_trailing_newline(*s);
 		strbuf_trim(*s);
 	}

From 3cb155a011f548736f859841631070431c2ad3b2 Mon Sep 17 00:00:00 2001
From: Christian Couder <christian.couder@gmail.com>
Date: Tue, 10 Sep 2024 18:29:59 +0200
Subject: [PATCH 17/69] Add 'promisor-remote' capability to protocol v2

When a server S knows that some objects from a repository are available
from a promisor remote X, S might want to suggest to a client C cloning
or fetching the repo from S that C should use X directly instead of S
for these objects.

Note that this could happen both in the case S itself doesn't have the
objects and borrows them from X, and in the case S has the objects but
knows that X is better connected to the world (e.g., it is in a
$LARGEINTERNETCOMPANY datacenter with petabit/s backbone connections)
than S. Implementation of the latter case, which would require S to
omit in its response the objects available on X, is left for future
improvement though.

Then C might or might not want to get the objects from X, and should
let S know about this.

To allow S and C to agree and let each other know about C using X or
not, let's introduce a new "promisor-remote" capability in the
protocol v2, as well as a few new configuration variables:

  - "promisor.advertise" on the server side, and:
  - "promisor.acceptFromServer" on the client side.

By default, or if "promisor.advertise" is set to 'false', a server S will
not advertise the "promisor-remote" capability.

If S doesn't advertise the "promisor-remote" capability, then a client C
replying to S shouldn't advertise the "promisor-remote" capability
either.

If "promisor.advertise" is set to 'true', S will advertise its promisor
remotes with a string like:

  promisor-remote=<pr-info>[;<pr-info>]...

where each <pr-info> element contains information about a single
promisor remote in the form:

  name=<pr-name>[,url=<pr-url>]

where <pr-name> is the urlencoded name of a promisor remote and
<pr-url> is the urlencoded URL of the promisor remote named <pr-name>.

For now, the URL is passed in addition to the name. In the future, it
might be possible to pass other information like a filter-spec that the
client should use when cloning from S, or a token that the client should
use when retrieving objects from X.

It might also be possible in the future for "promisor.advertise" to have
other values. For example a value like "onlyName" could prevent S from
advertising URLs, which could help in case C should use a different URL
for X than the URL S is using. (The URL S is using might be an internal
one on the server side for example.)

By default or if "promisor.acceptFromServer" is set to "None", C will
not accept to use the promisor remotes that might have been advertised
by S. In this case, C will not advertise any "promisor-remote"
capability in its reply to S.

If "promisor.acceptFromServer" is set to "All" and S advertised some
promisor remotes, then on the contrary, C will accept to use all the
promisor remotes that S advertised and C will reply with a string like:

  promisor-remote=<pr-name>[;<pr-name>]...

where the <pr-name> elements are the urlencoded names of all the
promisor remotes S advertised.

In a following commit, other values for "promisor.acceptFromServer" will
be implemented, so that C will be able to decide the promisor remotes it
accepts depending on the name and URL it received from S. So even if
that name and URL information is not used much right now, it will be
needed soon.

Helped-by: Taylor Blau <me@ttaylorr.com>
Helped-by: Patrick Steinhardt <ps@pks.im>
Signed-off-by: Christian Couder <chriscool@tuxfamily.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 Documentation/config/promisor.txt     |  17 +++
 Documentation/gitprotocol-v2.txt      |  54 +++++++
 connect.c                             |   9 ++
 promisor-remote.c                     | 198 ++++++++++++++++++++++++++
 promisor-remote.h                     |  36 ++++-
 serve.c                               |  26 ++++
 t/t5710-promisor-remote-capability.sh | 124 ++++++++++++++++
 upload-pack.c                         |   3 +
 8 files changed, 466 insertions(+), 1 deletion(-)
 create mode 100755 t/t5710-promisor-remote-capability.sh

diff --git a/Documentation/config/promisor.txt b/Documentation/config/promisor.txt
index 98c5cb2ec20d34..9cbfe3e59ea834 100644
--- a/Documentation/config/promisor.txt
+++ b/Documentation/config/promisor.txt
@@ -1,3 +1,20 @@
 promisor.quiet::
 	If set to "true" assume `--quiet` when fetching additional
 	objects for a partial clone.
+
+promisor.advertise::
+	If set to "true", a server will use the "promisor-remote"
+	capability, see linkgit:gitprotocol-v2[5], to advertise the
+	promisor remotes it is using, if it uses some. Default is
+	"false", which means the "promisor-remote" capability is not
+	advertised.
+
+promisor.acceptFromServer::
+	If set to "all", a client will accept all the promisor remotes
+	a server might advertise using the "promisor-remote"
+	capability. Default is "none", which means no promisor remote
+	advertised by a server will be accepted. By accepting a
+	promisor remote, the client agrees that the server might omit
+	objects that are lazily fetchable from this promisor remote
+	from its responses to "fetch" and "clone" requests from the
+	client. See linkgit:gitprotocol-v2[5].
diff --git a/Documentation/gitprotocol-v2.txt b/Documentation/gitprotocol-v2.txt
index 414bc625d5dd21..65d5256baf1a19 100644
--- a/Documentation/gitprotocol-v2.txt
+++ b/Documentation/gitprotocol-v2.txt
@@ -781,6 +781,60 @@ retrieving the header from a bundle at the indicated URI, and thus
 save themselves and the server(s) the request(s) needed to inspect the
 headers of that bundle or bundles.
 
+promisor-remote=<pr-infos>
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The server may advertise some promisor remotes it is using or knows
+about to a client which may want to use them as its promisor remotes,
+instead of this repository. In this case <pr-infos> should be of the
+form:
+
+	pr-infos = pr-info | pr-infos ";" pr-info
+
+	pr-info = "name=" pr-name | "name=" pr-name "," "url=" pr-url
+
+where `pr-name` is the urlencoded name of a promisor remote, and
+`pr-url` the urlencoded URL of that promisor remote.
+
+In this case, if the client decides to use one or more promisor
+remotes the server advertised, it can reply with
+"promisor-remote=<pr-names>" where <pr-names> should be of the form:
+
+	pr-names = pr-name | pr-names ";" pr-name
+
+where `pr-name` is the urlencoded name of a promisor remote the server
+advertised and the client accepts.
+
+Note that, everywhere in this document, `pr-name` MUST be a valid
+remote name, and the ';' and ',' characters MUST be encoded if they
+appear in `pr-name` or `pr-url`.
+
+If the server doesn't know any promisor remote that could be good for
+a client to use, or prefers a client not to use any promisor remote it
+uses or knows about, it shouldn't advertise the "promisor-remote"
+capability at all.
+
+In this case, or if the client doesn't want to use any promisor remote
+the server advertised, the client shouldn't advertise the
+"promisor-remote" capability at all in its reply.
+
+The "promisor.advertise" and "promisor.acceptFromServer" configuration
+options can be used on the server and client side respectively to
+control what they advertise or accept respectively. See the
+documentation of these configuration options for more information.
+
+Note that in the future it would be nice if the "promisor-remote"
+protocol capability could be used by the server, when responding to
+`git fetch` or `git clone`, to advertise better-connected remotes that
+the client can use as promisor remotes, instead of this repository, so
+that the client can lazily fetch objects from these other
+better-connected remotes. This would require the server to omit in its
+response the objects available on the better-connected remotes that
+the client has accepted. This hasn't been implemented yet though. So
+for now this "promisor-remote" capability is useful only when the
+server advertises some promisor remotes it already uses to borrow
+objects from.
+
 GIT
 ---
 Part of the linkgit:git[1] suite
diff --git a/connect.c b/connect.c
index cf84e631e9f266..1650bbd71d4c9a 100644
--- a/connect.c
+++ b/connect.c
@@ -20,6 +20,7 @@
 #include "protocol.h"
 #include "alias.h"
 #include "bundle-uri.h"
+#include "promisor-remote.h"
 
 static char *server_capabilities_v1;
 static struct strvec server_capabilities_v2 = STRVEC_INIT;
@@ -485,6 +486,7 @@ void check_stateless_delimiter(int stateless_rpc,
 static void send_capabilities(int fd_out, struct packet_reader *reader)
 {
 	const char *hash_name;
+	const char *promisor_remote_info;
 
 	if (server_supports_v2("agent"))
 		packet_write_fmt(fd_out, "agent=%s", git_user_agent_sanitized());
@@ -498,6 +500,13 @@ static void send_capabilities(int fd_out, struct packet_reader *reader)
 	} else {
 		reader->hash_algo = &hash_algos[GIT_HASH_SHA1];
 	}
+	if (server_feature_v2("promisor-remote", &promisor_remote_info)) {
+		char *reply = promisor_remote_reply(promisor_remote_info);
+		if (reply) {
+			packet_write_fmt(fd_out, "promisor-remote=%s", reply);
+			free(reply);
+		}
+	}
 }
 
 int get_remote_bundle_uri(int fd_out, struct packet_reader *reader,
diff --git a/promisor-remote.c b/promisor-remote.c
index 317e1b127fede4..baacbe9d949b08 100644
--- a/promisor-remote.c
+++ b/promisor-remote.c
@@ -11,6 +11,7 @@
 #include "strvec.h"
 #include "packfile.h"
 #include "environment.h"
+#include "url.h"
 
 struct promisor_remote_config {
 	struct promisor_remote *promisors;
@@ -219,6 +220,18 @@ int repo_has_promisor_remote(struct repository *r)
 	return !!repo_promisor_remote_find(r, NULL);
 }
 
+int repo_has_accepted_promisor_remote(struct repository *r)
+{
+	struct promisor_remote *p;
+
+	promisor_remote_init(r);
+
+	for (p = r->promisor_remote_config->promisors; p; p = p->next)
+		if (p->accepted)
+			return 1;
+	return 0;
+}
+
 static int remove_fetched_oids(struct repository *repo,
 			       struct object_id **oids,
 			       int oid_nr, int to_free)
@@ -290,3 +303,188 @@ void promisor_remote_get_direct(struct repository *repo,
 	if (to_free)
 		free(remaining_oids);
 }
+
+static int allow_unsanitized(char ch)
+{
+	if (ch == ',' || ch == ';' || ch == '%')
+		return 0;
+	return ch > 32 && ch < 127;
+}
+
+static void promisor_info_vecs(struct repository *repo,
+			       struct strvec *names,
+			       struct strvec *urls)
+{
+	struct promisor_remote *r;
+
+	promisor_remote_init(repo);
+
+	for (r = repo->promisor_remote_config->promisors; r; r = r->next) {
+		char *url;
+		char *url_key = xstrfmt("remote.%s.url", r->name);
+
+		strvec_push(names, r->name);
+		strvec_push(urls, git_config_get_string(url_key, &url) ? NULL : url);
+
+		free(url);
+		free(url_key);
+	}
+}
+
+char *promisor_remote_info(struct repository *repo)
+{
+	struct strbuf sb = STRBUF_INIT;
+	int advertise_promisors = 0;
+	struct strvec names = STRVEC_INIT;
+	struct strvec urls = STRVEC_INIT;
+
+	git_config_get_bool("promisor.advertise", &advertise_promisors);
+
+	if (!advertise_promisors)
+		return NULL;
+
+	promisor_info_vecs(repo, &names, &urls);
+
+	if (!names.nr)
+		return NULL;
+
+	for (size_t i = 0; i < names.nr; i++) {
+		if (i)
+			strbuf_addch(&sb, ';');
+		strbuf_addstr(&sb, "name=");
+		strbuf_addstr_urlencode(&sb, names.v[i], allow_unsanitized);
+		if (urls.v[i]) {
+			strbuf_addstr(&sb, ",url=");
+			strbuf_addstr_urlencode(&sb, urls.v[i], allow_unsanitized);
+		}
+	}
+
+	strbuf_sanitize(&sb);
+
+	strvec_clear(&names);
+	strvec_clear(&urls);
+
+	return strbuf_detach(&sb, NULL);
+}
+
+enum accept_promisor {
+	ACCEPT_NONE = 0,
+	ACCEPT_ALL
+};
+
+static int should_accept_remote(enum accept_promisor accept,
+				const char *remote_name UNUSED,
+				const char *remote_url UNUSED)
+{
+	if (accept == ACCEPT_ALL)
+		return 1;
+
+	BUG("Unhandled 'enum accept_promisor' value '%d'", accept);
+}
+
+static void filter_promisor_remote(struct repository *repo,
+				   struct strvec *accepted,
+				   const char *info)
+{
+	struct strbuf **remotes;
+	char *accept_str;
+	enum accept_promisor accept = ACCEPT_NONE;
+
+	if (!git_config_get_string("promisor.acceptfromserver", &accept_str)) {
+		if (!accept_str || !*accept_str || !strcasecmp("None", accept_str))
+			accept = ACCEPT_NONE;
+		else if (!strcasecmp("All", accept_str))
+			accept = ACCEPT_ALL;
+		else
+			warning(_("unknown '%s' value for '%s' config option"),
+				accept_str, "promisor.acceptfromserver");
+	}
+
+	if (accept == ACCEPT_NONE)
+		return;
+
+	/* Parse remote info received */
+
+	remotes = strbuf_split_str(info, ';', 0);
+
+	for (size_t i = 0; remotes[i]; i++) {
+		struct strbuf **elems;
+		const char *remote_name = NULL;
+		const char *remote_url = NULL;
+		char *decoded_name = NULL;
+		char *decoded_url = NULL;
+
+		strbuf_trim_trailing_ch(remotes[i], ';');
+		elems = strbuf_split_str(remotes[i]->buf, ',', 0);
+
+		for (size_t j = 0; elems[j]; j++) {
+			int res;
+			strbuf_trim_trailing_ch(elems[j], ',');
+			res = skip_prefix(elems[j]->buf, "name=", &remote_name) ||
+				skip_prefix(elems[j]->buf, "url=", &remote_url);
+			if (!res)
+				warning(_("unknown element '%s' from remote info"),
+					elems[j]->buf);
+		}
+
+		if (remote_name)
+			decoded_name = url_percent_decode(remote_name);
+		if (remote_url)
+			decoded_url = url_percent_decode(remote_url);
+
+		if (decoded_name && should_accept_remote(accept, decoded_name, decoded_url))
+			strvec_push(accepted, decoded_name);
+
+		strbuf_list_free(elems);
+		free(decoded_name);
+		free(decoded_url);
+	}
+
+	free(accept_str);
+	strbuf_list_free(remotes);
+}
+
+char *promisor_remote_reply(const char *info)
+{
+	struct strvec accepted = STRVEC_INIT;
+	struct strbuf reply = STRBUF_INIT;
+
+	filter_promisor_remote(the_repository, &accepted, info);
+
+	if (!accepted.nr)
+		return NULL;
+
+	for (size_t i = 0; i < accepted.nr; i++) {
+		if (i)
+			strbuf_addch(&reply, ';');
+		strbuf_addstr_urlencode(&reply, accepted.v[i], allow_unsanitized);
+	}
+
+	strvec_clear(&accepted);
+
+	return strbuf_detach(&reply, NULL);
+}
+
+void mark_promisor_remotes_as_accepted(struct repository *r, const char *remotes)
+{
+	struct strbuf **accepted_remotes = strbuf_split_str(remotes, ';', 0);
+
+	for (size_t i = 0; accepted_remotes[i]; i++) {
+		struct promisor_remote *p;
+		char *decoded_remote;
+
+		strbuf_trim_trailing_ch(accepted_remotes[i], ';');
+		decoded_remote = url_percent_decode(accepted_remotes[i]->buf);
+
+		p = repo_promisor_remote_find(r, decoded_remote);
+		if (p)
+			p->accepted = 1;
+		else
+			warning(_("accepted promisor remote '%s' not found"),
+				decoded_remote);
+
+		free(decoded_remote);
+	}
+
+	strbuf_list_free(accepted_remotes);
+}
diff --git a/promisor-remote.h b/promisor-remote.h
index 88cb599c391aea..814ca248c77a37 100644
--- a/promisor-remote.h
+++ b/promisor-remote.h
@@ -9,11 +9,13 @@ struct object_id;
  * Promisor remote linked list
  *
  * Information in its fields come from remote.XXX config entries or
- * from extensions.partialclone.
+ * from extensions.partialclone, except for 'accepted' which comes
+ * from protocol v2 capabilities exchange.
  */
 struct promisor_remote {
 	struct promisor_remote *next;
 	char *partial_clone_filter;
+	unsigned int accepted : 1;
 	const char name[FLEX_ARRAY];
 };
 
@@ -32,4 +34,36 @@ void promisor_remote_get_direct(struct repository *repo,
 				const struct object_id *oids,
 				int oid_nr);
 
+/*
+ * Prepare a "promisor-remote" advertisement by a server.
+ * Check the value of "promisor.advertise" and maybe the configured
+ * promisor remotes, if any, to prepare information to send in an
+ * advertisement.
+ * Return value is NULL if no promisor remote advertisement should be
+ * made. Otherwise it contains the names and urls of the advertised
+ * promisor remotes separated by ';'
+ */
+char *promisor_remote_info(struct repository *repo);
+
+/*
+ * Prepare a reply to a "promisor-remote" advertisement from a server.
+ * Check the value of "promisor.acceptfromserver" and maybe the
+ * configured promisor remotes, if any, to prepare the reply.
+ * Return value is NULL if no promisor remote from the server
+ * is accepted. Otherwise it contains the names of the accepted promisor
+ * remotes separated by ';'.
+ */
+char *promisor_remote_reply(const char *info);
+
+/*
+ * Set the 'accepted' flag for some promisor remotes. Useful when some
+ * promisor remotes have been accepted by the client.
+ */
+void mark_promisor_remotes_as_accepted(struct repository *repo, const char *remotes);
+
+/*
+ * Has any promisor remote been accepted by the client?
+ */
+int repo_has_accepted_promisor_remote(struct repository *r);
+
 #endif /* PROMISOR_REMOTE_H */
diff --git a/serve.c b/serve.c
index 884cd84ca8a0ea..a8935571d6d8fa 100644
--- a/serve.c
+++ b/serve.c
@@ -12,6 +12,7 @@
 #include "upload-pack.h"
 #include "bundle-uri.h"
 #include "trace2.h"
+#include "promisor-remote.h"
 
 static int advertise_sid = -1;
 static int advertise_object_info = -1;
@@ -31,6 +32,26 @@ static int agent_advertise(struct repository *r UNUSED,
 	return 1;
 }
 
+static int promisor_remote_advertise(struct repository *r,
+				     struct strbuf *value)
+{
+	if (value) {
+		char *info = promisor_remote_info(r);
+		if (!info)
+			return 0;
+		strbuf_addstr(value, info);
+		free(info);
+	}
+	return 1;
+}
+
+static void promisor_remote_receive(struct repository *r,
+				    const char *remotes)
+{
+	mark_promisor_remotes_as_accepted(r, remotes);
+}
+
+
 static int object_format_advertise(struct repository *r,
 				   struct strbuf *value)
 {
@@ -157,6 +178,11 @@ static struct protocol_capability capabilities[] = {
 		.advertise = bundle_uri_advertise,
 		.command = bundle_uri_command,
 	},
+	{
+		.name = "promisor-remote",
+		.advertise = promisor_remote_advertise,
+		.receive = promisor_remote_receive,
+	},
 };
 
 void protocol_v2_advertise_capabilities(void)
diff --git a/t/t5710-promisor-remote-capability.sh b/t/t5710-promisor-remote-capability.sh
new file mode 100755
index 00000000000000..7e44ad15ceea23
--- /dev/null
+++ b/t/t5710-promisor-remote-capability.sh
@@ -0,0 +1,124 @@
+#!/bin/sh
+
+test_description='handling of promisor remote advertisement'
+
+. ./test-lib.sh
+
+# Setup the repository with three commits, this way HEAD is always
+# available and we can hide commit 1 or 2.
+test_expect_success 'setup: create "template" repository' '
+	git init template &&
+	test_commit -C template 1 &&
+	test_commit -C template 2 &&
+	test_commit -C template 3 &&
+	test-tool genrandom foo 10240 >template/foo &&
+	git -C template add foo &&
+	git -C template commit -m foo
+'
+
+# A bare repo will act as a server repo with unpacked objects.
+test_expect_success 'setup: create bare "server" repository' '
+	git clone --bare --no-local template server &&
+	mv server/objects/pack/pack-* . &&
+	packfile=$(ls pack-*.pack) &&
+	git -C server unpack-objects --strict <"$packfile"
+'
+
+check_missing_objects () {
+	git -C "$1" rev-list --objects --all --missing=print > all.txt &&
+	perl -ne 'print if s/^[?]//' all.txt >missing.txt &&
+	test_line_count = "$2" missing.txt &&
+	test "$3" = "$(cat missing.txt)"
+}
+
+initialize_server () {
+	# Repack everything first
+	git -C server -c repack.writebitmaps=false repack -a -d &&
+
+	# Remove promisor file in case they exist, useful when reinitializing
+	rm -rf server/objects/pack/*.promisor &&
+
+	# Repack without the largest object and create a promisor pack on server
+	git -C server -c repack.writebitmaps=false repack -a -d \
+	    --filter=blob:limit=5k --filter-to="$(pwd)" &&
+	promisor_file=$(ls server/objects/pack/*.pack | sed "s/\.pack/.promisor/") &&
+	touch "$promisor_file" &&
+
+	# Check that only one object is missing on the server
+	check_missing_objects server 1 "$oid"
+}
+
+test_expect_success "setup for testing promisor remote advertisement" '
+	# Create another bare repo called "server2"
+	git init --bare server2 &&
+
+	# Copy the largest object from server to server2
+	obj="HEAD:foo" &&
+	oid="$(git -C server rev-parse $obj)" &&
+	oid_path="$(test_oid_to_path $oid)" &&
+	path="server/objects/$oid_path" &&
+	path2="server2/objects/$oid_path" &&
+	mkdir -p $(dirname "$path2") &&
+	cp "$path" "$path2" &&
+
+	initialize_server &&
+
+	# Configure server2 as promisor remote for server
+	git -C server remote add server2 "file://$(pwd)/server2" &&
+	git -C server config remote.server2.promisor true &&
+
+	git -C server2 config uploadpack.allowFilter true &&
+	git -C server2 config uploadpack.allowAnySHA1InWant true &&
+	git -C server config uploadpack.allowFilter true &&
+	git -C server config uploadpack.allowAnySHA1InWant true
+'
+
+test_expect_success "fetch with promisor.advertise set to 'true'" '
+	git -C server config promisor.advertise true &&
+
+	# Clone from server to create a client
+	GIT_NO_LAZY_FETCH=0 git clone -c remote.server2.promisor=true \
+		-c remote.server2.fetch="+refs/heads/*:refs/remotes/server2/*" \
+		-c remote.server2.url="file://$(pwd)/server2" \
+		-c promisor.acceptfromserver=All \
+		--no-local --filter="blob:limit=5k" server client &&
+	test_when_finished "rm -rf client" &&
+
+	# Check that the largest object is still missing on the server
+	check_missing_objects server 1 "$oid"
+'
+
+test_expect_success "fetch with promisor.advertise set to 'false'" '
+	git -C server config promisor.advertise false &&
+
+	# Clone from server to create a client
+	GIT_NO_LAZY_FETCH=0 git clone -c remote.server2.promisor=true \
+		-c remote.server2.fetch="+refs/heads/*:refs/remotes/server2/*" \
+		-c remote.server2.url="file://$(pwd)/server2" \
+		-c promisor.acceptfromserver=All \
+		--no-local --filter="blob:limit=5k" server client &&
+	test_when_finished "rm -rf client" &&
+
+	# Check that the largest object is not missing on the server
+	check_missing_objects server 0 "" &&
+
+	# Reinitialize server so that the largest object is missing again
+	initialize_server
+'
+
+test_expect_success "fetch with promisor.acceptfromserver set to 'None'" '
+	git -C server config promisor.advertise true &&
+
+	# Clone from server to create a client
+	GIT_NO_LAZY_FETCH=0 git clone -c remote.server2.promisor=true \
+		-c remote.server2.fetch="+refs/heads/*:refs/remotes/server2/*" \
+		-c remote.server2.url="file://$(pwd)/server2" \
+		-c promisor.acceptfromserver=None \
+		--no-local --filter="blob:limit=5k" server client &&
+	test_when_finished "rm -rf client" &&
+
+	# Check that the largest object is not missing on the server
+	check_missing_objects server 0 ""
+'
+
+test_done
diff --git a/upload-pack.c b/upload-pack.c
index 0052c6a4dce1fa..0cff76c8453549 100644
--- a/upload-pack.c
+++ b/upload-pack.c
@@ -31,6 +31,7 @@
 #include "write-or-die.h"
 #include "json-writer.h"
 #include "strmap.h"
+#include "promisor-remote.h"
 
 /* Remember to update object flag allocation in object.h */
 #define THEY_HAVE	(1u << 11)
@@ -317,6 +318,8 @@ static void create_pack_file(struct upload_pack_data *pack_data,
 		strvec_push(&pack_objects.args, "--delta-base-offset");
 	if (pack_data->use_include_tag)
 		strvec_push(&pack_objects.args, "--include-tag");
+	if (repo_has_accepted_promisor_remote(the_repository))
+		strvec_push(&pack_objects.args, "--missing=allow-promisor");
 	if (pack_data->filter_options.choice) {
 		const char *spec =
 			expand_list_objects_filter_spec(&pack_data->filter_options);

From bc0c4e163775992394d5a3b83be20f990e2cdd4a Mon Sep 17 00:00:00 2001
From: Christian Couder <christian.couder@gmail.com>
Date: Tue, 10 Sep 2024 18:30:00 +0200
Subject: [PATCH 18/69] promisor-remote: check advertised name or URL

A previous commit introduced a "promisor.acceptFromServer" configuration
variable with only "None" or "All" as valid values.

Let's introduce "KnownName" and "KnownUrl" as valid values for this
configuration option to give more choice to a client about which
promisor remotes it might accept among those that the server advertised.

In case of "KnownName", the client will accept promisor remotes which
are already configured on the client and have the same name as those
advertised by the server. This could be useful in a corporate setup
where servers and clients are trusted to not switch names and URLs, but
where some kind of control is still useful.

In case of "KnownUrl", the client will accept promisor remotes which
have both the same name and the same URL configured on the client as the
name and URL advertised by the server. This is the most secure option,
so it should be used if possible.

Signed-off-by: Christian Couder <chriscool@tuxfamily.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 Documentation/config/promisor.txt     | 22 ++++++---
 promisor-remote.c                     | 54 +++++++++++++++++++--
 t/t5710-promisor-remote-capability.sh | 68 +++++++++++++++++++++++++++
 3 files changed, 134 insertions(+), 10 deletions(-)

diff --git a/Documentation/config/promisor.txt b/Documentation/config/promisor.txt
index 9cbfe3e59ea834..d1364bc018ba61 100644
--- a/Documentation/config/promisor.txt
+++ b/Documentation/config/promisor.txt
@@ -12,9 +12,19 @@ promisor.advertise::
 promisor.acceptFromServer::
 	If set to "all", a client will accept all the promisor remotes
 	a server might advertise using the "promisor-remote"
-	capability. Default is "none", which means no promisor remote
-	advertised by a server will be accepted. By accepting a
-	promisor remote, the client agrees that the server might omit
-	objects that are lazily fetchable from this promisor remote
-	from its responses to "fetch" and "clone" requests from the
-	client. See linkgit:gitprotocol-v2[5].
+	capability. If set to "knownName" the client will accept
+	promisor remotes which are already configured on the client
+	and have the same name as those advertised by the server. This
+	is not very secure, but could be used in a corporate setup
+	where servers and clients are trusted to not switch name and
+	URLs. If set to "knownUrl", the client will accept promisor
+	remotes which have both the same name and the same URL
+	configured on the client as the name and URL advertised by the
+	server. This is more secure than "all" or "knownName", so it
+	should be used if possible instead of those options. Default
+	is "none", which means no promisor remote advertised by a
+	server will be accepted. By accepting a promisor remote, the
+	client agrees that the server might omit objects that are
+	lazily fetchable from this promisor remote from its responses
+	to "fetch" and "clone" requests from the client. See
+	linkgit:gitprotocol-v2[5].
diff --git a/promisor-remote.c b/promisor-remote.c
index baacbe9d949b08..f713595eb02d87 100644
--- a/promisor-remote.c
+++ b/promisor-remote.c
@@ -367,19 +367,54 @@ char *promisor_remote_info(struct repository *repo)
 	return strbuf_detach(&sb, NULL);
 }
 
+/*
+ * Find first index of 'vec' where there is 'val'. 'val' is compared
+ * case insensitively to the strings in 'vec'. If not found 'vec->nr' is
+ * returned.
+ */
+static size_t strvec_find_index(struct strvec *vec, const char *val)
+{
+	for (size_t i = 0; i < vec->nr; i++)
+		if (!strcasecmp(vec->v[i], val))
+			return i;
+	return vec->nr;
+}
+
 enum accept_promisor {
 	ACCEPT_NONE = 0,
+	ACCEPT_KNOWN_URL,
+	ACCEPT_KNOWN_NAME,
 	ACCEPT_ALL
 };
 
 static int should_accept_remote(enum accept_promisor accept,
-				const char *remote_name UNUSED,
-				const char *remote_url UNUSED)
+				const char *remote_name, const char *remote_url,
+				struct strvec *names, struct strvec *urls)
 {
+	size_t i;
+
 	if (accept == ACCEPT_ALL)
 		return 1;
 
-	BUG("Unhandled 'enum accept_promisor' value '%d'", accept);
+	i = strvec_find_index(names, remote_name);
+
+	if (i >= names->nr)
+		/* We don't know about that remote */
+		return 0;
+
+	if (accept == ACCEPT_KNOWN_NAME)
+		return 1;
+
+	if (accept != ACCEPT_KNOWN_URL)
+		BUG("Unhandled 'enum accept_promisor' value '%d'", accept);
+
+	if (!strcasecmp(urls->v[i], remote_url))
+		return 1;
+
+	warning(_("known remote named '%s' but with url '%s' instead of '%s'"),
+		remote_name, urls->v[i], remote_url);
+
+	return 0;
 }
 
 static void filter_promisor_remote(struct repository *repo,
@@ -389,10 +424,16 @@ static void filter_promisor_remote(struct repository *repo,
 	struct strbuf **remotes;
 	char *accept_str;
 	enum accept_promisor accept = ACCEPT_NONE;
+	struct strvec names = STRVEC_INIT;
+	struct strvec urls = STRVEC_INIT;
 
 	if (!git_config_get_string("promisor.acceptfromserver", &accept_str)) {
 		if (!accept_str || !*accept_str || !strcasecmp("None", accept_str))
 			accept = ACCEPT_NONE;
+		else if (!strcasecmp("KnownUrl", accept_str))
+			accept = ACCEPT_KNOWN_URL;
+		else if (!strcasecmp("KnownName", accept_str))
+			accept = ACCEPT_KNOWN_NAME;
 		else if (!strcasecmp("All", accept_str))
 			accept = ACCEPT_ALL;
 		else
@@ -403,6 +444,9 @@ static void filter_promisor_remote(struct repository *repo,
 	if (accept == ACCEPT_NONE)
 		return;
 
+	if (accept != ACCEPT_ALL)
+		promisor_info_vecs(repo, &names, &urls);
+
 	/* Parse remote info received */
 
 	remotes = strbuf_split_str(info, ';', 0);
@@ -432,7 +476,7 @@ static void filter_promisor_remote(struct repository *repo,
 		if (remote_url)
 			decoded_url = url_percent_decode(remote_url);
 
-		if (decoded_name && should_accept_remote(accept, decoded_name, decoded_url))
+		if (decoded_name && should_accept_remote(accept, decoded_name, decoded_url, &names, &urls))
 			strvec_push(accepted, decoded_name);
 
 		strbuf_list_free(elems);
@@ -441,6 +485,8 @@ static void filter_promisor_remote(struct repository *repo,
 	}
 
 	free(accept_str);
+	strvec_clear(&names);
+	strvec_clear(&urls);
 	strbuf_list_free(remotes);
 }
 
diff --git a/t/t5710-promisor-remote-capability.sh b/t/t5710-promisor-remote-capability.sh
index 7e44ad15ceea23..c2c83a5914c09e 100755
--- a/t/t5710-promisor-remote-capability.sh
+++ b/t/t5710-promisor-remote-capability.sh
@@ -117,6 +117,74 @@ test_expect_success "fetch with promisor.acceptfromserver set to 'None'" '
 		--no-local --filter="blob:limit=5k" server client &&
 	test_when_finished "rm -rf client" &&
 
+	# Check that the largest object is not missing on the server
+	check_missing_objects server 0 "" &&
+
+	# Reinitialize server so that the largest object is missing again
+	initialize_server
+'
+
+test_expect_success "fetch with promisor.acceptfromserver set to 'KnownName'" '
+	git -C server config promisor.advertise true &&
+
+	# Clone from server to create a client
+	GIT_NO_LAZY_FETCH=0 git clone -c remote.server2.promisor=true \
+		-c remote.server2.fetch="+refs/heads/*:refs/remotes/server2/*" \
+		-c remote.server2.url="file://$(pwd)/server2" \
+		-c promisor.acceptfromserver=KnownName \
+		--no-local --filter="blob:limit=5k" server client &&
+	test_when_finished "rm -rf client" &&
+
+	# Check that the largest object is still missing on the server
+	check_missing_objects server 1 "$oid"
+'
+
+test_expect_success "fetch with 'KnownName' and different remote names" '
+	git -C server config promisor.advertise true &&
+
+	# Clone from server to create a client
+	GIT_NO_LAZY_FETCH=0 git clone -c remote.serverTwo.promisor=true \
+		-c remote.serverTwo.fetch="+refs/heads/*:refs/remotes/server2/*" \
+		-c remote.serverTwo.url="file://$(pwd)/server2" \
+		-c promisor.acceptfromserver=KnownName \
+		--no-local --filter="blob:limit=5k" server client &&
+	test_when_finished "rm -rf client" &&
+
+	# Check that the largest object is not missing on the server
+	check_missing_objects server 0 "" &&
+
+	# Reinitialize server so that the largest object is missing again
+	initialize_server
+'
+
+test_expect_success "fetch with promisor.acceptfromserver set to 'KnownUrl'" '
+	git -C server config promisor.advertise true &&
+
+	# Clone from server to create a client
+	GIT_NO_LAZY_FETCH=0 git clone -c remote.server2.promisor=true \
+		-c remote.server2.fetch="+refs/heads/*:refs/remotes/server2/*" \
+		-c remote.server2.url="file://$(pwd)/server2" \
+		-c promisor.acceptfromserver=KnownUrl \
+		--no-local --filter="blob:limit=5k" server client &&
+	test_when_finished "rm -rf client" &&
+
+	# Check that the largest object is still missing on the server
+	check_missing_objects server 1 "$oid"
+'
+
+test_expect_success "fetch with 'KnownUrl' and different remote urls" '
+	ln -s server2 serverTwo &&
+
+	git -C server config promisor.advertise true &&
+
+	# Clone from server to create a client
+	GIT_NO_LAZY_FETCH=0 git clone -c remote.server2.promisor=true \
+		-c remote.server2.fetch="+refs/heads/*:refs/remotes/server2/*" \
+		-c remote.server2.url="file://$(pwd)/serverTwo" \
+		-c promisor.acceptfromserver=KnownUrl \
+		--no-local --filter="blob:limit=5k" server client &&
+	test_when_finished "rm -rf client" &&
+
 	# Check that the largest object is not missing on the server
 	check_missing_objects server 0 ""
 '

From 82d283c626cac034858215eebdff8a69972d9c4e Mon Sep 17 00:00:00 2001
From: Junio C Hamano <gitster@pobox.com>
Date: Mon, 14 Oct 2024 13:44:25 -0700
Subject: [PATCH 19/69] t7500: make each piece more independent

These tests prepare the working tree & index state to have something
to be committed, and try a sequence of "test_must_fail git commit".
If an earlier one did not fail due to a bug, a later one will fail for
a wrong reason (namely, "nothing to commit").

Give them "--allow-empty" to make sure that they would work even
when there is nothing to commit by accident.

Signed-off-by: Junio C Hamano <gitster@pobox.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 t/t7500-commit-template-squash-signoff.sh | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/t/t7500-commit-template-squash-signoff.sh b/t/t7500-commit-template-squash-signoff.sh
index 4dca8d97a772d6..4927b7260d6e49 100755
--- a/t/t7500-commit-template-squash-signoff.sh
+++ b/t/t7500-commit-template-squash-signoff.sh
@@ -50,33 +50,33 @@ test_expect_success 'nonexistent template file in config should return error' '
 TEMPLATE="$PWD"/template
 
 test_expect_success 'unedited template should not commit' '
-	echo "template line" > "$TEMPLATE" &&
-	test_must_fail git commit --template "$TEMPLATE"
+	echo "template line" >"$TEMPLATE" &&
+	test_must_fail git commit --allow-empty --template "$TEMPLATE"
 '
 
 test_expect_success 'unedited template with comments should not commit' '
-	echo "# comment in template" >> "$TEMPLATE" &&
-	test_must_fail git commit --template "$TEMPLATE"
+	echo "# comment in template" >>"$TEMPLATE" &&
+	test_must_fail git commit --allow-empty --template "$TEMPLATE"
 '
 
 test_expect_success 'a Signed-off-by line by itself should not commit' '
 	(
 		test_set_editor "$TEST_DIRECTORY"/t7500/add-signed-off &&
-		test_must_fail git commit --template "$TEMPLATE"
+		test_must_fail git commit --allow-empty --template "$TEMPLATE"
 	)
 '
 
 test_expect_success 'adding comments to a template should not commit' '
 	(
 		test_set_editor "$TEST_DIRECTORY"/t7500/add-comments &&
-		test_must_fail git commit --template "$TEMPLATE"
+		test_must_fail git commit --allow-empty --template "$TEMPLATE"
 	)
 '
 
 test_expect_success 'adding real content to a template should commit' '
 	(
 		test_set_editor "$TEST_DIRECTORY"/t7500/add-content &&
-		git commit --template "$TEMPLATE"
+		git commit --allow-empty --template "$TEMPLATE"
 	) &&
 	commit_msg_is "template linecommit message"
 '

From dbafaff13b9296ff97d572a97ce41649999c2d4f Mon Sep 17 00:00:00 2001
From: Junio C Hamano <gitster@pobox.com>
Date: Mon, 14 Oct 2024 13:44:26 -0700
Subject: [PATCH 20/69] config: values of pathname type can be prefixed with
 :(optional)

Sometimes people want to specify additional configuration data
on a "best effort" basis.  Maybe commit.template configuration file points
at somewhere in ~/template/ but on a particular system, the file may not
exist and the user may be OK without using the template in such a case.

When the value given to a configuration variable whose type is
pathname wants to signal such an optional file, it can be marked by
prepending ":(optional)" in front of it.  Such a setting that is
marked optional would avoid getting the command barf for a missing
file, as an optional configuration setting that names a missing or
an empty file is not even seen.

cf. <xmqq5ywehb69.fsf@gitster.g>

Signed-off-by: Junio C Hamano <gitster@pobox.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 Documentation/config.txt                  |  5 ++++-
 config.c                                  | 16 ++++++++++++++--
 t/t7500-commit-template-squash-signoff.sh |  9 +++++++++
 3 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/Documentation/config.txt b/Documentation/config.txt
index 8c0b3ed8075214..199e29ccea675b 100644
--- a/Documentation/config.txt
+++ b/Documentation/config.txt
@@ -358,7 +358,10 @@ compiled without runtime prefix support, the compiled-in prefix will be
 substituted instead. In the unlikely event that a literal path needs to
 be specified that should _not_ be expanded, it needs to be prefixed by
 `./`, like so: `./%(prefix)/bin`.
-
++
+If prefixed with `:(optional)`, the configuration variable is treated
+as if it does not exist, if the named path does not exist or names an
+empty file.
 
 Variables
 ~~~~~~~~~
diff --git a/config.c b/config.c
index a11bb85da303a7..4a060f1d8228a1 100644
--- a/config.c
+++ b/config.c
@@ -1364,11 +1364,23 @@ int git_config_string(char **dest, const char *var, const char *value)
 
 int git_config_pathname(char **dest, const char *var, const char *value)
 {
+	int is_optional;
+	char *path;
+
 	if (!value)
 		return config_error_nonbool(var);
-	*dest = interpolate_path(value, 0);
-	if (!*dest)
+
+	is_optional = skip_prefix(value, ":(optional)", &value);
+	path = interpolate_path(value, 0);
+	if (!path)
 		die(_("failed to expand user dir in: '%s'"), value);
+
+	if (is_optional && is_empty_or_missing_file(path)) {
+		free(path);
+		return 0;
+	}
+
+	*dest = path;
 	return 0;
 }
 
diff --git a/t/t7500-commit-template-squash-signoff.sh b/t/t7500-commit-template-squash-signoff.sh
index 4927b7260d6e49..e28a79987db981 100755
--- a/t/t7500-commit-template-squash-signoff.sh
+++ b/t/t7500-commit-template-squash-signoff.sh
@@ -46,6 +46,15 @@ test_expect_success 'nonexistent template file in config should return error' '
 	)
 '
 
+test_expect_success 'nonexistent optional template file in config' '
+	test_config commit.template ":(optional)$PWD"/notexist &&
+	(
+		GIT_EDITOR="echo hello >\"\$1\"" &&
+		export GIT_EDITOR &&
+		git commit --allow-empty
+	)
+'
+
 # From now on we'll use a template file that exists.
 TEMPLATE="$PWD"/template
 

From 2da08f2c3db128c5c07969867615701d2f1a8792 Mon Sep 17 00:00:00 2001
From: Junio C Hamano <gitster@pobox.com>
Date: Mon, 14 Oct 2024 13:44:27 -0700
Subject: [PATCH 21/69] parseopt: values of pathname type can be prefixed with
 :(optional)

In the previous step, we introduced an optional filename that can be
given to a configuration variable, and made such a configuration
setting behave as if it did not exist when the named path is missing
or empty.

Let's do the same for command line options that name a pathname.

Signed-off-by: Junio C Hamano <gitster@pobox.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 parse-options.c                           | 31 +++++++++++++++--------
 t/t7500-commit-template-squash-signoff.sh | 12 ++++++++-
 2 files changed, 31 insertions(+), 12 deletions(-)

diff --git a/parse-options.c b/parse-options.c
index 33bfba0ed4a0ea..7a2a3b1f085a30 100644
--- a/parse-options.c
+++ b/parse-options.c
@@ -75,7 +75,6 @@ static enum parse_opt_result do_get_value(struct parse_opt_ctx_t *p,
 {
 	const char *s, *arg;
 	const int unset = flags & OPT_UNSET;
-	int err;
 
 	if (unset && p->opt)
 		return error(_("%s takes no value"), optname(opt, flags));
@@ -131,21 +130,31 @@ static enum parse_opt_result do_get_value(struct parse_opt_ctx_t *p,
 	case OPTION_FILENAME:
 	{
 		const char *value;
-
-		FREE_AND_NULL(*(char **)opt->value);
-
-		err = 0;
+		int is_optional;
 
 		if (unset)
 			value = NULL;
 		else if (opt->flags & PARSE_OPT_OPTARG && !p->opt)
-			value = (const char *) opt->defval;
-		else
-			err = get_arg(p, opt, flags, &value);
+			value = (char *)opt->defval;
+		else {
+			int err = get_arg(p, opt, flags, &value);
+			if (err)
+				return err;
+		}
+		if (!value)
+			return 0;
 
-		if (!err)
-			*(char **)opt->value = fix_filename(p->prefix, value);
-		return err;
+		is_optional = skip_prefix(value, ":(optional)", &value);
+		if (!value)
+			is_optional = 0;
+		value = fix_filename(p->prefix, value);
+		if (is_optional && is_empty_or_missing_file(value)) {
+			free((char *)value);
+		} else {
+			FREE_AND_NULL(*(char **)opt->value);
+			*(const char **)opt->value = value;
+		}
+		return 0;
 	}
 	case OPTION_CALLBACK:
 	{
diff --git a/t/t7500-commit-template-squash-signoff.sh b/t/t7500-commit-template-squash-signoff.sh
index e28a79987db981..c065f12baf1baa 100755
--- a/t/t7500-commit-template-squash-signoff.sh
+++ b/t/t7500-commit-template-squash-signoff.sh
@@ -37,12 +37,22 @@ test_expect_success 'nonexistent template file should return error' '
 	)
 '
 
+test_expect_success 'nonexistent optional template file on command line' '
+	echo changes >> foo &&
+	git add foo &&
+	(
+		GIT_EDITOR="echo hello >\"\$1\"" &&
+		export GIT_EDITOR &&
+		git commit --template ":(optional)$PWD/notexist"
+	)
+'
+
 test_expect_success 'nonexistent template file in config should return error' '
 	test_config commit.template "$PWD"/notexist &&
 	(
 		GIT_EDITOR="echo hello >\"\$1\"" &&
 		export GIT_EDITOR &&
-		test_must_fail git commit
+		test_must_fail git commit --allow-empty
 	)
 '
 

From bed4d8035a7240c98913ad280f6829790695f145 Mon Sep 17 00:00:00 2001
From: Patrick Steinhardt <ps@pks.im>
Date: Mon, 21 Oct 2024 11:28:00 +0200
Subject: [PATCH 22/69] builtin/ls-remote: plug leaking server options

The list of server options populated via `OPT_STRING_LIST()` is never
cleared, causing a memory leak. Plug it.

This leak is exposed by t5702, but plugging it alone does not make the
whole test suite pass.

Signed-off-by: Patrick Steinhardt <ps@pks.im>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 builtin/ls-remote.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/builtin/ls-remote.c b/builtin/ls-remote.c
index 423318f87ec5f9..42f34e123610c1 100644
--- a/builtin/ls-remote.c
+++ b/builtin/ls-remote.c
@@ -166,6 +166,7 @@ int cmd_ls_remote(int argc,
 		status = 0; /* we found something */
 	}
 
+	string_list_clear(&server_options, 0);
 	ref_sorting_release(sorting);
 	ref_array_clear(&ref_array);
 	if (transport_disconnect(transport))

From bfac141bf0679b963bed40328ea78db1b248863a Mon Sep 17 00:00:00 2001
From: Patrick Steinhardt <ps@pks.im>
Date: Mon, 21 Oct 2024 11:28:03 +0200
Subject: [PATCH 23/69] t/helper: fix leaks in "reach" test tool

The "reach" test tool doesn't bother to clean up any of its allocated
resources, causing various leaks. Plug them.

Signed-off-by: Patrick Steinhardt <ps@pks.im>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 t/helper/test-reach.c | 10 ++++++++++
 t/t6600-test-reach.sh |  1 +
 2 files changed, 11 insertions(+)

diff --git a/t/helper/test-reach.c b/t/helper/test-reach.c
index 995e382863ac96..84deee604adb4b 100644
--- a/t/helper/test-reach.c
+++ b/t/helper/test-reach.c
@@ -127,10 +127,12 @@ int cmd__reach(int ac, const char **av)
 			exit(128);
 		printf("%s(A,X):\n", av[1]);
 		print_sorted_commit_ids(list);
+		free_commit_list(list);
 	} else if (!strcmp(av[1], "reduce_heads")) {
 		struct commit_list *list = reduce_heads(X);
 		printf("%s(X):\n", av[1]);
 		print_sorted_commit_ids(list);
+		free_commit_list(list);
 	} else if (!strcmp(av[1], "can_all_from_reach")) {
 		printf("%s(X,Y):%d\n", av[1], can_all_from_reach(X, Y, 1));
 	} else if (!strcmp(av[1], "can_all_from_reach_with_flag")) {
@@ -153,6 +155,7 @@ int cmd__reach(int ac, const char **av)
 			filter.with_commit_tag_algo = 0;
 
 		printf("%s(_,A,X,_):%d\n", av[1], commit_contains(&filter, A, X, &cache));
+		clear_contains_cache(&cache);
 	} else if (!strcmp(av[1], "get_reachable_subset")) {
 		const int reachable_flag = 1;
 		int i, count = 0;
@@ -176,7 +179,14 @@ int cmd__reach(int ac, const char **av)
 			die(_("too many commits marked reachable"));
 
 		print_sorted_commit_ids(list);
+		free_commit_list(list);
 	}
 
+	object_array_clear(&X_obj);
+	strbuf_release(&buf);
+	free_commit_list(X);
+	free_commit_list(Y);
+	free(X_array);
+	free(Y_array);
 	return 0;
 }
diff --git a/t/t6600-test-reach.sh b/t/t6600-test-reach.sh
index 2591f8b8b39bf4..307deefed2c1a7 100755
--- a/t/t6600-test-reach.sh
+++ b/t/t6600-test-reach.sh
@@ -2,6 +2,7 @@
 
 test_description='basic commit reachability tests'
 
+TEST_PASSES_SANITIZE_LEAK=true
 . ./test-lib.sh
 
 # Construct a grid-like commit graph with points (x,y)

From f4bb6329c519604d0859bcb7c08547ef2d241927 Mon Sep 17 00:00:00 2001
From: Patrick Steinhardt <ps@pks.im>
Date: Mon, 21 Oct 2024 11:28:09 +0200
Subject: [PATCH 24/69] grep: fix leak in `grep_splice_or()`

In `grep_splice_or()` we search for the next `TRUE` node in our tree of
grep expressions and replace it with the given new expression. But we
don't free the old node, which causes a memory leak. Plug it.

This leak is exposed by t7810, but plugging it alone isn't sufficient to
make the test suite pass.

Signed-off-by: Patrick Steinhardt <ps@pks.im>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 grep.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/grep.c b/grep.c
index 701e58de04ef61..e9337f32cbf6e7 100644
--- a/grep.c
+++ b/grep.c
@@ -756,6 +756,7 @@ static struct grep_expr *grep_splice_or(struct grep_expr *x, struct grep_expr *y
 		assert(x->node == GREP_NODE_OR);
 		if (x->u.binary.right &&
 		    x->u.binary.right->node == GREP_NODE_TRUE) {
+			free(x->u.binary.right);
 			x->u.binary.right = y;
 			break;
 		}

From 4042f03cbc5513696bfd409ca4787dd17517d801 Mon Sep 17 00:00:00 2001
From: Patrick Steinhardt <ps@pks.im>
Date: Mon, 21 Oct 2024 11:28:12 +0200
Subject: [PATCH 25/69] builtin/grep: fix leak with `--max-count=0`

When executing with `--max-count=0` we'll return early from git-grep(1)
without performing any cleanup, which causes memory leaks. Plug these.

Signed-off-by: Patrick Steinhardt <ps@pks.im>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 builtin/grep.c  | 13 ++++++++++---
 t/t7810-grep.sh |  1 +
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/builtin/grep.c b/builtin/grep.c
index f17d46a06e46ba..98b85c7fcaccc4 100644
--- a/builtin/grep.c
+++ b/builtin/grep.c
@@ -906,6 +906,7 @@ int cmd_grep(int argc,
 	int dummy;
 	int use_index = 1;
 	int allow_revs;
+	int ret;
 
 	struct option options[] = {
 		OPT_BOOL(0, "cached", &cached,
@@ -1172,8 +1173,10 @@ int cmd_grep(int argc,
 	 * Optimize out the case where the amount of matches is limited to zero.
 	 * We do this to keep results consistent with GNU grep(1).
 	 */
-	if (opt.max_count == 0)
-		return 1;
+	if (opt.max_count == 0) {
+		ret = 1;
+		goto out;
+	}
 
 	if (show_in_pager) {
 		if (num_threads > 1)
@@ -1267,10 +1270,14 @@ int cmd_grep(int argc,
 		hit |= wait_all();
 	if (hit && show_in_pager)
 		run_pager(&opt, prefix);
+
+	ret = !hit;
+
+out:
 	clear_pathspec(&pathspec);
 	string_list_clear(&path_list, 0);
 	free_grep_patterns(&opt);
 	object_array_clear(&list);
 	free_repos();
-	return !hit;
+	return ret;
 }
diff --git a/t/t7810-grep.sh b/t/t7810-grep.sh
index af2cf2f78ab890..9e7681f083163d 100755
--- a/t/t7810-grep.sh
+++ b/t/t7810-grep.sh
@@ -9,6 +9,7 @@ test_description='git grep various.
 GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
 export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
 
+TEST_PASSES_SANITIZE_LEAK=true
 . ./test-lib.sh
 
 test_invalid_grep_expression() {

From ab28bc860f33ae21ef6ac0eee38b3fae0389f02e Mon Sep 17 00:00:00 2001
From: Patrick Steinhardt <ps@pks.im>
Date: Mon, 21 Oct 2024 11:28:15 +0200
Subject: [PATCH 26/69] revision: fix leaking bloom filters

The memory allocated by `prepare_to_use_bloom_filter()` is not released
by `release_revisions()`, causing a memory leak. Plug it.

Signed-off-by: Patrick Steinhardt <ps@pks.im>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 revision.c           | 5 +++++
 t/t4216-log-bloom.sh | 1 +
 2 files changed, 6 insertions(+)

diff --git a/revision.c b/revision.c
index f5f5b84f2b0836..8df75b82249b36 100644
--- a/revision.c
+++ b/revision.c
@@ -3227,6 +3227,11 @@ void release_revisions(struct rev_info *revs)
 	clear_decoration(&revs->treesame, free);
 	line_log_free(revs);
 	oidset_clear(&revs->missing_commits);
+
+	for (int i = 0; i < revs->bloom_keys_nr; i++)
+		clear_bloom_key(&revs->bloom_keys[i]);
+	FREE_AND_NULL(revs->bloom_keys);
+	revs->bloom_keys_nr = 0;
 }
 
 static void add_child(struct rev_info *revs, struct commit *parent, struct commit *child)
diff --git a/t/t4216-log-bloom.sh b/t/t4216-log-bloom.sh
index 3f163dc396980f..8d22338f6aafe7 100755
--- a/t/t4216-log-bloom.sh
+++ b/t/t4216-log-bloom.sh
@@ -4,6 +4,7 @@ test_description='git log for a path with Bloom filters'
 GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
 export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
 
+TEST_PASSES_SANITIZE_LEAK=true
 . ./test-lib.sh
 . "$TEST_DIRECTORY"/lib-chunk.sh
 

From 27a1c1fd42029e9d42f4ecd779949edaa32f07ce Mon Sep 17 00:00:00 2001
From: Patrick Steinhardt <ps@pks.im>
Date: Mon, 21 Oct 2024 11:28:18 +0200
Subject: [PATCH 27/69] diff-lib: fix leaking diffopts in `do_diff_cache()`

In `do_diff_cache()` we initialize a new `rev_info` and then overwrite
its `diffopt` with a user-provided set of options. This can leak memory
because `repo_init_revisions()` may end up allocating memory for the
`diffopt` itself depending on the configuration. And as that field is
overwritten we won't ever free that.

Plug the memory leak by releasing the diffopts before we overwrite them.

Signed-off-by: Patrick Steinhardt <ps@pks.im>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 diff-lib.c           | 1 +
 t/t7610-mergetool.sh | 1 +
 2 files changed, 2 insertions(+)

diff --git a/diff-lib.c b/diff-lib.c
index 6b14b9596299f1..3cf353946f51ca 100644
--- a/diff-lib.c
+++ b/diff-lib.c
@@ -661,6 +661,7 @@ int do_diff_cache(const struct object_id *tree_oid, struct diff_options *opt)
 
 	repo_init_revisions(opt->repo, &revs, NULL);
 	copy_pathspec(&revs.prune_data, &opt->pathspec);
+	diff_free(&revs.diffopt);
 	revs.diffopt = *opt;
 	revs.diffopt.no_free = 1;
 
diff --git a/t/t7610-mergetool.sh b/t/t7610-mergetool.sh
index 22b3a85b3e960e..5c5e79e99052c3 100755
--- a/t/t7610-mergetool.sh
+++ b/t/t7610-mergetool.sh
@@ -10,6 +10,7 @@ Testing basic merge tool invocation'
 GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
 export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
 
+TEST_PASSES_SANITIZE_LEAK=true
 . ./test-lib.sh
 
 # All the mergetool test work by checking out a temporary branch based

From c0a1026884cfd1168e1e7f19128cdeb91db7c99e Mon Sep 17 00:00:00 2001
From: Patrick Steinhardt <ps@pks.im>
Date: Mon, 21 Oct 2024 11:28:20 +0200
Subject: [PATCH 28/69] pretty: clear signature check

The signature check of the formatting context is never released.
Fix this to plug the resulting memory leak.

Signed-off-by: Patrick Steinhardt <ps@pks.im>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 pretty.c                         | 1 +
 t/t4202-log.sh                   | 1 +
 t/t7031-verify-tag-signed-ssh.sh | 1 +
 t/t7510-signed-commit.sh         | 1 +
 t/t7528-signed-commit-ssh.sh     | 1 +
 5 files changed, 5 insertions(+)

diff --git a/pretty.c b/pretty.c
index 6403e268900b88..098378720a44c7 100644
--- a/pretty.c
+++ b/pretty.c
@@ -2032,6 +2032,7 @@ void repo_format_commit_message(struct repository *r,
 
 	free(context.commit_encoding);
 	repo_unuse_commit_buffer(r, commit, context.message);
+	signature_check_clear(&context.signature_check);
 }
 
 static void pp_header(struct pretty_print_context *pp,
diff --git a/t/t4202-log.sh b/t/t4202-log.sh
index 51f7beb59f88c8..35bec4089a33d7 100755
--- a/t/t4202-log.sh
+++ b/t/t4202-log.sh
@@ -5,6 +5,7 @@ test_description='git log'
 GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
 export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
 
+TEST_PASSES_SANITIZE_LEAK=true
 . ./test-lib.sh
 . "$TEST_DIRECTORY/lib-gpg.sh"
 . "$TEST_DIRECTORY/lib-terminal.sh"
diff --git a/t/t7031-verify-tag-signed-ssh.sh b/t/t7031-verify-tag-signed-ssh.sh
index 20913b37134426..2ee62c0729309f 100755
--- a/t/t7031-verify-tag-signed-ssh.sh
+++ b/t/t7031-verify-tag-signed-ssh.sh
@@ -4,6 +4,7 @@ test_description='signed tag tests'
 GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
 export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
 
+TEST_PASSES_SANITIZE_LEAK=true
 . ./test-lib.sh
 . "$TEST_DIRECTORY/lib-gpg.sh"
 
diff --git a/t/t7510-signed-commit.sh b/t/t7510-signed-commit.sh
index 0d2dd29fe6a12c..eb229082e407bf 100755
--- a/t/t7510-signed-commit.sh
+++ b/t/t7510-signed-commit.sh
@@ -4,6 +4,7 @@ test_description='signed commit tests'
 GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
 export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
 
+TEST_PASSES_SANITIZE_LEAK=true
 . ./test-lib.sh
 GNUPGHOME_NOT_USED=$GNUPGHOME
 . "$TEST_DIRECTORY/lib-gpg.sh"
diff --git a/t/t7528-signed-commit-ssh.sh b/t/t7528-signed-commit-ssh.sh
index 065f78063629cb..68e18856b66ab7 100755
--- a/t/t7528-signed-commit-ssh.sh
+++ b/t/t7528-signed-commit-ssh.sh
@@ -4,6 +4,7 @@ test_description='ssh signed commit tests'
 GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
 export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
 
+TEST_PASSES_SANITIZE_LEAK=true
 . ./test-lib.sh
 GNUPGHOME_NOT_USED=$GNUPGHOME
 . "$TEST_DIRECTORY/lib-gpg.sh"

From 5ca4de4e05df1b8f0e7ad803ca8af25182c2aada Mon Sep 17 00:00:00 2001
From: Patrick Steinhardt <ps@pks.im>
Date: Mon, 21 Oct 2024 11:28:23 +0200
Subject: [PATCH 29/69] upload-pack: fix leaking URI protocols

We don't clear `struct upload_pack::uri_protocols`, which causes a
memory leak. Fix this.

Signed-off-by: Patrick Steinhardt <ps@pks.im>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 t/t5702-protocol-v2.sh | 1 +
 upload-pack.c          | 1 +
 2 files changed, 2 insertions(+)

diff --git a/t/t5702-protocol-v2.sh b/t/t5702-protocol-v2.sh
index d3df81e7852d7d..e4ce059236851a 100755
--- a/t/t5702-protocol-v2.sh
+++ b/t/t5702-protocol-v2.sh
@@ -7,6 +7,7 @@ TEST_NO_CREATE_REPO=1
 GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
 export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
 
+TEST_PASSES_SANITIZE_LEAK=true
 . ./test-lib.sh
 
 # Test protocol v2 with 'git://' transport
diff --git a/upload-pack.c b/upload-pack.c
index 6d6e0f9f9800f9..b4a59c3518bd88 100644
--- a/upload-pack.c
+++ b/upload-pack.c
@@ -166,6 +166,7 @@ static void upload_pack_data_clear(struct upload_pack_data *data)
 	object_array_clear(&data->extra_edge_obj);
 	list_objects_filter_release(&data->filter_options);
 	string_list_clear(&data->allowed_filters, 0);
+	string_list_clear(&data->uri_protocols, 0);
 
 	free((char *)data->pack_objects_hook);
 }

From 27164f719e3656b006a64408f1d9bd153f6ea538 Mon Sep 17 00:00:00 2001
From: Patrick Steinhardt <ps@pks.im>
Date: Mon, 21 Oct 2024 11:28:29 +0200
Subject: [PATCH 30/69] builtin/commit: fix leaking change data contents

While we free the worktree change data, we never free its contents. Fix
this.

Signed-off-by: Patrick Steinhardt <ps@pks.im>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 builtin/commit.c                          | 9 ++++++++-
 t/t7500-commit-template-squash-signoff.sh | 1 +
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/builtin/commit.c b/builtin/commit.c
index 8db4e9df0c9944..18a55bd1b9189a 100644
--- a/builtin/commit.c
+++ b/builtin/commit.c
@@ -728,6 +728,13 @@ static void prepare_amend_commit(struct commit *commit, struct strbuf *sb,
 	repo_unuse_commit_buffer(the_repository, commit, buffer);
 }
 
+static void change_data_free(void *util, const char *str UNUSED)
+{
+	struct wt_status_change_data *d = util;
+	free(d->rename_source);
+	free(d);
+}
+
 static int prepare_to_commit(const char *index_file, const char *prefix,
 			     struct commit *current_head,
 			     struct wt_status *s,
@@ -991,7 +998,7 @@ static int prepare_to_commit(const char *index_file, const char *prefix,
 		s->use_color = 0;
 		committable = run_status(s->fp, index_file, prefix, 1, s);
 		s->use_color = saved_color_setting;
-		string_list_clear(&s->change, 1);
+		string_list_clear_func(&s->change, change_data_free);
 	} else {
 		struct object_id oid;
 		const char *parent = "HEAD";
diff --git a/t/t7500-commit-template-squash-signoff.sh b/t/t7500-commit-template-squash-signoff.sh
index 4dca8d97a772d6..379d3ed3413d82 100755
--- a/t/t7500-commit-template-squash-signoff.sh
+++ b/t/t7500-commit-template-squash-signoff.sh
@@ -7,6 +7,7 @@ test_description='git commit
 
 Tests for template, signoff, squash and -F functions.'
 
+TEST_PASSES_SANITIZE_LEAK=true
 . ./test-lib.sh
 
 . "$TEST_DIRECTORY"/lib-rebase.sh

From ecbb58e2956956d3b7ac688f43ae0946834bf949 Mon Sep 17 00:00:00 2001
From: Patrick Steinhardt <ps@pks.im>
Date: Mon, 21 Oct 2024 11:28:32 +0200
Subject: [PATCH 31/69] trailer: fix leaking trailer values

Fix leaking trailer values when replacing the value with a command or
when the token value is empty.

This leak is exposed by t7513, but plugging it does not make the whole
test suite pass.

Signed-off-by: Patrick Steinhardt <ps@pks.im>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 trailer.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/trailer.c b/trailer.c
index 682d74505bfbb9..f1eca6d5d15ff7 100644
--- a/trailer.c
+++ b/trailer.c
@@ -249,17 +249,23 @@ static char *apply_command(struct conf_info *conf, const char *arg)
 static void apply_item_command(struct trailer_item *in_tok, struct arg_item *arg_tok)
 {
 	if (arg_tok->conf.command || arg_tok->conf.cmd) {
-		const char *arg;
+		char *value_to_free = NULL;
+		char *arg;
+
 		if (arg_tok->value && arg_tok->value[0]) {
-			arg = arg_tok->value;
+			arg = (char *)arg_tok->value;
 		} else {
 			if (in_tok && in_tok->value)
 				arg = xstrdup(in_tok->value);
 			else
 				arg = xstrdup("");
+			value_to_free = arg_tok->value;
 		}
+
 		arg_tok->value = apply_command(&arg_tok->conf, arg);
-		free((char *)arg);
+
+		free(value_to_free);
+		free(arg);
 	}
 }
 

From 9d0482d2a5fadd7f8531eeb0859bcf04be506fe1 Mon Sep 17 00:00:00 2001
From: Patrick Steinhardt <ps@pks.im>
Date: Mon, 21 Oct 2024 11:28:35 +0200
Subject: [PATCH 32/69] trailer: fix leaking strbufs when formatting trailers

We are populating, but never releasing two string buffers in
`format_trailers()`, causing a memory leak. Plug this leak by lifting
those buffers outside of the loop and releasing them on function return.
This fixes the memory leaks, but also optimizes the loop as we don't
have to reallocate the buffers on every single iteration.

Signed-off-by: Patrick Steinhardt <ps@pks.im>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 t/t7513-interpret-trailers.sh |  1 +
 trailer.c                     | 13 ++++++++-----
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/t/t7513-interpret-trailers.sh b/t/t7513-interpret-trailers.sh
index 0f7d8938d984d9..38d6ccaa001f90 100755
--- a/t/t7513-interpret-trailers.sh
+++ b/t/t7513-interpret-trailers.sh
@@ -5,6 +5,7 @@
 
 test_description='git interpret-trailers'
 
+TEST_PASSES_SANITIZE_LEAK=true
 . ./test-lib.sh
 
 # When we want one trailing space at the end of each line, let's use sed
diff --git a/trailer.c b/trailer.c
index f1eca6d5d15ff7..24e4e56fdf8cc3 100644
--- a/trailer.c
+++ b/trailer.c
@@ -1111,16 +1111,19 @@ void format_trailers(const struct process_trailer_options *opts,
 		     struct list_head *trailers,
 		     struct strbuf *out)
 {
+	struct strbuf tok = STRBUF_INIT;
+	struct strbuf val = STRBUF_INIT;
 	size_t origlen = out->len;
 	struct list_head *pos;
 	struct trailer_item *item;
 
+
 	list_for_each(pos, trailers) {
 		item = list_entry(pos, struct trailer_item, list);
 		if (item->token) {
-			struct strbuf tok = STRBUF_INIT;
-			struct strbuf val = STRBUF_INIT;
+			strbuf_reset(&tok);
 			strbuf_addstr(&tok, item->token);
+			strbuf_reset(&val);
 			strbuf_addstr(&val, item->value);
 
 			/*
@@ -1151,9 +1154,6 @@ void format_trailers(const struct process_trailer_options *opts,
 				if (!opts->separator)
 					strbuf_addch(out, '\n');
 			}
-			strbuf_release(&tok);
-			strbuf_release(&val);
-
 		} else if (!opts->only_trailers) {
 			if (opts->separator && out->len != origlen) {
 				strbuf_addbuf(out, opts->separator);
@@ -1165,6 +1165,9 @@ void format_trailers(const struct process_trailer_options *opts,
 				strbuf_addch(out, '\n');
 		}
 	}
+
+	strbuf_release(&tok);
+	strbuf_release(&val);
 }
 
 void format_trailers_from_commit(const struct process_trailer_options *opts,

From 06d658163d3ea5797f6a7b64c1a376ee2f2e5706 Mon Sep 17 00:00:00 2001
From: Patrick Steinhardt <ps@pks.im>
Date: Mon, 21 Oct 2024 11:28:38 +0200
Subject: [PATCH 33/69] builtin/commit: fix leaking cleanup config

The cleanup string set by the config is leaking when it is being
overridden by an option. Fix this by tracking these via two separate
variables such that we can free the old value.

Signed-off-by: Patrick Steinhardt <ps@pks.im>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 builtin/commit.c            | 17 ++++++++++++-----
 t/t7502-commit-porcelain.sh |  1 +
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/builtin/commit.c b/builtin/commit.c
index 18a55bd1b9189a..71d674138c9a2b 100644
--- a/builtin/commit.c
+++ b/builtin/commit.c
@@ -135,7 +135,7 @@ static struct strvec trailer_args = STRVEC_INIT;
  * is specified explicitly.
  */
 static enum commit_msg_cleanup_mode cleanup_mode;
-static char *cleanup_arg;
+static char *cleanup_config;
 
 static enum commit_whence whence;
 static int use_editor = 1, include_status = 1;
@@ -1387,8 +1387,6 @@ static int parse_and_validate_options(int argc, const char *argv[],
 	if (0 <= edit_flag)
 		use_editor = edit_flag;
 
-	cleanup_mode = get_cleanup_mode(cleanup_arg, use_editor);
-
 	handle_untracked_files_arg(s);
 
 	if (all && argc > 0)
@@ -1636,8 +1634,10 @@ static int git_commit_config(const char *k, const char *v,
 		include_status = git_config_bool(k, v);
 		return 0;
 	}
-	if (!strcmp(k, "commit.cleanup"))
-		return git_config_string(&cleanup_arg, k, v);
+	if (!strcmp(k, "commit.cleanup")) {
+		FREE_AND_NULL(cleanup_config);
+		return git_config_string(&cleanup_config, k, v);
+	}
 	if (!strcmp(k, "commit.gpgsign")) {
 		sign_commit = git_config_bool(k, v) ? "" : NULL;
 		return 0;
@@ -1658,6 +1658,7 @@ int cmd_commit(int argc,
 	       struct repository *repo UNUSED)
 {
 	static struct wt_status s;
+	static const char *cleanup_arg = NULL;
 	static struct option builtin_commit_options[] = {
 		OPT__QUIET(&quiet, N_("suppress summary after successful commit")),
 		OPT__VERBOSE(&verbose, N_("show diff in commit message template")),
@@ -1757,6 +1758,12 @@ int cmd_commit(int argc,
 	if (verbose == -1)
 		verbose = (config_commit_verbose < 0) ? 0 : config_commit_verbose;
 
+	if (cleanup_arg) {
+		free(cleanup_config);
+		cleanup_config = xstrdup(cleanup_arg);
+	}
+	cleanup_mode = get_cleanup_mode(cleanup_config, use_editor);
+
 	if (dry_run)
 		return dry_run_commit(argv, prefix, current_head, &s);
 	index_file = prepare_index(argv, prefix, current_head, 0);
diff --git a/t/t7502-commit-porcelain.sh b/t/t7502-commit-porcelain.sh
index b37e2018a74a7b..84f1ff52b6738d 100755
--- a/t/t7502-commit-porcelain.sh
+++ b/t/t7502-commit-porcelain.sh
@@ -5,6 +5,7 @@ test_description='git commit porcelain-ish'
 GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
 export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
 
+TEST_PASSES_SANITIZE_LEAK=true
 . ./test-lib.sh
 
 commit_msg_is () {

From 7c6616737dbc87da939bad4d1571089beac25968 Mon Sep 17 00:00:00 2001
From: Patrick Steinhardt <ps@pks.im>
Date: Mon, 21 Oct 2024 11:28:40 +0200
Subject: [PATCH 34/69] transport-helper: fix leaking import/export marks

Fix leaking import and export marks for transport helpers.

Signed-off-by: Patrick Steinhardt <ps@pks.im>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 t/t5801-remote-helpers.sh | 1 +
 transport-helper.c        | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/t/t5801-remote-helpers.sh b/t/t5801-remote-helpers.sh
index d21877150ed82e..d4882288a30ca0 100755
--- a/t/t5801-remote-helpers.sh
+++ b/t/t5801-remote-helpers.sh
@@ -8,6 +8,7 @@ test_description='Test remote-helper import and export commands'
 GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
 export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
 
+TEST_PASSES_SANITIZE_LEAK=true
 . ./test-lib.sh
 . "$TEST_DIRECTORY"/lib-gpg.sh
 
diff --git a/transport-helper.c b/transport-helper.c
index 013ec79dc9cdc5..bc27653cdee211 100644
--- a/transport-helper.c
+++ b/transport-helper.c
@@ -399,6 +399,8 @@ static int release_helper(struct transport *transport)
 	int res = 0;
 	struct helper_data *data = transport->data;
 	refspec_clear(&data->rs);
+	free(data->import_marks);
+	free(data->export_marks);
 	res = disconnect_helper(transport);
 	free(transport->data);
 	return res;

From 8eded2ff0e60f829f823941b2764660bea7d36a6 Mon Sep 17 00:00:00 2001
From: Patrick Steinhardt <ps@pks.im>
Date: Mon, 21 Oct 2024 11:28:43 +0200
Subject: [PATCH 35/69] builtin/tag: fix leaking key ID on failure to sign

We do not free the key ID when signing a tag fails. Do so by using
the common exit path.

Signed-off-by: Patrick Steinhardt <ps@pks.im>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 builtin/tag.c  | 2 +-
 t/t7004-tag.sh | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/builtin/tag.c b/builtin/tag.c
index 93d10d59157d2e..c37c0a68fdaa16 100644
--- a/builtin/tag.c
+++ b/builtin/tag.c
@@ -164,7 +164,7 @@ static int do_sign(struct strbuf *buffer, struct object_id **compat_oid,
 	int ret = -1;
 
 	if (sign_buffer(buffer, &sig, keyid))
-		return -1;
+		goto out;
 
 	if (compat) {
 		const struct git_hash_algo *algo = the_repository->hash_algo;
diff --git a/t/t7004-tag.sh b/t/t7004-tag.sh
index b1316e62f46ded..42b3327e69bed3 100755
--- a/t/t7004-tag.sh
+++ b/t/t7004-tag.sh
@@ -10,6 +10,7 @@ Tests for operations with tags.'
 GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
 export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
 
+TEST_PASSES_SANITIZE_LEAK=true
 . ./test-lib.sh
 . "$TEST_DIRECTORY"/lib-gpg.sh
 . "$TEST_DIRECTORY"/lib-terminal.sh

From 9a2c5b013b2c80702afde2b2d5d4aa87e4a4e749 Mon Sep 17 00:00:00 2001
From: Patrick Steinhardt <ps@pks.im>
Date: Mon, 21 Oct 2024 11:28:48 +0200
Subject: [PATCH 36/69] combine-diff: fix leaking lost lines

The `cnt` variable tracks the number of lines in a patch diff. It can
happen though that there are no newlines, in which case we'd still end
up allocating our array of `sline`s. In fact, we always allocate it with
`cnt + 2` entries. But when we loop through the array to clear it at the
end of this function we only loop until `lno < cnt`, and thus we may not
end up releasing whatever the two extra `sline`s contain.

Plug this memory leak.

Signed-off-by: Patrick Steinhardt <ps@pks.im>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 combine-diff.c           | 2 +-
 t/t4038-diff-combined.sh | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/combine-diff.c b/combine-diff.c
index f6b624dc288d72..3c6d9507fec072 100644
--- a/combine-diff.c
+++ b/combine-diff.c
@@ -1220,7 +1220,7 @@ static void show_patch_diff(struct combine_diff_path *elem, int num_parent,
 	}
 	free(result);
 
-	for (lno = 0; lno < cnt; lno++) {
+	for (lno = 0; lno < cnt + 2; lno++) {
 		if (sline[lno].lost) {
 			struct lline *ll = sline[lno].lost;
 			while (ll) {
diff --git a/t/t4038-diff-combined.sh b/t/t4038-diff-combined.sh
index 2ce26e585c98c1..00190802d831f0 100755
--- a/t/t4038-diff-combined.sh
+++ b/t/t4038-diff-combined.sh
@@ -5,6 +5,7 @@ test_description='combined diff'
 GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
 export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
 
+TEST_PASSES_SANITIZE_LEAK=true
 . ./test-lib.sh
 . "$TEST_DIRECTORY"/lib-diff.sh
 

From 8977bcf842ff16e12824282250131480eb963d73 Mon Sep 17 00:00:00 2001
From: Patrick Steinhardt <ps@pks.im>
Date: Mon, 21 Oct 2024 11:28:50 +0200
Subject: [PATCH 37/69] dir: release untracked cache data

There are several cases where we invalidate untracked cache directory
entries where we do not free the underlying data, but reset the number
of entries. This causes us to leak memory because `free_untracked()`
will not iterate over any potential entries which we still had in the
array.

Fix this issue by freeing old entries. The leak is exposed by t7519, but
plugging it alone does not make the whole test suite pass.

Signed-off-by: Patrick Steinhardt <ps@pks.im>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 dir.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/dir.c b/dir.c
index e3ddd5b5296475..cb9782fa11f54c 100644
--- a/dir.c
+++ b/dir.c
@@ -1056,6 +1056,8 @@ static void do_invalidate_gitignore(struct untracked_cache_dir *dir)
 {
 	int i;
 	dir->valid = 0;
+	for (size_t i = 0; i < dir->untracked_nr; i++)
+		free(dir->untracked[i]);
 	dir->untracked_nr = 0;
 	for (i = 0; i < dir->dirs_nr; i++)
 		do_invalidate_gitignore(dir->dirs[i]);
@@ -1083,6 +1085,8 @@ static void invalidate_directory(struct untracked_cache *uc,
 		uc->dir_invalidated++;
 
 	dir->valid = 0;
+	for (size_t i = 0; i < dir->untracked_nr; i++)
+		free(dir->untracked[i]);
 	dir->untracked_nr = 0;
 	for (i = 0; i < dir->dirs_nr; i++)
 		dir->dirs[i]->recurse = 0;
@@ -3573,6 +3577,8 @@ static void write_one_dir(struct untracked_cache_dir *untracked,
 	 * for safety..
 	 */
 	if (!untracked->valid) {
+		for (size_t i = 0; i < untracked->untracked_nr; i++)
+			free(untracked->untracked[i]);
 		untracked->untracked_nr = 0;
 		untracked->check_only = 0;
 	}
@@ -3905,6 +3911,8 @@ static void invalidate_one_directory(struct untracked_cache *uc,
 {
 	uc->dir_invalidated++;
 	ucd->valid = 0;
+	for (size_t i = 0; i < ucd->untracked_nr; i++)
+		free(ucd->untracked[i]);
 	ucd->untracked_nr = 0;
 }
 

From 1fe066c43d74c4e5611643c4f391764033976512 Mon Sep 17 00:00:00 2001
From: Patrick Steinhardt <ps@pks.im>
Date: Mon, 21 Oct 2024 11:28:53 +0200
Subject: [PATCH 38/69] sparse-index: correctly free EWAH contents

While we free the `fsmonitor_dirty` member of `struct index_state`, we
do not free the contents of that EWAH. Do so by using `ewah_free()`
instead of `FREE_AND_NULL()`.

This leak is exposed by t7519, but plugging it alone does not make the
test suite pass.

Signed-off-by: Patrick Steinhardt <ps@pks.im>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 sparse-index.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/sparse-index.c b/sparse-index.c
index 3d7f2164e25ee5..2107840bfc5671 100644
--- a/sparse-index.c
+++ b/sparse-index.c
@@ -2,6 +2,7 @@
 
 #include "git-compat-util.h"
 #include "environment.h"
+#include "ewah/ewok.h"
 #include "gettext.h"
 #include "name-hash.h"
 #include "read-cache-ll.h"
@@ -242,7 +243,8 @@ int convert_to_sparse(struct index_state *istate, int flags)
 	cache_tree_update(istate, 0);
 
 	istate->fsmonitor_has_run_once = 0;
-	FREE_AND_NULL(istate->fsmonitor_dirty);
+	ewah_free(istate->fsmonitor_dirty);
+	istate->fsmonitor_dirty = NULL;
 	FREE_AND_NULL(istate->fsmonitor_last_update);
 
 	istate->sparse_index = INDEX_COLLAPSED;
@@ -438,7 +440,8 @@ void expand_index(struct index_state *istate, struct pattern_list *pl)
 	istate->cache_nr = full->cache_nr;
 	istate->cache_alloc = full->cache_alloc;
 	istate->fsmonitor_has_run_once = 0;
-	FREE_AND_NULL(istate->fsmonitor_dirty);
+	ewah_free(istate->fsmonitor_dirty);
+	istate->fsmonitor_dirty = NULL;
 	FREE_AND_NULL(istate->fsmonitor_last_update);
 
 	strbuf_release(&base);

From a4a3b8c7b4cef66629ae219a5a26ab7230c46849 Mon Sep 17 00:00:00 2001
From: Patrick Steinhardt <ps@pks.im>
Date: Mon, 21 Oct 2024 11:28:56 +0200
Subject: [PATCH 39/69] t/helper: stop re-initialization of `the_repository`

While "common-main.c" already initializes `the_repository` for us, we do
so a second time in the "read-cache" test helper. This causes a memory
leak because the old repository's contents aren't released.

Stop calling `initialize_repository()` to plug this leak.

Signed-off-by: Patrick Steinhardt <ps@pks.im>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 t/helper/test-read-cache.c  | 2 --
 t/t7519-status-fsmonitor.sh | 1 +
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/t/helper/test-read-cache.c b/t/helper/test-read-cache.c
index d285c656bd3a32..e277dde8e7107a 100644
--- a/t/helper/test-read-cache.c
+++ b/t/helper/test-read-cache.c
@@ -11,8 +11,6 @@ int cmd__read_cache(int argc, const char **argv)
 	int i, cnt = 1;
 	const char *name = NULL;
 
-	initialize_repository(the_repository);
-
 	if (argc > 1 && skip_prefix(argv[1], "--print-and-refresh=", &name)) {
 		argc--;
 		argv++;
diff --git a/t/t7519-status-fsmonitor.sh b/t/t7519-status-fsmonitor.sh
index 7ee69ecdd4aa2c..0f88a58a8192e8 100755
--- a/t/t7519-status-fsmonitor.sh
+++ b/t/t7519-status-fsmonitor.sh
@@ -2,6 +2,7 @@
 
 test_description='git status with file system watcher'
 
+TEST_PASSES_SANITIZE_LEAK=true
 . ./test-lib.sh
 
 # Note, after "git reset --hard HEAD" no extensions exist other than 'TREE'

From 44cbc972291c8d8fc58d7e54aaf035243493b00c Mon Sep 17 00:00:00 2001
From: Patrick Steinhardt <ps@pks.im>
Date: Mon, 21 Oct 2024 11:28:58 +0200
Subject: [PATCH 40/69] t/helper: fix leaking buffer in "dump-untracked-cache"

We never release the local `struct strbuf base` buffer, thus leaking
memory. Fix this leak.

This leak is exposed by t7063, but plugging it alone does not make the
whole test suite pass.

Signed-off-by: Patrick Steinhardt <ps@pks.im>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 t/helper/test-dump-untracked-cache.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/t/helper/test-dump-untracked-cache.c b/t/helper/test-dump-untracked-cache.c
index 4f010d53249520..b2e70837a90a94 100644
--- a/t/helper/test-dump-untracked-cache.c
+++ b/t/helper/test-dump-untracked-cache.c
@@ -68,5 +68,7 @@ int cmd__dump_untracked_cache(int ac UNUSED, const char **av UNUSED)
 	printf("flags %08x\n", uc->dir_flags);
 	if (uc->root)
 		dump(uc->root, &base);
+
+	strbuf_release(&base);
 	return 0;
 }

From e0e970d6ae7434b2468cc9c6087da5c701217876 Mon Sep 17 00:00:00 2001
From: Patrick Steinhardt <ps@pks.im>
Date: Mon, 21 Oct 2024 11:29:01 +0200
Subject: [PATCH 41/69] dir: fix leak when parsing "status.showUntrackedFiles"

We use `repo_config_get_string()` to read "status.showUntrackedFiles"
from the config subsystem. This function allocates the result, but we
never free the result after parsing it.

The value never leaves the scope of the calling function, so refactor it
to instead use `repo_config_get_string_tmp()`, which does not hand over
ownership to the caller.

Signed-off-by: Patrick Steinhardt <ps@pks.im>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 dir.c                             | 4 ++--
 t/t7063-status-untracked-cache.sh | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/dir.c b/dir.c
index cb9782fa11f54c..7f35a3e3175355 100644
--- a/dir.c
+++ b/dir.c
@@ -2872,14 +2872,14 @@ static void set_untracked_ident(struct untracked_cache *uc)
 static unsigned new_untracked_cache_flags(struct index_state *istate)
 {
 	struct repository *repo = istate->repo;
-	char *val;
+	const char *val;
 
 	/*
 	 * This logic is coordinated with the setting of these flags in
 	 * wt-status.c#wt_status_collect_untracked(), and the evaluation
 	 * of the config setting in commit.c#git_status_config()
 	 */
-	if (!repo_config_get_string(repo, "status.showuntrackedfiles", &val) &&
+	if (!repo_config_get_string_tmp(repo, "status.showuntrackedfiles", &val) &&
 	    !strcmp(val, "all"))
 		return 0;
 
diff --git a/t/t7063-status-untracked-cache.sh b/t/t7063-status-untracked-cache.sh
index 8929ef481f926c..13fea7931cddef 100755
--- a/t/t7063-status-untracked-cache.sh
+++ b/t/t7063-status-untracked-cache.sh
@@ -5,6 +5,7 @@ test_description='test untracked cache'
 GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
 export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
 
+TEST_PASSES_SANITIZE_LEAK=true
 . ./test-lib.sh
 
 # On some filesystems (e.g. FreeBSD's ext2 and ufs) directory mtime

From 3ec42f3a4567c8144fe2095be214bac8766fb25b Mon Sep 17 00:00:00 2001
From: Patrick Steinhardt <ps@pks.im>
Date: Mon, 21 Oct 2024 11:29:07 +0200
Subject: [PATCH 42/69] builtin/merge: release output buffer after performing
 merge

The `obuf` member of `struct merge_options` is used to buffer output in
some cases. In order to not discard its allocated memory we only release
its contents in `merge_finalize()` when we're not currently recursing
into a subtree.

This results in some situations where we seemingly do not release the
buffer reliably. We thus have calls to `strbuf_release()` for this
buffer scattered across the codebase. But we're missing one callsite in
git-merge(1), which causes a memory leak.

We should ideally refactor this interface so that callers don't have to
know about any such internals. But for now, paper over the issue by
adding one more `strbuf_release()` call.

Signed-off-by: Patrick Steinhardt <ps@pks.im>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 builtin/merge.c                          | 1 +
 t/t6424-merge-unrelated-index-changes.sh | 1 +
 2 files changed, 2 insertions(+)

diff --git a/builtin/merge.c b/builtin/merge.c
index 84d0f3604bc36c..51038eaca849fa 100644
--- a/builtin/merge.c
+++ b/builtin/merge.c
@@ -754,6 +754,7 @@ static int try_merge_strategy(const char *strategy, struct commit_list *common,
 			clean = merge_recursive(&o, head, remoteheads->item,
 						reversed, &result);
 		free_commit_list(reversed);
+		strbuf_release(&o.obuf);
 
 		if (clean < 0) {
 			rollback_lock_file(&lock);
diff --git a/t/t6424-merge-unrelated-index-changes.sh b/t/t6424-merge-unrelated-index-changes.sh
index 7677c5f08d0e8f..a7ea8acb8451cd 100755
--- a/t/t6424-merge-unrelated-index-changes.sh
+++ b/t/t6424-merge-unrelated-index-changes.sh
@@ -2,6 +2,7 @@
 
 test_description="merges with unrelated index changes"
 
+TEST_PASSES_SANITIZE_LEAK=true
 . ./test-lib.sh
 
 # Testcase for some simple merges

From 931847ac0c807435c4848968d059c543118fcff4 Mon Sep 17 00:00:00 2001
From: Patrick Steinhardt <ps@pks.im>
Date: Mon, 21 Oct 2024 11:29:09 +0200
Subject: [PATCH 43/69] list-objects-filter-options: work around reported leak
 on error

This one is a little bit more curious. In t6112, we have a test that
exercises the `git rev-list --filter` option with invalid filters. We
execute git-rev-list(1) via `test_must_fail`, which means that we check
for leaks even though Git exits with an error code. This causes the
following leak:

    Direct leak of 27 byte(s) in 1 object(s) allocated from:
        #0 0x5555555e6946 in realloc.part.0 lsan_interceptors.cpp.o
        #1 0x5555558fb4b6 in xrealloc wrapper.c:137:8
        #2 0x5555558b6e06 in strbuf_grow strbuf.c:112:2
        #3 0x5555558b7550 in strbuf_add strbuf.c:311:2
        #4 0x5555557c1a88 in strbuf_addstr strbuf.h:310:2
        #5 0x5555557c1d4c in parse_list_objects_filter list-objects-filter-options.c:261:3
        #6 0x555555885ead in handle_revision_pseudo_opt revision.c:2899:3
        #7 0x555555884e20 in setup_revisions revision.c:3014:11
        #8 0x5555556c4b42 in cmd_rev_list builtin/rev-list.c:588:9
        #9 0x5555555ec5e3 in run_builtin git.c:483:11
        #10 0x5555555eb1e4 in handle_builtin git.c:749:13
        #11 0x5555555ec001 in run_argv git.c:819:4
        #12 0x5555555eaf94 in cmd_main git.c:954:19
        #13 0x5555556fd569 in main common-main.c:64:11
        #14 0x7ffff7ca714d in __libc_start_call_main (.../lib/libc.so.6+0x2a14d)
        #15 0x7ffff7ca7208 in __libc_start_main@GLIBC_2.2.5 (.../libc.so.6+0x2a208)
        #16 0x5555555ad064 in _start (git+0x59064)

This leak is valid, as we call `die()` and do not clean up the memory at
all. But what's curious is that this is the only leak reported, because
we don't clean up any other allocated memory, either, and I have no idea
why the leak sanitizer treats this buffer specially.

In any case, we can work around the leak by shuffling things around a
bit. Instead of calling `gently_parse_list_objects_filter()` and dying
after we have modified the filter spec, we simply do so beforehand. Like
this we don't allocate the buffer in the error case, which makes the
reported leak go away.

It's not pretty, but it manages to make t6112 leak free.

Signed-off-by: Patrick Steinhardt <ps@pks.im>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 list-objects-filter-options.c       | 17 +++++++----------
 t/t6112-rev-list-filters-objects.sh |  1 +
 2 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/list-objects-filter-options.c b/list-objects-filter-options.c
index 00611107d20293..fa72e81e4ad130 100644
--- a/list-objects-filter-options.c
+++ b/list-objects-filter-options.c
@@ -252,16 +252,14 @@ void parse_list_objects_filter(
 	const char *arg)
 {
 	struct strbuf errbuf = STRBUF_INIT;
-	int parse_error;
 
 	if (!filter_options->filter_spec.buf)
 		BUG("filter_options not properly initialized");
 
 	if (!filter_options->choice) {
+		if (gently_parse_list_objects_filter(filter_options, arg, &errbuf))
+			die("%s", errbuf.buf);
 		strbuf_addstr(&filter_options->filter_spec, arg);
-
-		parse_error = gently_parse_list_objects_filter(
-			filter_options, arg, &errbuf);
 	} else {
 		struct list_objects_filter_options *sub;
 
@@ -271,18 +269,17 @@ void parse_list_objects_filter(
 		 */
 		transform_to_combine_type(filter_options);
 
-		strbuf_addch(&filter_options->filter_spec, '+');
-		filter_spec_append_urlencode(filter_options, arg);
 		ALLOC_GROW_BY(filter_options->sub, filter_options->sub_nr, 1,
 			      filter_options->sub_alloc);
 		sub = &filter_options->sub[filter_options->sub_nr - 1];
 
 		list_objects_filter_init(sub);
-		parse_error = gently_parse_list_objects_filter(sub, arg,
-							       &errbuf);
+		if (gently_parse_list_objects_filter(sub, arg, &errbuf))
+			die("%s", errbuf.buf);
+
+		strbuf_addch(&filter_options->filter_spec, '+');
+		filter_spec_append_urlencode(filter_options, arg);
 	}
-	if (parse_error)
-		die("%s", errbuf.buf);
 }
 
 int opt_parse_list_objects_filter(const struct option *opt,
diff --git a/t/t6112-rev-list-filters-objects.sh b/t/t6112-rev-list-filters-objects.sh
index 0387f35a326d74..71e38491fa8bef 100755
--- a/t/t6112-rev-list-filters-objects.sh
+++ b/t/t6112-rev-list-filters-objects.sh
@@ -5,6 +5,7 @@ test_description='git rev-list using object filtering'
 GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
 export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
 
+TEST_PASSES_SANITIZE_LEAK=true
 . ./test-lib.sh
 
 # Test the blob:none filter.

From e6e6315594a61010bed9c86e8609d4d7e583c811 Mon Sep 17 00:00:00 2001
From: Patrick Steinhardt <ps@pks.im>
Date: Wed, 23 Oct 2024 11:55:56 +0200
Subject: [PATCH 44/69] reftable/system: move "dir.h" to its only user

We still include "dir.h" in "reftable/system.h" even though it is not
used by anything but by a single unit test. Move it over into that unit
test so that we don't accidentally use any functionality provided by it
in the reftable codebase.

Signed-off-by: Patrick Steinhardt <ps@pks.im>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 reftable/system.h               | 1 -
 t/unit-tests/t-reftable-stack.c | 1 +
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/reftable/system.h b/reftable/system.h
index 5ec85833434727..8564213475e27c 100644
--- a/reftable/system.h
+++ b/reftable/system.h
@@ -15,7 +15,6 @@ license that can be found in the LICENSE file or at
 #include "lockfile.h"
 #include "tempfile.h"
 #include "hash.h" /* hash ID, sizes.*/
-#include "dir.h" /* remove_dir_recursively, for tests.*/
 
 int hash_size(uint32_t id);
 
diff --git a/t/unit-tests/t-reftable-stack.c b/t/unit-tests/t-reftable-stack.c
index 72f6747064f621..1b4363a58fc5be 100644
--- a/t/unit-tests/t-reftable-stack.c
+++ b/t/unit-tests/t-reftable-stack.c
@@ -8,6 +8,7 @@ license that can be found in the LICENSE file or at
 
 #include "test-lib.h"
 #include "lib-reftable.h"
+#include "dir.h"
 #include "reftable/merged.h"
 #include "reftable/reader.h"
 #include "reftable/reftable-error.h"

From ac90e9f0b171cf91f2c103e9c77de919d8cd42c6 Mon Sep 17 00:00:00 2001
From: Patrick Steinhardt <ps@pks.im>
Date: Wed, 23 Oct 2024 11:55:59 +0200
Subject: [PATCH 45/69] reftable: explicitly handle hash format IDs

The hash format IDs are used for two different things across the
reftable codebase:

  - They are used as a 32 bit unsigned integer when reading and writing
    the header in order to identify the hash function.

  - They are used internally to identify which hash function is in use.

When one only considers the second use case one might think that one can
easily change the representation of those hash IDs. But because those
IDs end up in the reftable header and footer on disk it is important
that those never change.

Create separate constants `REFTABLE_FORMAT_ID_*` and use them in
contexts where we read or write reftable headers. This serves multiple
purposes:

  - It allows us to more easily discern cases where we actually use
    those constants for the on-disk format.

  - It detangles us from the same constants that are defined in
    libgit.a, which is another required step to convert the reftable
    library to become standalone.

  - It makes the next step easier where we stop using `GIT_*_FORMAT_ID`
    constants in favor of a custom enum.

Signed-off-by: Patrick Steinhardt <ps@pks.im>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 reftable/basics.h |  7 +++++++
 reftable/reader.c | 10 ++++++----
 reftable/writer.c | 16 +++++++++++++++-
 3 files changed, 28 insertions(+), 5 deletions(-)

diff --git a/reftable/basics.h b/reftable/basics.h
index 7aa46d7c30d650..86141602e748e4 100644
--- a/reftable/basics.h
+++ b/reftable/basics.h
@@ -150,4 +150,11 @@ int common_prefix_size(struct reftable_buf *a, struct reftable_buf *b);
 
 int hash_size(uint32_t id);
 
+/*
+ * Format IDs that identify the hash function used by a reftable. Note that
+ * these constants end up on disk and thus mustn't change.
+ */
+#define REFTABLE_FORMAT_ID_SHA1   ((uint32_t) 0x73686131)
+#define REFTABLE_FORMAT_ID_SHA256 ((uint32_t) 0x73323536)
+
 #endif
diff --git a/reftable/reader.c b/reftable/reader.c
index 90dc950b5774cc..64eb6938efed8f 100644
--- a/reftable/reader.c
+++ b/reftable/reader.c
@@ -109,16 +109,18 @@ static int parse_footer(struct reftable_reader *r, uint8_t *footer,
 	if (r->version == 1) {
 		r->hash_id = GIT_SHA1_FORMAT_ID;
 	} else {
-		r->hash_id = get_be32(f);
-		switch (r->hash_id) {
-		case GIT_SHA1_FORMAT_ID:
+		switch (get_be32(f)) {
+		case REFTABLE_FORMAT_ID_SHA1:
+			r->hash_id = GIT_SHA1_FORMAT_ID;
 			break;
-		case GIT_SHA256_FORMAT_ID:
+		case REFTABLE_FORMAT_ID_SHA256:
+			r->hash_id = GIT_SHA256_FORMAT_ID;
 			break;
 		default:
 			err = REFTABLE_FORMAT_ERROR;
 			goto done;
 		}
+
 		f += 4;
 	}
 
diff --git a/reftable/writer.c b/reftable/writer.c
index fd136794d5a27b..9aa45de63401a2 100644
--- a/reftable/writer.c
+++ b/reftable/writer.c
@@ -103,8 +103,22 @@ static int writer_write_header(struct reftable_writer *w, uint8_t *dest)
 	put_be64(dest + 8, w->min_update_index);
 	put_be64(dest + 16, w->max_update_index);
 	if (writer_version(w) == 2) {
-		put_be32(dest + 24, w->opts.hash_id);
+		uint32_t hash_id;
+
+		switch (w->opts.hash_id) {
+		case GIT_SHA1_FORMAT_ID:
+			hash_id = REFTABLE_FORMAT_ID_SHA1;
+			break;
+		case GIT_SHA256_FORMAT_ID:
+			hash_id = REFTABLE_FORMAT_ID_SHA256;
+			break;
+		default:
+			return -1;
+		}
+
+		put_be32(dest + 24, hash_id);
 	}
+
 	return header_size(writer_version(w));
 }
 

From da2d199f96de5f5cea8e81cc53dbaa09c86ab3b7 Mon Sep 17 00:00:00 2001
From: Patrick Steinhardt <ps@pks.im>
Date: Wed, 23 Oct 2024 11:56:02 +0200
Subject: [PATCH 46/69] reftable/system: stop depending on "hash.h"

We include "hash.h" in "reftable/system.h" such that we can use hash
format IDs as well as the raw size of SHA1 and SHA256. As we are in the
process of converting the reftable library to become standalone we of
course cannot rely on those constants anymore.

Introduce a new `enum reftable_hash` to replace internal uses of the
hash format IDs and new constants that replace internal uses of the hash
size. Adapt the reftable backend to set up the correct hash function.

Signed-off-by: Patrick Steinhardt <ps@pks.im>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 refs/reftable-backend.c             | 12 +++++-
 reftable/basics.c                   | 13 ++++---
 reftable/basics.h                   |  2 +-
 reftable/merged.c                   |  4 +-
 reftable/merged.h                   |  3 +-
 reftable/reader.c                   |  8 ++--
 reftable/reader.h                   |  4 +-
 reftable/reftable-basics.h          | 13 +++++++
 reftable/reftable-merged.h          |  4 +-
 reftable/reftable-reader.h          |  2 +-
 reftable/reftable-record.h          | 12 +++---
 reftable/reftable-writer.h          |  2 +-
 reftable/stack.c                    |  4 +-
 reftable/system.h                   |  3 --
 reftable/writer.c                   |  8 ++--
 t/helper/test-reftable.c            |  4 +-
 t/unit-tests/lib-reftable.c         |  4 +-
 t/unit-tests/lib-reftable.h         |  2 +-
 t/unit-tests/t-reftable-block.c     | 40 +++++++++----------
 t/unit-tests/t-reftable-merged.c    | 26 ++++++-------
 t/unit-tests/t-reftable-pq.c        |  2 +-
 t/unit-tests/t-reftable-reader.c    |  4 +-
 t/unit-tests/t-reftable-readwrite.c | 40 +++++++++----------
 t/unit-tests/t-reftable-record.c    | 59 +++++++++++++++--------------
 t/unit-tests/t-reftable-stack.c     | 34 ++++++++---------
 25 files changed, 166 insertions(+), 143 deletions(-)

diff --git a/refs/reftable-backend.c b/refs/reftable-backend.c
index 3c6107c7ce5380..7d86d920970e7c 100644
--- a/refs/reftable-backend.c
+++ b/refs/reftable-backend.c
@@ -15,6 +15,7 @@
 #include "../object.h"
 #include "../path.h"
 #include "../refs.h"
+#include "../reftable/reftable-basics.h"
 #include "../reftable/reftable-stack.h"
 #include "../reftable/reftable-record.h"
 #include "../reftable/reftable-error.h"
@@ -289,7 +290,16 @@ static struct ref_store *reftable_be_init(struct repository *repo,
 	refs->store_flags = store_flags;
 	refs->log_all_ref_updates = repo_settings_get_log_all_ref_updates(repo);
 
-	refs->write_options.hash_id = repo->hash_algo->format_id;
+	switch (repo->hash_algo->format_id) {
+	case GIT_SHA1_FORMAT_ID:
+		refs->write_options.hash_id = REFTABLE_HASH_SHA1;
+		break;
+	case GIT_SHA256_FORMAT_ID:
+		refs->write_options.hash_id = REFTABLE_HASH_SHA256;
+		break;
+	default:
+		BUG("unknown hash algorithm %d", repo->hash_algo->format_id);
+	}
 	refs->write_options.default_permissions = calc_shared_perm(0666 & ~mask);
 	refs->write_options.disable_auto_compact =
 		!git_env_bool("GIT_TEST_REFTABLE_AUTOCOMPACTION", 1);
diff --git a/reftable/basics.c b/reftable/basics.c
index bc4fcc91446e07..7d84a5d62dead1 100644
--- a/reftable/basics.c
+++ b/reftable/basics.c
@@ -271,14 +271,15 @@ int common_prefix_size(struct reftable_buf *a, struct reftable_buf *b)
 	return p;
 }
 
-int hash_size(uint32_t id)
+int hash_size(enum reftable_hash id)
 {
+	if (!id)
+		return REFTABLE_HASH_SIZE_SHA1;
 	switch (id) {
-	case 0:
-	case GIT_SHA1_FORMAT_ID:
-		return GIT_SHA1_RAWSZ;
-	case GIT_SHA256_FORMAT_ID:
-		return GIT_SHA256_RAWSZ;
+	case REFTABLE_HASH_SHA1:
+		return REFTABLE_HASH_SIZE_SHA1;
+	case REFTABLE_HASH_SHA256:
+		return REFTABLE_HASH_SIZE_SHA256;
 	}
 	abort();
 }
diff --git a/reftable/basics.h b/reftable/basics.h
index 86141602e748e4..0b77d047ada0e6 100644
--- a/reftable/basics.h
+++ b/reftable/basics.h
@@ -148,7 +148,7 @@ char *reftable_strdup(const char *str);
 /* Find the longest shared prefix size of `a` and `b` */
 int common_prefix_size(struct reftable_buf *a, struct reftable_buf *b);
 
-int hash_size(uint32_t id);
+int hash_size(enum reftable_hash id);
 
 /*
  * Format IDs that identify the hash function used by a reftable. Note that
diff --git a/reftable/merged.c b/reftable/merged.c
index 514d6facf45403..5b93e20f429453 100644
--- a/reftable/merged.c
+++ b/reftable/merged.c
@@ -181,7 +181,7 @@ static void iterator_from_merged_iter(struct reftable_iterator *it,
 
 int reftable_merged_table_new(struct reftable_merged_table **dest,
 			      struct reftable_reader **readers, size_t n,
-			      uint32_t hash_id)
+			      enum reftable_hash hash_id)
 {
 	struct reftable_merged_table *m = NULL;
 	uint64_t last_max = 0;
@@ -293,7 +293,7 @@ int reftable_merged_table_init_log_iterator(struct reftable_merged_table *mt,
 	return merged_table_init_iter(mt, it, BLOCK_TYPE_LOG);
 }
 
-uint32_t reftable_merged_table_hash_id(struct reftable_merged_table *mt)
+enum reftable_hash reftable_merged_table_hash_id(struct reftable_merged_table *mt)
 {
 	return mt->hash_id;
 }
diff --git a/reftable/merged.h b/reftable/merged.h
index 89bd0c4b35b8e6..13a5fe4154e631 100644
--- a/reftable/merged.h
+++ b/reftable/merged.h
@@ -10,11 +10,12 @@ license that can be found in the LICENSE file or at
 #define MERGED_H
 
 #include "system.h"
+#include "basics.h"
 
 struct reftable_merged_table {
 	struct reftable_reader **readers;
 	size_t readers_len;
-	uint32_t hash_id;
+	enum reftable_hash hash_id;
 
 	/* If unset, produce deletions. This is useful for compaction. For the
 	 * full stack, deletions should be produced. */
diff --git a/reftable/reader.c b/reftable/reader.c
index 64eb6938efed8f..ea82955c9bcf1d 100644
--- a/reftable/reader.c
+++ b/reftable/reader.c
@@ -67,7 +67,7 @@ static int reader_get_block(struct reftable_reader *r,
 	return block_source_read_block(&r->source, dest, off, sz);
 }
 
-uint32_t reftable_reader_hash_id(struct reftable_reader *r)
+enum reftable_hash reftable_reader_hash_id(struct reftable_reader *r)
 {
 	return r->hash_id;
 }
@@ -107,14 +107,14 @@ static int parse_footer(struct reftable_reader *r, uint8_t *footer,
 	f += 8;
 
 	if (r->version == 1) {
-		r->hash_id = GIT_SHA1_FORMAT_ID;
+		r->hash_id = REFTABLE_HASH_SHA1;
 	} else {
 		switch (get_be32(f)) {
 		case REFTABLE_FORMAT_ID_SHA1:
-			r->hash_id = GIT_SHA1_FORMAT_ID;
+			r->hash_id = REFTABLE_HASH_SHA1;
 			break;
 		case REFTABLE_FORMAT_ID_SHA256:
-			r->hash_id = GIT_SHA256_FORMAT_ID;
+			r->hash_id = REFTABLE_HASH_SHA256;
 			break;
 		default:
 			err = REFTABLE_FORMAT_ERROR;
diff --git a/reftable/reader.h b/reftable/reader.h
index 010fbfe85118c3..d2b48a4849970c 100644
--- a/reftable/reader.h
+++ b/reftable/reader.h
@@ -37,8 +37,8 @@ struct reftable_reader {
 	/* Size of the file, excluding the footer. */
 	uint64_t size;
 
-	/* 'sha1' for SHA1, 's256' for SHA-256 */
-	uint32_t hash_id;
+	/* The hash function used for ref records. */
+	enum reftable_hash hash_id;
 
 	uint32_t block_size;
 	uint64_t min_update_index;
diff --git a/reftable/reftable-basics.h b/reftable/reftable-basics.h
index 6e8e636b7166fd..e0397ed5836969 100644
--- a/reftable/reftable-basics.h
+++ b/reftable/reftable-basics.h
@@ -11,6 +11,19 @@
 
 #include <stddef.h>
 
+/*
+ * Hash functions understood by the reftable library. Note that the values are
+ * arbitrary and somewhat random such that we can easily detect cases where the
+ * hash hasn't been properly set up.
+ */
+enum reftable_hash {
+	REFTABLE_HASH_SHA1   = 89,
+	REFTABLE_HASH_SHA256 = 247,
+};
+#define REFTABLE_HASH_SIZE_SHA1   20
+#define REFTABLE_HASH_SIZE_SHA256 32
+#define REFTABLE_HASH_SIZE_MAX    REFTABLE_HASH_SIZE_SHA256
+
 /* Overrides the functions to use for memory management. */
 void reftable_set_alloc(void *(*malloc)(size_t),
 			void *(*realloc)(void *, size_t), void (*free)(void *));
diff --git a/reftable/reftable-merged.h b/reftable/reftable-merged.h
index a970d5dd89ac34..f2d01c3ef82149 100644
--- a/reftable/reftable-merged.h
+++ b/reftable/reftable-merged.h
@@ -34,7 +34,7 @@ struct reftable_reader;
  */
 int reftable_merged_table_new(struct reftable_merged_table **dest,
 			      struct reftable_reader **readers, size_t n,
-			      uint32_t hash_id);
+			      enum reftable_hash hash_id);
 
 /* Initialize a merged table iterator for reading refs. */
 int reftable_merged_table_init_ref_iterator(struct reftable_merged_table *mt,
@@ -56,6 +56,6 @@ reftable_merged_table_min_update_index(struct reftable_merged_table *mt);
 void reftable_merged_table_free(struct reftable_merged_table *m);
 
 /* return the hash ID of the merged table. */
-uint32_t reftable_merged_table_hash_id(struct reftable_merged_table *m);
+enum reftable_hash reftable_merged_table_hash_id(struct reftable_merged_table *m);
 
 #endif
diff --git a/reftable/reftable-reader.h b/reftable/reftable-reader.h
index 6a2d0b693f5e4a..0085fbb903291a 100644
--- a/reftable/reftable-reader.h
+++ b/reftable/reftable-reader.h
@@ -54,7 +54,7 @@ int reftable_reader_init_log_iterator(struct reftable_reader *r,
 				      struct reftable_iterator *it);
 
 /* returns the hash ID used in this table. */
-uint32_t reftable_reader_hash_id(struct reftable_reader *r);
+enum reftable_hash reftable_reader_hash_id(struct reftable_reader *r);
 
 /* return an iterator for the refs pointing to `oid`. */
 int reftable_reader_refs_for(struct reftable_reader *r,
diff --git a/reftable/reftable-record.h b/reftable/reftable-record.h
index 2d42463c5811ba..ddd48eb5798577 100644
--- a/reftable/reftable-record.h
+++ b/reftable/reftable-record.h
@@ -9,7 +9,7 @@ license that can be found in the LICENSE file or at
 #ifndef REFTABLE_RECORD_H
 #define REFTABLE_RECORD_H
 
-#include "hash.h"
+#include "reftable-basics.h"
 #include <stdint.h>
 
 /*
@@ -40,10 +40,10 @@ struct reftable_ref_record {
 #define REFTABLE_NR_REF_VALUETYPES 4
 	} value_type;
 	union {
-		unsigned char val1[GIT_MAX_RAWSZ];
+		unsigned char val1[REFTABLE_HASH_SIZE_MAX];
 		struct {
-			unsigned char value[GIT_MAX_RAWSZ]; /* first hash  */
-			unsigned char target_value[GIT_MAX_RAWSZ]; /* second hash */
+			unsigned char value[REFTABLE_HASH_SIZE_MAX]; /* first hash  */
+			unsigned char target_value[REFTABLE_HASH_SIZE_MAX]; /* second hash */
 		} val2;
 		char *symref; /* referent, malloced 0-terminated string */
 	} value;
@@ -85,8 +85,8 @@ struct reftable_log_record {
 
 	union {
 		struct {
-			unsigned char new_hash[GIT_MAX_RAWSZ];
-			unsigned char old_hash[GIT_MAX_RAWSZ];
+			unsigned char new_hash[REFTABLE_HASH_SIZE_MAX];
+			unsigned char old_hash[REFTABLE_HASH_SIZE_MAX];
 			char *name;
 			char *email;
 			uint64_t time;
diff --git a/reftable/reftable-writer.h b/reftable/reftable-writer.h
index e4fc95378835ff..211860d08a4aac 100644
--- a/reftable/reftable-writer.h
+++ b/reftable/reftable-writer.h
@@ -33,7 +33,7 @@ struct reftable_write_options {
 	/* 4-byte identifier ("sha1", "s256") of the hash.
 	 * Defaults to SHA1 if unset
 	 */
-	uint32_t hash_id;
+	enum reftable_hash hash_id;
 
 	/* Default mode for creating files. If unset, use 0666 (+umask) */
 	unsigned int default_permissions;
diff --git a/reftable/stack.c b/reftable/stack.c
index c33979536efa3a..9ae716ff375f8e 100644
--- a/reftable/stack.c
+++ b/reftable/stack.c
@@ -73,7 +73,7 @@ int reftable_new_stack(struct reftable_stack **dest, const char *dir,
 	if (_opts)
 		opts = *_opts;
 	if (opts.hash_id == 0)
-		opts.hash_id = GIT_SHA1_FORMAT_ID;
+		opts.hash_id = REFTABLE_HASH_SHA1;
 
 	*dest = NULL;
 
@@ -1603,7 +1603,7 @@ struct segment suggest_compaction_segment(uint64_t *sizes, size_t n,
 
 static uint64_t *stack_table_sizes_for_compaction(struct reftable_stack *st)
 {
-	int version = (st->opts.hash_id == GIT_SHA1_FORMAT_ID) ? 1 : 2;
+	int version = (st->opts.hash_id == REFTABLE_HASH_SHA1) ? 1 : 2;
 	int overhead = header_size(version) - 1;
 	uint64_t *sizes;
 
diff --git a/reftable/system.h b/reftable/system.h
index 8564213475e27c..38d3534620e88e 100644
--- a/reftable/system.h
+++ b/reftable/system.h
@@ -14,8 +14,5 @@ license that can be found in the LICENSE file or at
 #include "git-compat-util.h"
 #include "lockfile.h"
 #include "tempfile.h"
-#include "hash.h" /* hash ID, sizes.*/
-
-int hash_size(uint32_t id);
 
 #endif
diff --git a/reftable/writer.c b/reftable/writer.c
index 9aa45de63401a2..ea2f831fc58876 100644
--- a/reftable/writer.c
+++ b/reftable/writer.c
@@ -79,7 +79,7 @@ static void options_set_defaults(struct reftable_write_options *opts)
 	}
 
 	if (opts->hash_id == 0) {
-		opts->hash_id = GIT_SHA1_FORMAT_ID;
+		opts->hash_id = REFTABLE_HASH_SHA1;
 	}
 	if (opts->block_size == 0) {
 		opts->block_size = DEFAULT_BLOCK_SIZE;
@@ -88,7 +88,7 @@ static void options_set_defaults(struct reftable_write_options *opts)
 
 static int writer_version(struct reftable_writer *w)
 {
-	return (w->opts.hash_id == 0 || w->opts.hash_id == GIT_SHA1_FORMAT_ID) ?
+	return (w->opts.hash_id == 0 || w->opts.hash_id == REFTABLE_HASH_SHA1) ?
 			     1 :
 			     2;
 }
@@ -106,10 +106,10 @@ static int writer_write_header(struct reftable_writer *w, uint8_t *dest)
 		uint32_t hash_id;
 
 		switch (w->opts.hash_id) {
-		case GIT_SHA1_FORMAT_ID:
+		case REFTABLE_HASH_SHA1:
 			hash_id = REFTABLE_FORMAT_ID_SHA1;
 			break;
-		case GIT_SHA256_FORMAT_ID:
+		case REFTABLE_HASH_SHA256:
 			hash_id = REFTABLE_FORMAT_ID_SHA256;
 			break;
 		default:
diff --git a/t/helper/test-reftable.c b/t/helper/test-reftable.c
index 5c8849d115b5a3..3c72ed985b3a44 100644
--- a/t/helper/test-reftable.c
+++ b/t/helper/test-reftable.c
@@ -156,7 +156,7 @@ int cmd__dump_reftable(int argc, const char **argv)
 	int opt_dump_blocks = 0;
 	int opt_dump_table = 0;
 	int opt_dump_stack = 0;
-	uint32_t opt_hash_id = GIT_SHA1_FORMAT_ID;
+	uint32_t opt_hash_id = REFTABLE_HASH_SHA1;
 	const char *arg = NULL, *argv0 = argv[0];
 
 	for (; argc > 1; argv++, argc--)
@@ -167,7 +167,7 @@ int cmd__dump_reftable(int argc, const char **argv)
 		else if (!strcmp("-t", argv[1]))
 			opt_dump_table = 1;
 		else if (!strcmp("-6", argv[1]))
-			opt_hash_id = GIT_SHA256_FORMAT_ID;
+			opt_hash_id = REFTABLE_HASH_SHA256;
 		else if (!strcmp("-s", argv[1]))
 			opt_dump_stack = 1;
 		else if (!strcmp("-?", argv[1]) || !strcmp("-h", argv[1])) {
diff --git a/t/unit-tests/lib-reftable.c b/t/unit-tests/lib-reftable.c
index 2ddf480588d36a..c1631f45275407 100644
--- a/t/unit-tests/lib-reftable.c
+++ b/t/unit-tests/lib-reftable.c
@@ -3,7 +3,7 @@
 #include "reftable/constants.h"
 #include "reftable/writer.h"
 
-void t_reftable_set_hash(uint8_t *p, int i, uint32_t id)
+void t_reftable_set_hash(uint8_t *p, int i, enum reftable_hash id)
 {
 	memset(p, (uint8_t)i, hash_size(id));
 }
@@ -82,7 +82,7 @@ void t_reftable_write_to_buf(struct reftable_buf *buf,
 		size_t off = i * (opts.block_size ? opts.block_size
 						  : DEFAULT_BLOCK_SIZE);
 		if (!off)
-			off = header_size(opts.hash_id == GIT_SHA256_FORMAT_ID ? 2 : 1);
+			off = header_size(opts.hash_id == REFTABLE_HASH_SHA256 ? 2 : 1);
 		check_char(buf->buf[off], ==, 'r');
 	}
 
diff --git a/t/unit-tests/lib-reftable.h b/t/unit-tests/lib-reftable.h
index d4950fed3da048..e4c360fa7eede9 100644
--- a/t/unit-tests/lib-reftable.h
+++ b/t/unit-tests/lib-reftable.h
@@ -6,7 +6,7 @@
 
 struct reftable_buf;
 
-void t_reftable_set_hash(uint8_t *p, int i, uint32_t id);
+void t_reftable_set_hash(uint8_t *p, int i, enum reftable_hash id);
 
 struct reftable_writer *t_reftable_strbuf_writer(struct reftable_buf *buf,
 						 struct reftable_write_options *opts);
diff --git a/t/unit-tests/t-reftable-block.c b/t/unit-tests/t-reftable-block.c
index f9af907117bce3..13e10807daed6f 100644
--- a/t/unit-tests/t-reftable-block.c
+++ b/t/unit-tests/t-reftable-block.c
@@ -36,7 +36,7 @@ static void t_ref_block_read_write(void)
 	block.len = block_size;
 	block_source_from_buf(&block.source ,&buf);
 	ret = block_writer_init(&bw, BLOCK_TYPE_REF, block.data, block_size,
-				header_off, hash_size(GIT_SHA1_FORMAT_ID));
+				header_off, hash_size(REFTABLE_HASH_SHA1));
 	check(!ret);
 
 	rec.u.ref.refname = (char *) "";
@@ -47,7 +47,7 @@ static void t_ref_block_read_write(void)
 	for (i = 0; i < N; i++) {
 		rec.u.ref.refname = xstrfmt("branch%02"PRIuMAX, (uintmax_t)i);
 		rec.u.ref.value_type = REFTABLE_REF_VAL1;
-		memset(rec.u.ref.value.val1, i, GIT_SHA1_RAWSZ);
+		memset(rec.u.ref.value.val1, i, REFTABLE_HASH_SIZE_SHA1);
 
 		recs[i] = rec;
 		ret = block_writer_add(&bw, &rec);
@@ -61,7 +61,7 @@ static void t_ref_block_read_write(void)
 
 	block_writer_release(&bw);
 
-	block_reader_init(&br, &block, header_off, block_size, GIT_SHA1_RAWSZ);
+	block_reader_init(&br, &block, header_off, block_size, REFTABLE_HASH_SIZE_SHA1);
 
 	block_iter_seek_start(&it, &br);
 
@@ -72,7 +72,7 @@ static void t_ref_block_read_write(void)
 			check_int(i, ==, N);
 			break;
 		}
-		check(reftable_record_equal(&recs[i], &rec, GIT_SHA1_RAWSZ));
+		check(reftable_record_equal(&recs[i], &rec, REFTABLE_HASH_SIZE_SHA1));
 	}
 
 	for (i = 0; i < N; i++) {
@@ -85,7 +85,7 @@ static void t_ref_block_read_write(void)
 		ret = block_iter_next(&it, &rec);
 		check_int(ret, ==, 0);
 
-		check(reftable_record_equal(&recs[i], &rec, GIT_SHA1_RAWSZ));
+		check(reftable_record_equal(&recs[i], &rec, REFTABLE_HASH_SIZE_SHA1));
 
 		want.len--;
 		ret = block_iter_seek_key(&it, &br, &want);
@@ -93,7 +93,7 @@ static void t_ref_block_read_write(void)
 
 		ret = block_iter_next(&it, &rec);
 		check_int(ret, ==, 0);
-		check(reftable_record_equal(&recs[10 * (i / 10)], &rec, GIT_SHA1_RAWSZ));
+		check(reftable_record_equal(&recs[10 * (i / 10)], &rec, REFTABLE_HASH_SIZE_SHA1));
 	}
 
 	block_reader_release(&br);
@@ -130,7 +130,7 @@ static void t_log_block_read_write(void)
 	block.len = block_size;
 	block_source_from_buf(&block.source ,&buf);
 	ret = block_writer_init(&bw, BLOCK_TYPE_LOG, block.data, block_size,
-				header_off, hash_size(GIT_SHA1_FORMAT_ID));
+				header_off, hash_size(REFTABLE_HASH_SHA1));
 	check(!ret);
 
 	for (i = 0; i < N; i++) {
@@ -150,7 +150,7 @@ static void t_log_block_read_write(void)
 
 	block_writer_release(&bw);
 
-	block_reader_init(&br, &block, header_off, block_size, GIT_SHA1_RAWSZ);
+	block_reader_init(&br, &block, header_off, block_size, REFTABLE_HASH_SIZE_SHA1);
 
 	block_iter_seek_start(&it, &br);
 
@@ -161,7 +161,7 @@ static void t_log_block_read_write(void)
 			check_int(i, ==, N);
 			break;
 		}
-		check(reftable_record_equal(&recs[i], &rec, GIT_SHA1_RAWSZ));
+		check(reftable_record_equal(&recs[i], &rec, REFTABLE_HASH_SIZE_SHA1));
 	}
 
 	for (i = 0; i < N; i++) {
@@ -175,7 +175,7 @@ static void t_log_block_read_write(void)
 		ret = block_iter_next(&it, &rec);
 		check_int(ret, ==, 0);
 
-		check(reftable_record_equal(&recs[i], &rec, GIT_SHA1_RAWSZ));
+		check(reftable_record_equal(&recs[i], &rec, REFTABLE_HASH_SIZE_SHA1));
 
 		want.len--;
 		ret = block_iter_seek_key(&it, &br, &want);
@@ -183,7 +183,7 @@ static void t_log_block_read_write(void)
 
 		ret = block_iter_next(&it, &rec);
 		check_int(ret, ==, 0);
-		check(reftable_record_equal(&recs[10 * (i / 10)], &rec, GIT_SHA1_RAWSZ));
+		check(reftable_record_equal(&recs[10 * (i / 10)], &rec, REFTABLE_HASH_SIZE_SHA1));
 	}
 
 	block_reader_release(&br);
@@ -220,7 +220,7 @@ static void t_obj_block_read_write(void)
 	block.len = block_size;
 	block_source_from_buf(&block.source, &buf);
 	ret = block_writer_init(&bw, BLOCK_TYPE_OBJ, block.data, block_size,
-				header_off, hash_size(GIT_SHA1_FORMAT_ID));
+				header_off, hash_size(REFTABLE_HASH_SHA1));
 	check(!ret);
 
 	for (i = 0; i < N; i++) {
@@ -242,7 +242,7 @@ static void t_obj_block_read_write(void)
 
 	block_writer_release(&bw);
 
-	block_reader_init(&br, &block, header_off, block_size, GIT_SHA1_RAWSZ);
+	block_reader_init(&br, &block, header_off, block_size, REFTABLE_HASH_SIZE_SHA1);
 
 	block_iter_seek_start(&it, &br);
 
@@ -253,7 +253,7 @@ static void t_obj_block_read_write(void)
 			check_int(i, ==, N);
 			break;
 		}
-		check(reftable_record_equal(&recs[i], &rec, GIT_SHA1_RAWSZ));
+		check(reftable_record_equal(&recs[i], &rec, REFTABLE_HASH_SIZE_SHA1));
 	}
 
 	for (i = 0; i < N; i++) {
@@ -266,7 +266,7 @@ static void t_obj_block_read_write(void)
 		ret = block_iter_next(&it, &rec);
 		check_int(ret, ==, 0);
 
-		check(reftable_record_equal(&recs[i], &rec, GIT_SHA1_RAWSZ));
+		check(reftable_record_equal(&recs[i], &rec, REFTABLE_HASH_SIZE_SHA1));
 	}
 
 	block_reader_release(&br);
@@ -304,7 +304,7 @@ static void t_index_block_read_write(void)
 	block.len = block_size;
 	block_source_from_buf(&block.source, &buf);
 	ret = block_writer_init(&bw, BLOCK_TYPE_INDEX, block.data, block_size,
-				header_off, hash_size(GIT_SHA1_FORMAT_ID));
+				header_off, hash_size(REFTABLE_HASH_SHA1));
 	check(!ret);
 
 	for (i = 0; i < N; i++) {
@@ -326,7 +326,7 @@ static void t_index_block_read_write(void)
 
 	block_writer_release(&bw);
 
-	block_reader_init(&br, &block, header_off, block_size, GIT_SHA1_RAWSZ);
+	block_reader_init(&br, &block, header_off, block_size, REFTABLE_HASH_SIZE_SHA1);
 
 	block_iter_seek_start(&it, &br);
 
@@ -337,7 +337,7 @@ static void t_index_block_read_write(void)
 			check_int(i, ==, N);
 			break;
 		}
-		check(reftable_record_equal(&recs[i], &rec, GIT_SHA1_RAWSZ));
+		check(reftable_record_equal(&recs[i], &rec, REFTABLE_HASH_SIZE_SHA1));
 	}
 
 	for (i = 0; i < N; i++) {
@@ -350,7 +350,7 @@ static void t_index_block_read_write(void)
 		ret = block_iter_next(&it, &rec);
 		check_int(ret, ==, 0);
 
-		check(reftable_record_equal(&recs[i], &rec, GIT_SHA1_RAWSZ));
+		check(reftable_record_equal(&recs[i], &rec, REFTABLE_HASH_SIZE_SHA1));
 
 		want.len--;
 		ret = block_iter_seek_key(&it, &br, &want);
@@ -358,7 +358,7 @@ static void t_index_block_read_write(void)
 
 		ret = block_iter_next(&it, &rec);
 		check_int(ret, ==, 0);
-		check(reftable_record_equal(&recs[10 * (i / 10)], &rec, GIT_SHA1_RAWSZ));
+		check(reftable_record_equal(&recs[10 * (i / 10)], &rec, REFTABLE_HASH_SIZE_SHA1));
 	}
 
 	block_reader_release(&br);
diff --git a/t/unit-tests/t-reftable-merged.c b/t/unit-tests/t-reftable-merged.c
index 484c18251f373f..0573d9470a6fbb 100644
--- a/t/unit-tests/t-reftable-merged.c
+++ b/t/unit-tests/t-reftable-merged.c
@@ -42,7 +42,7 @@ merged_table_from_records(struct reftable_ref_record **refs,
 		check(!err);
 	}
 
-	err = reftable_merged_table_new(&mt, *readers, n, GIT_SHA1_FORMAT_ID);
+	err = reftable_merged_table_new(&mt, *readers, n, REFTABLE_HASH_SHA1);
 	check(!err);
 	return mt;
 }
@@ -91,7 +91,7 @@ static void t_merged_single_record(void)
 
 	err = reftable_iterator_next_ref(&it, &ref);
 	check(!err);
-	check(reftable_ref_record_equal(&r2[0], &ref, GIT_SHA1_RAWSZ));
+	check(reftable_ref_record_equal(&r2[0], &ref, REFTABLE_HASH_SIZE_SHA1));
 	reftable_ref_record_release(&ref);
 	reftable_iterator_destroy(&it);
 	readers_destroy(readers, 3);
@@ -168,7 +168,7 @@ static void t_merged_refs(void)
 	check(!err);
 	err = reftable_iterator_seek_ref(&it, "a");
 	check(!err);
-	check_int(reftable_merged_table_hash_id(mt), ==, GIT_SHA1_FORMAT_ID);
+	check_int(reftable_merged_table_hash_id(mt), ==, REFTABLE_HASH_SHA1);
 	check_int(reftable_merged_table_min_update_index(mt), ==, 1);
 	check_int(reftable_merged_table_max_update_index(mt), ==, 3);
 
@@ -186,7 +186,7 @@ static void t_merged_refs(void)
 	check_int(ARRAY_SIZE(want), ==, len);
 	for (i = 0; i < len; i++)
 		check(reftable_ref_record_equal(want[i], &out[i],
-						 GIT_SHA1_RAWSZ));
+						 REFTABLE_HASH_SIZE_SHA1));
 	for (i = 0; i < len; i++)
 		reftable_ref_record_release(&out[i]);
 	reftable_free(out);
@@ -252,12 +252,12 @@ static void t_merged_seek_multiple_times(void)
 
 		err = reftable_iterator_next_ref(&it, &rec);
 		check(!err);
-		err = reftable_ref_record_equal(&rec, &r1[1], GIT_SHA1_RAWSZ);
+		err = reftable_ref_record_equal(&rec, &r1[1], REFTABLE_HASH_SIZE_SHA1);
 		check(err == 1);
 
 		err = reftable_iterator_next_ref(&it, &rec);
 		check(!err);
-		err = reftable_ref_record_equal(&rec, &r2[1], GIT_SHA1_RAWSZ);
+		err = reftable_ref_record_equal(&rec, &r2[1], REFTABLE_HASH_SIZE_SHA1);
 		check(err == 1);
 
 		err = reftable_iterator_next_ref(&it, &rec);
@@ -300,7 +300,7 @@ merged_table_from_log_records(struct reftable_log_record **logs,
 		check(!err);
 	}
 
-	err = reftable_merged_table_new(&mt, *readers, n, GIT_SHA1_FORMAT_ID);
+	err = reftable_merged_table_new(&mt, *readers, n, REFTABLE_HASH_SHA1);
 	check(!err);
 	return mt;
 }
@@ -377,7 +377,7 @@ static void t_merged_logs(void)
 	check(!err);
 	err = reftable_iterator_seek_log(&it, "a");
 	check(!err);
-	check_int(reftable_merged_table_hash_id(mt), ==, GIT_SHA1_FORMAT_ID);
+	check_int(reftable_merged_table_hash_id(mt), ==, REFTABLE_HASH_SHA1);
 	check_int(reftable_merged_table_min_update_index(mt), ==, 1);
 	check_int(reftable_merged_table_max_update_index(mt), ==, 3);
 
@@ -395,7 +395,7 @@ static void t_merged_logs(void)
 	check_int(ARRAY_SIZE(want), ==, len);
 	for (i = 0; i < len; i++)
 		check(reftable_log_record_equal(want[i], &out[i],
-						 GIT_SHA1_RAWSZ));
+						 REFTABLE_HASH_SIZE_SHA1));
 
 	err = merged_table_init_iter(mt, &it, BLOCK_TYPE_LOG);
 	check(!err);
@@ -404,7 +404,7 @@ static void t_merged_logs(void)
 	reftable_log_record_release(&out[0]);
 	err = reftable_iterator_next_log(&it, &out[0]);
 	check(!err);
-	check(reftable_log_record_equal(&out[0], &r3[0], GIT_SHA1_RAWSZ));
+	check(reftable_log_record_equal(&out[0], &r3[0], REFTABLE_HASH_SIZE_SHA1));
 	reftable_iterator_destroy(&it);
 
 	for (i = 0; i < len; i++)
@@ -448,11 +448,11 @@ static void t_default_write_opts(void)
 	check(!err);
 
 	hash_id = reftable_reader_hash_id(rd);
-	check_int(hash_id, ==, GIT_SHA1_FORMAT_ID);
+	check_int(hash_id, ==, REFTABLE_HASH_SHA1);
 
-	err = reftable_merged_table_new(&merged, &rd, 1, GIT_SHA256_FORMAT_ID);
+	err = reftable_merged_table_new(&merged, &rd, 1, REFTABLE_HASH_SHA256);
 	check_int(err, ==, REFTABLE_FORMAT_ERROR);
-	err = reftable_merged_table_new(&merged, &rd, 1, GIT_SHA1_FORMAT_ID);
+	err = reftable_merged_table_new(&merged, &rd, 1, REFTABLE_HASH_SHA1);
 	check(!err);
 
 	reftable_reader_decref(rd);
diff --git a/t/unit-tests/t-reftable-pq.c b/t/unit-tests/t-reftable-pq.c
index ada4c19f18afe7..272da05bea679a 100644
--- a/t/unit-tests/t-reftable-pq.c
+++ b/t/unit-tests/t-reftable-pq.c
@@ -132,7 +132,7 @@ static void t_merged_iter_pqueue_top(void)
 
 		merged_iter_pqueue_check(&pq);
 		check(pq_entry_equal(&top, &e));
-		check(reftable_record_equal(top.rec, &recs[i], GIT_SHA1_RAWSZ));
+		check(reftable_record_equal(top.rec, &recs[i], REFTABLE_HASH_SIZE_SHA1));
 		for (size_t j = 0; i < pq.len; j++) {
 			check(pq_less(&top, &pq.heap[j]));
 			check_int(top.index, >, j);
diff --git a/t/unit-tests/t-reftable-reader.c b/t/unit-tests/t-reftable-reader.c
index 19cb53b6415e94..546df6005e4cfc 100644
--- a/t/unit-tests/t-reftable-reader.c
+++ b/t/unit-tests/t-reftable-reader.c
@@ -31,7 +31,7 @@ static int t_reader_seek_once(void)
 	ret = reftable_iterator_next_ref(&it, &ref);
 	check(!ret);
 
-	ret = reftable_ref_record_equal(&ref, &records[0], GIT_SHA1_RAWSZ);
+	ret = reftable_ref_record_equal(&ref, &records[0], REFTABLE_HASH_SIZE_SHA1);
 	check_int(ret, ==, 1);
 
 	ret = reftable_iterator_next_ref(&it, &ref);
@@ -74,7 +74,7 @@ static int t_reader_reseek(void)
 		ret = reftable_iterator_next_ref(&it, &ref);
 		check(!ret);
 
-		ret = reftable_ref_record_equal(&ref, &records[0], GIT_SHA1_RAWSZ);
+		ret = reftable_ref_record_equal(&ref, &records[0], REFTABLE_HASH_SIZE_SHA1);
 		check_int(ret, ==, 1);
 
 		ret = reftable_iterator_next_ref(&it, &ref);
diff --git a/t/unit-tests/t-reftable-readwrite.c b/t/unit-tests/t-reftable-readwrite.c
index d279b86df0aeda..57896922eb1854 100644
--- a/t/unit-tests/t-reftable-readwrite.c
+++ b/t/unit-tests/t-reftable-readwrite.c
@@ -41,7 +41,7 @@ static void t_buffer(void)
 }
 
 static void write_table(char ***names, struct reftable_buf *buf, int N,
-			int block_size, uint32_t hash_id)
+			int block_size, enum reftable_hash hash_id)
 {
 	struct reftable_write_options opts = {
 		.block_size = block_size,
@@ -62,7 +62,7 @@ static void write_table(char ***names, struct reftable_buf *buf, int N,
 		refs[i].refname = (*names)[i] = xstrfmt("refs/heads/branch%02d", i);
 		refs[i].update_index = update_index;
 		refs[i].value_type = REFTABLE_REF_VAL1;
-		t_reftable_set_hash(refs[i].value.val1, i, GIT_SHA1_FORMAT_ID);
+		t_reftable_set_hash(refs[i].value.val1, i, REFTABLE_HASH_SHA1);
 	}
 
 	for (i = 0; i < N; i++) {
@@ -70,7 +70,7 @@ static void write_table(char ***names, struct reftable_buf *buf, int N,
 		logs[i].update_index = update_index;
 		logs[i].value_type = REFTABLE_LOG_UPDATE;
 		t_reftable_set_hash(logs[i].value.update.new_hash, i,
-				    GIT_SHA1_FORMAT_ID);
+				    REFTABLE_HASH_SHA1);
 		logs[i].value.update.message = (char *) "message";
 	}
 
@@ -104,7 +104,7 @@ static void t_log_buffer_size(void)
 	/* This tests buffer extension for log compression. Must use a random
 	   hash, to ensure that the compressed part is larger than the original.
 	*/
-	for (i = 0; i < GIT_SHA1_RAWSZ; i++) {
+	for (i = 0; i < REFTABLE_HASH_SIZE_SHA1; i++) {
 		log.value.update.old_hash[i] = (uint8_t)(git_rand() % 256);
 		log.value.update.new_hash[i] = (uint8_t)(git_rand() % 256);
 	}
@@ -191,9 +191,9 @@ static void t_log_write_read(void)
 		log.update_index = i;
 		log.value_type = REFTABLE_LOG_UPDATE;
 		t_reftable_set_hash(log.value.update.old_hash, i,
-				    GIT_SHA1_FORMAT_ID);
+				    REFTABLE_HASH_SHA1);
 		t_reftable_set_hash(log.value.update.new_hash, i + 1,
-				    GIT_SHA1_FORMAT_ID);
+				    REFTABLE_HASH_SHA1);
 
 		err = reftable_writer_add_log(w, &log);
 		check(!err);
@@ -326,7 +326,7 @@ static void t_table_read_write_sequential(void)
 	int err = 0;
 	int j = 0;
 
-	write_table(&names, &buf, N, 256, GIT_SHA1_FORMAT_ID);
+	write_table(&names, &buf, N, 256, REFTABLE_HASH_SHA1);
 
 	block_source_from_buf(&source, &buf);
 
@@ -361,7 +361,7 @@ static void t_table_write_small_table(void)
 	char **names;
 	struct reftable_buf buf = REFTABLE_BUF_INIT;
 	int N = 1;
-	write_table(&names, &buf, N, 4096, GIT_SHA1_FORMAT_ID);
+	write_table(&names, &buf, N, 4096, REFTABLE_HASH_SHA1);
 	check_int(buf.len, <, 200);
 	reftable_buf_release(&buf);
 	free_names(names);
@@ -378,7 +378,7 @@ static void t_table_read_api(void)
 	struct reftable_log_record log = { 0 };
 	struct reftable_iterator it = { 0 };
 
-	write_table(&names, &buf, N, 256, GIT_SHA1_FORMAT_ID);
+	write_table(&names, &buf, N, 256, REFTABLE_HASH_SHA1);
 
 	block_source_from_buf(&source, &buf);
 
@@ -400,7 +400,7 @@ static void t_table_read_api(void)
 	reftable_buf_release(&buf);
 }
 
-static void t_table_read_write_seek(int index, int hash_id)
+static void t_table_read_write_seek(int index, enum reftable_hash hash_id)
 {
 	char **names;
 	struct reftable_buf buf = REFTABLE_BUF_INIT;
@@ -467,24 +467,24 @@ static void t_table_read_write_seek(int index, int hash_id)
 
 static void t_table_read_write_seek_linear(void)
 {
-	t_table_read_write_seek(0, GIT_SHA1_FORMAT_ID);
+	t_table_read_write_seek(0, REFTABLE_HASH_SHA1);
 }
 
 static void t_table_read_write_seek_linear_sha256(void)
 {
-	t_table_read_write_seek(0, GIT_SHA256_FORMAT_ID);
+	t_table_read_write_seek(0, REFTABLE_HASH_SHA256);
 }
 
 static void t_table_read_write_seek_index(void)
 {
-	t_table_read_write_seek(1, GIT_SHA1_FORMAT_ID);
+	t_table_read_write_seek(1, REFTABLE_HASH_SHA1);
 }
 
 static void t_table_refs_for(int indexed)
 {
 	char **want_names;
 	int want_names_len = 0;
-	uint8_t want_hash[GIT_SHA1_RAWSZ];
+	uint8_t want_hash[REFTABLE_HASH_SIZE_SHA1];
 
 	struct reftable_write_options opts = {
 		.block_size = 256,
@@ -500,10 +500,10 @@ static void t_table_refs_for(int indexed)
 	want_names = reftable_calloc(N + 1, sizeof(*want_names));
 	check(want_names != NULL);
 
-	t_reftable_set_hash(want_hash, 4, GIT_SHA1_FORMAT_ID);
+	t_reftable_set_hash(want_hash, 4, REFTABLE_HASH_SHA1);
 
 	for (i = 0; i < N; i++) {
-		uint8_t hash[GIT_SHA1_RAWSZ];
+		uint8_t hash[REFTABLE_HASH_SIZE_SHA1];
 		char fill[51] = { 0 };
 		char name[100];
 		struct reftable_ref_record ref = { 0 };
@@ -517,9 +517,9 @@ static void t_table_refs_for(int indexed)
 
 		ref.value_type = REFTABLE_REF_VAL2;
 		t_reftable_set_hash(ref.value.val2.value, i / 4,
-				    GIT_SHA1_FORMAT_ID);
+				    REFTABLE_HASH_SHA1);
 		t_reftable_set_hash(ref.value.val2.target_value, 3 + i / 4,
-				    GIT_SHA1_FORMAT_ID);
+				    REFTABLE_HASH_SHA1);
 
 		/* 80 bytes / entry, so 3 entries per block. Yields 17
 		 */
@@ -527,8 +527,8 @@ static void t_table_refs_for(int indexed)
 		n = reftable_writer_add_ref(w, &ref);
 		check_int(n, ==, 0);
 
-		if (!memcmp(ref.value.val2.value, want_hash, GIT_SHA1_RAWSZ) ||
-		    !memcmp(ref.value.val2.target_value, want_hash, GIT_SHA1_RAWSZ))
+		if (!memcmp(ref.value.val2.value, want_hash, REFTABLE_HASH_SIZE_SHA1) ||
+		    !memcmp(ref.value.val2.target_value, want_hash, REFTABLE_HASH_SIZE_SHA1))
 			want_names[want_names_len++] = xstrdup(name);
 	}
 
diff --git a/t/unit-tests/t-reftable-record.c b/t/unit-tests/t-reftable-record.c
index eb98bf2da913df..42bc64cec877e5 100644
--- a/t/unit-tests/t-reftable-record.c
+++ b/t/unit-tests/t-reftable-record.c
@@ -7,6 +7,7 @@
 */
 
 #include "test-lib.h"
+#include "reftable/basics.h"
 #include "reftable/constants.h"
 #include "reftable/record.h"
 
@@ -17,10 +18,10 @@ static void t_copy(struct reftable_record *rec)
 
 	typ = reftable_record_type(rec);
 	reftable_record_init(&copy, typ);
-	reftable_record_copy_from(&copy, rec, GIT_SHA1_RAWSZ);
+	reftable_record_copy_from(&copy, rec, REFTABLE_HASH_SIZE_SHA1);
 	/* do it twice to catch memory leaks */
-	reftable_record_copy_from(&copy, rec, GIT_SHA1_RAWSZ);
-	check(reftable_record_equal(rec, &copy, GIT_SHA1_RAWSZ));
+	reftable_record_copy_from(&copy, rec, REFTABLE_HASH_SIZE_SHA1);
+	check(reftable_record_equal(rec, &copy, REFTABLE_HASH_SIZE_SHA1));
 
 	reftable_record_release(&copy);
 }
@@ -59,7 +60,7 @@ static void t_varint_roundtrip(void)
 
 static void set_hash(uint8_t *h, int j)
 {
-	for (int i = 0; i < hash_size(GIT_SHA1_FORMAT_ID); i++)
+	for (int i = 0; i < hash_size(REFTABLE_HASH_SHA1); i++)
 		h[i] = (j >> i) & 0xff;
 }
 
@@ -84,14 +85,14 @@ static void t_reftable_ref_record_comparison(void)
 		},
 	};
 
-	check(!reftable_record_equal(&in[0], &in[1], GIT_SHA1_RAWSZ));
+	check(!reftable_record_equal(&in[0], &in[1], REFTABLE_HASH_SIZE_SHA1));
 	check(!reftable_record_cmp(&in[0], &in[1]));
 
-	check(!reftable_record_equal(&in[1], &in[2], GIT_SHA1_RAWSZ));
+	check(!reftable_record_equal(&in[1], &in[2], REFTABLE_HASH_SIZE_SHA1));
 	check_int(reftable_record_cmp(&in[1], &in[2]), >, 0);
 
 	in[1].u.ref.value_type = in[0].u.ref.value_type;
-	check(reftable_record_equal(&in[0], &in[1], GIT_SHA1_RAWSZ));
+	check(reftable_record_equal(&in[0], &in[1], REFTABLE_HASH_SIZE_SHA1));
 	check(!reftable_record_cmp(&in[0], &in[1]));
 }
 
@@ -155,15 +156,15 @@ static void t_reftable_ref_record_roundtrip(void)
 		check_int(reftable_record_is_deletion(&in), ==, i == REFTABLE_REF_DELETION);
 
 		reftable_record_key(&in, &key);
-		n = reftable_record_encode(&in, dest, GIT_SHA1_RAWSZ);
+		n = reftable_record_encode(&in, dest, REFTABLE_HASH_SIZE_SHA1);
 		check_int(n, >, 0);
 
 		/* decode into a non-zero reftable_record to test for leaks. */
-		m = reftable_record_decode(&out, key, i, dest, GIT_SHA1_RAWSZ, &scratch);
+		m = reftable_record_decode(&out, key, i, dest, REFTABLE_HASH_SIZE_SHA1, &scratch);
 		check_int(n, ==, m);
 
 		check(reftable_ref_record_equal(&in.u.ref, &out.u.ref,
-						 GIT_SHA1_RAWSZ));
+						 REFTABLE_HASH_SIZE_SHA1));
 		reftable_record_release(&in);
 
 		reftable_buf_release(&key);
@@ -193,15 +194,15 @@ static void t_reftable_log_record_comparison(void)
 		},
 	};
 
-	check(!reftable_record_equal(&in[0], &in[1], GIT_SHA1_RAWSZ));
-	check(!reftable_record_equal(&in[1], &in[2], GIT_SHA1_RAWSZ));
+	check(!reftable_record_equal(&in[0], &in[1], REFTABLE_HASH_SIZE_SHA1));
+	check(!reftable_record_equal(&in[1], &in[2], REFTABLE_HASH_SIZE_SHA1));
 	check_int(reftable_record_cmp(&in[1], &in[2]), >, 0);
 	/* comparison should be reversed for equal keys, because
 	 * comparison is now performed on the basis of update indices */
 	check_int(reftable_record_cmp(&in[0], &in[1]), <, 0);
 
 	in[1].u.log.update_index = in[0].u.log.update_index;
-	check(reftable_record_equal(&in[0], &in[1], GIT_SHA1_RAWSZ));
+	check(reftable_record_equal(&in[0], &in[1], REFTABLE_HASH_SIZE_SHA1));
 	check(!reftable_record_cmp(&in[0], &in[1]));
 }
 
@@ -303,15 +304,15 @@ static void t_reftable_log_record_roundtrip(void)
 
 		reftable_record_key(&rec, &key);
 
-		n = reftable_record_encode(&rec, dest, GIT_SHA1_RAWSZ);
+		n = reftable_record_encode(&rec, dest, REFTABLE_HASH_SIZE_SHA1);
 		check_int(n, >=, 0);
 		valtype = reftable_record_val_type(&rec);
 		m = reftable_record_decode(&out, key, valtype, dest,
-					   GIT_SHA1_RAWSZ, &scratch);
+					   REFTABLE_HASH_SIZE_SHA1, &scratch);
 		check_int(n, ==, m);
 
 		check(reftable_log_record_equal(&in[i], &out.u.log,
-						 GIT_SHA1_RAWSZ));
+						 REFTABLE_HASH_SIZE_SHA1));
 		reftable_log_record_release(&in[i]);
 		reftable_buf_release(&key);
 		reftable_record_release(&out);
@@ -380,20 +381,20 @@ static void t_reftable_obj_record_comparison(void)
 		},
 	};
 
-	check(!reftable_record_equal(&in[0], &in[1], GIT_SHA1_RAWSZ));
+	check(!reftable_record_equal(&in[0], &in[1], REFTABLE_HASH_SIZE_SHA1));
 	check(!reftable_record_cmp(&in[0], &in[1]));
 
-	check(!reftable_record_equal(&in[1], &in[2], GIT_SHA1_RAWSZ));
+	check(!reftable_record_equal(&in[1], &in[2], REFTABLE_HASH_SIZE_SHA1));
 	check_int(reftable_record_cmp(&in[1], &in[2]), >, 0);
 
 	in[1].u.obj.offset_len = in[0].u.obj.offset_len;
-	check(reftable_record_equal(&in[0], &in[1], GIT_SHA1_RAWSZ));
+	check(reftable_record_equal(&in[0], &in[1], REFTABLE_HASH_SIZE_SHA1));
 	check(!reftable_record_cmp(&in[0], &in[1]));
 }
 
 static void t_reftable_obj_record_roundtrip(void)
 {
-	uint8_t testHash1[GIT_SHA1_RAWSZ] = { 1, 2, 3, 4, 0 };
+	uint8_t testHash1[REFTABLE_HASH_SIZE_SHA1] = { 1, 2, 3, 4, 0 };
 	uint64_t till9[] = { 1, 2, 3, 4, 500, 600, 700, 800, 9000 };
 	struct reftable_obj_record recs[3] = {
 		{
@@ -435,14 +436,14 @@ static void t_reftable_obj_record_roundtrip(void)
 		check(!reftable_record_is_deletion(&in));
 		t_copy(&in);
 		reftable_record_key(&in, &key);
-		n = reftable_record_encode(&in, dest, GIT_SHA1_RAWSZ);
+		n = reftable_record_encode(&in, dest, REFTABLE_HASH_SIZE_SHA1);
 		check_int(n, >, 0);
 		extra = reftable_record_val_type(&in);
 		m = reftable_record_decode(&out, key, extra, dest,
-					   GIT_SHA1_RAWSZ, &scratch);
+					   REFTABLE_HASH_SIZE_SHA1, &scratch);
 		check_int(n, ==, m);
 
-		check(reftable_record_equal(&in, &out, GIT_SHA1_RAWSZ));
+		check(reftable_record_equal(&in, &out, REFTABLE_HASH_SIZE_SHA1));
 		reftable_buf_release(&key);
 		reftable_record_release(&out);
 	}
@@ -473,14 +474,14 @@ static void t_reftable_index_record_comparison(void)
 	check(!reftable_buf_addstr(&in[1].u.idx.last_key, "refs/heads/master"));
 	check(!reftable_buf_addstr(&in[2].u.idx.last_key, "refs/heads/branch"));
 
-	check(!reftable_record_equal(&in[0], &in[1], GIT_SHA1_RAWSZ));
+	check(!reftable_record_equal(&in[0], &in[1], REFTABLE_HASH_SIZE_SHA1));
 	check(!reftable_record_cmp(&in[0], &in[1]));
 
-	check(!reftable_record_equal(&in[1], &in[2], GIT_SHA1_RAWSZ));
+	check(!reftable_record_equal(&in[1], &in[2], REFTABLE_HASH_SIZE_SHA1));
 	check_int(reftable_record_cmp(&in[1], &in[2]), >, 0);
 
 	in[1].u.idx.offset = in[0].u.idx.offset;
-	check(reftable_record_equal(&in[0], &in[1], GIT_SHA1_RAWSZ));
+	check(reftable_record_equal(&in[0], &in[1], REFTABLE_HASH_SIZE_SHA1));
 	check(!reftable_record_cmp(&in[0], &in[1]));
 
 	for (size_t i = 0; i < ARRAY_SIZE(in); i++)
@@ -516,15 +517,15 @@ static void t_reftable_index_record_roundtrip(void)
 
 	check(!reftable_record_is_deletion(&in));
 	check(!reftable_buf_cmp(&key, &in.u.idx.last_key));
-	n = reftable_record_encode(&in, dest, GIT_SHA1_RAWSZ);
+	n = reftable_record_encode(&in, dest, REFTABLE_HASH_SIZE_SHA1);
 	check_int(n, >, 0);
 
 	extra = reftable_record_val_type(&in);
-	m = reftable_record_decode(&out, key, extra, dest, GIT_SHA1_RAWSZ,
+	m = reftable_record_decode(&out, key, extra, dest, REFTABLE_HASH_SIZE_SHA1,
 				   &scratch);
 	check_int(m, ==, n);
 
-	check(reftable_record_equal(&in, &out, GIT_SHA1_RAWSZ));
+	check(reftable_record_equal(&in, &out, REFTABLE_HASH_SIZE_SHA1));
 
 	reftable_record_release(&out);
 	reftable_buf_release(&key);
diff --git a/t/unit-tests/t-reftable-stack.c b/t/unit-tests/t-reftable-stack.c
index 1b4363a58fc5be..13fd8d8f941fde 100644
--- a/t/unit-tests/t-reftable-stack.c
+++ b/t/unit-tests/t-reftable-stack.c
@@ -121,7 +121,7 @@ static void write_n_ref_tables(struct reftable_stack *st,
 
 		snprintf(buf, sizeof(buf), "refs/heads/branch-%04"PRIuMAX, (uintmax_t)i);
 		ref.refname = buf;
-		t_reftable_set_hash(ref.value.val1, i, GIT_SHA1_FORMAT_ID);
+		t_reftable_set_hash(ref.value.val1, i, REFTABLE_HASH_SHA1);
 
 		err = reftable_stack_add(st, &write_test_ref, &ref);
 		check(!err);
@@ -169,7 +169,7 @@ static void t_reftable_stack_add_one(void)
 
 	err = reftable_stack_read_ref(st, ref.refname, &dest);
 	check(!err);
-	check(reftable_ref_record_equal(&ref, &dest, GIT_SHA1_RAWSZ));
+	check(reftable_ref_record_equal(&ref, &dest, REFTABLE_HASH_SIZE_SHA1));
 	check_int(st->readers_len, >, 0);
 
 #ifndef GIT_WINDOWS_NATIVE
@@ -280,7 +280,7 @@ static void t_reftable_stack_transaction_api(void)
 	err = reftable_stack_read_ref(st, ref.refname, &dest);
 	check(!err);
 	check_int(REFTABLE_REF_SYMREF, ==, dest.value_type);
-	check(reftable_ref_record_equal(&ref, &dest, GIT_SHA1_RAWSZ));
+	check(reftable_ref_record_equal(&ref, &dest, REFTABLE_HASH_SIZE_SHA1));
 
 	reftable_ref_record_release(&dest);
 	reftable_stack_destroy(st);
@@ -340,7 +340,7 @@ static void t_reftable_stack_transaction_with_reload(void)
 	for (size_t i = 0; i < ARRAY_SIZE(refs); i++) {
 		err = reftable_stack_read_ref(st2, refs[i].refname, &ref);
 		check(!err);
-		check(reftable_ref_record_equal(&refs[i], &ref, GIT_SHA1_RAWSZ));
+		check(reftable_ref_record_equal(&refs[i], &ref, REFTABLE_HASH_SIZE_SHA1));
 	}
 
 	reftable_ref_record_release(&ref);
@@ -530,13 +530,13 @@ static void t_reftable_stack_add(void)
 		refs[i].refname = xstrdup(buf);
 		refs[i].update_index = i + 1;
 		refs[i].value_type = REFTABLE_REF_VAL1;
-		t_reftable_set_hash(refs[i].value.val1, i, GIT_SHA1_FORMAT_ID);
+		t_reftable_set_hash(refs[i].value.val1, i, REFTABLE_HASH_SHA1);
 
 		logs[i].refname = xstrdup(buf);
 		logs[i].update_index = N + i + 1;
 		logs[i].value_type = REFTABLE_LOG_UPDATE;
 		logs[i].value.update.email = xstrdup("identity@invalid");
-		t_reftable_set_hash(logs[i].value.update.new_hash, i, GIT_SHA1_FORMAT_ID);
+		t_reftable_set_hash(logs[i].value.update.new_hash, i, REFTABLE_HASH_SHA1);
 	}
 
 	for (i = 0; i < N; i++) {
@@ -562,7 +562,7 @@ static void t_reftable_stack_add(void)
 		int err = reftable_stack_read_ref(st, refs[i].refname, &dest);
 		check(!err);
 		check(reftable_ref_record_equal(&dest, refs + i,
-						 GIT_SHA1_RAWSZ));
+						 REFTABLE_HASH_SIZE_SHA1));
 		reftable_ref_record_release(&dest);
 	}
 
@@ -571,7 +571,7 @@ static void t_reftable_stack_add(void)
 		int err = reftable_stack_read_log(st, refs[i].refname, &dest);
 		check(!err);
 		check(reftable_log_record_equal(&dest, logs + i,
-						 GIT_SHA1_RAWSZ));
+						 REFTABLE_HASH_SIZE_SHA1));
 		reftable_log_record_release(&dest);
 	}
 
@@ -622,14 +622,14 @@ static void t_reftable_stack_iterator(void)
 		refs[i].refname = xstrfmt("branch%02"PRIuMAX, (uintmax_t)i);
 		refs[i].update_index = i + 1;
 		refs[i].value_type = REFTABLE_REF_VAL1;
-		t_reftable_set_hash(refs[i].value.val1, i, GIT_SHA1_FORMAT_ID);
+		t_reftable_set_hash(refs[i].value.val1, i, REFTABLE_HASH_SHA1);
 
 		logs[i].refname = xstrfmt("branch%02"PRIuMAX, (uintmax_t)i);
 		logs[i].update_index = i + 1;
 		logs[i].value_type = REFTABLE_LOG_UPDATE;
 		logs[i].value.update.email = xstrdup("johndoe@invalid");
 		logs[i].value.update.message = xstrdup("commit\n");
-		t_reftable_set_hash(logs[i].value.update.new_hash, i, GIT_SHA1_FORMAT_ID);
+		t_reftable_set_hash(logs[i].value.update.new_hash, i, REFTABLE_HASH_SHA1);
 	}
 
 	for (i = 0; i < N; i++) {
@@ -656,7 +656,7 @@ static void t_reftable_stack_iterator(void)
 		if (err > 0)
 			break;
 		check(!err);
-		check(reftable_ref_record_equal(&ref, &refs[i], GIT_SHA1_RAWSZ));
+		check(reftable_ref_record_equal(&ref, &refs[i], REFTABLE_HASH_SIZE_SHA1));
 		reftable_ref_record_release(&ref);
 	}
 	check_int(i, ==, N);
@@ -674,7 +674,7 @@ static void t_reftable_stack_iterator(void)
 		if (err > 0)
 			break;
 		check(!err);
-		check(reftable_log_record_equal(&log, &logs[i], GIT_SHA1_RAWSZ));
+		check(reftable_log_record_equal(&log, &logs[i], REFTABLE_HASH_SIZE_SHA1));
 		reftable_log_record_release(&log);
 	}
 	check_int(i, ==, N);
@@ -767,7 +767,7 @@ static void t_reftable_stack_tombstone(void)
 		if (i % 2 == 0) {
 			refs[i].value_type = REFTABLE_REF_VAL1;
 			t_reftable_set_hash(refs[i].value.val1, i,
-					    GIT_SHA1_FORMAT_ID);
+					    REFTABLE_HASH_SHA1);
 		}
 
 		logs[i].refname = xstrdup(buf);
@@ -776,7 +776,7 @@ static void t_reftable_stack_tombstone(void)
 		if (i % 2 == 0) {
 			logs[i].value_type = REFTABLE_LOG_UPDATE;
 			t_reftable_set_hash(logs[i].value.update.new_hash, i,
-					    GIT_SHA1_FORMAT_ID);
+					    REFTABLE_HASH_SHA1);
 			logs[i].value.update.email =
 				xstrdup("identity@invalid");
 		}
@@ -836,7 +836,7 @@ static void t_reftable_stack_hash_id(void)
 		.value.symref = (char *) "target",
 		.update_index = 1,
 	};
-	struct reftable_write_options opts32 = { .hash_id = GIT_SHA256_FORMAT_ID };
+	struct reftable_write_options opts32 = { .hash_id = REFTABLE_HASH_SHA256 };
 	struct reftable_stack *st32 = NULL;
 	struct reftable_write_options opts_default = { 0 };
 	struct reftable_stack *st_default = NULL;
@@ -859,7 +859,7 @@ static void t_reftable_stack_hash_id(void)
 	err = reftable_stack_read_ref(st_default, "master", &dest);
 	check(!err);
 
-	check(reftable_ref_record_equal(&ref, &dest, GIT_SHA1_RAWSZ));
+	check(reftable_ref_record_equal(&ref, &dest, REFTABLE_HASH_SIZE_SHA1));
 	reftable_ref_record_release(&dest);
 	reftable_stack_destroy(st);
 	reftable_stack_destroy(st_default);
@@ -909,7 +909,7 @@ static void t_reflog_expire(void)
 		logs[i].value.update.time = i;
 		logs[i].value.update.email = xstrdup("identity@invalid");
 		t_reftable_set_hash(logs[i].value.update.new_hash, i,
-				    GIT_SHA1_FORMAT_ID);
+				    REFTABLE_HASH_SHA1);
 	}
 
 	for (i = 1; i <= N; i++) {

From 3c58a50c5ee7b3008e6559e3d79f3ae89b8ed734 Mon Sep 17 00:00:00 2001
From: Patrick Steinhardt <ps@pks.im>
Date: Wed, 23 Oct 2024 11:56:04 +0200
Subject: [PATCH 47/69] reftable/stack: stop using `fsync_component()` directly

We're executing `fsync_component()` directly in the reftable library so
that we can fsync data to disk depending on "core.fsync". But as we're
in the process of converting the reftable library to become standalone
we cannot use that function in the library anymore.

Refactor the code such that users of the library can inject a custom
fsync function via the write options. This allows us to get rid of the
dependency on "write-or-die.h".

Signed-off-by: Patrick Steinhardt <ps@pks.im>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 refs/reftable-backend.c    |  7 ++++++
 reftable/reftable-writer.h |  6 +++++
 reftable/stack.c           | 49 +++++++++++++++++++++++++-------------
 3 files changed, 45 insertions(+), 17 deletions(-)

diff --git a/refs/reftable-backend.c b/refs/reftable-backend.c
index 7d86d920970e7c..2e774176eda3b2 100644
--- a/refs/reftable-backend.c
+++ b/refs/reftable-backend.c
@@ -24,6 +24,7 @@
 #include "../setup.h"
 #include "../strmap.h"
 #include "../trace2.h"
+#include "../write-or-die.h"
 #include "parse.h"
 #include "refs-internal.h"
 
@@ -273,6 +274,11 @@ static int reftable_be_config(const char *var, const char *value,
 	return 0;
 }
 
+static int reftable_be_fsync(int fd)
+{
+	return fsync_component(FSYNC_COMPONENT_REFERENCE, fd);
+}
+
 static struct ref_store *reftable_be_init(struct repository *repo,
 					  const char *gitdir,
 					  unsigned int store_flags)
@@ -304,6 +310,7 @@ static struct ref_store *reftable_be_init(struct repository *repo,
 	refs->write_options.disable_auto_compact =
 		!git_env_bool("GIT_TEST_REFTABLE_AUTOCOMPACTION", 1);
 	refs->write_options.lock_timeout_ms = 100;
+	refs->write_options.fsync = reftable_be_fsync;
 
 	git_config(reftable_be_config, &refs->write_options);
 
diff --git a/reftable/reftable-writer.h b/reftable/reftable-writer.h
index 211860d08a4aac..c85ef5a5bd1459 100644
--- a/reftable/reftable-writer.h
+++ b/reftable/reftable-writer.h
@@ -62,6 +62,12 @@ struct reftable_write_options {
 	 * negative value will cause us to block indefinitely.
 	 */
 	long lock_timeout_ms;
+
+	/*
+	 * Optional callback used to fsync files to disk. Falls back to using
+	 * fsync(3P) when unset.
+	 */
+	int (*fsync)(int fd);
 };
 
 /* reftable_block_stats holds statistics for a single block type */
diff --git a/reftable/stack.c b/reftable/stack.c
index 9ae716ff375f8e..df4f3237007308 100644
--- a/reftable/stack.c
+++ b/reftable/stack.c
@@ -8,7 +8,6 @@ license that can be found in the LICENSE file or at
 
 #include "stack.h"
 
-#include "../write-or-die.h"
 #include "system.h"
 #include "constants.h"
 #include "merged.h"
@@ -43,17 +42,28 @@ static int stack_filename(struct reftable_buf *dest, struct reftable_stack *st,
 	return 0;
 }
 
-static ssize_t reftable_fd_write(void *arg, const void *data, size_t sz)
+static int stack_fsync(struct reftable_stack *st, int fd)
 {
-	int *fdp = (int *)arg;
-	return write_in_full(*fdp, data, sz);
+	if (st->opts.fsync)
+		return st->opts.fsync(fd);
+	return fsync(fd);
 }
 
-static int reftable_fd_flush(void *arg)
+struct fd_writer {
+	struct reftable_stack *stack;
+	int fd;
+};
+
+static ssize_t fd_writer_write(void *arg, const void *data, size_t sz)
 {
-	int *fdp = (int *)arg;
+	struct fd_writer *writer = arg;
+	return write_in_full(writer->fd, data, sz);
+}
 
-	return fsync_component(FSYNC_COMPONENT_REFERENCE, *fdp);
+static int fd_writer_flush(void *arg)
+{
+	struct fd_writer *writer = arg;
+	return stack_fsync(writer->stack, writer->fd);
 }
 
 int reftable_new_stack(struct reftable_stack **dest, const char *dir,
@@ -765,7 +775,7 @@ int reftable_addition_commit(struct reftable_addition *add)
 		goto done;
 	}
 
-	err = fsync_component(FSYNC_COMPONENT_REFERENCE, lock_file_fd);
+	err = stack_fsync(add->stack, lock_file_fd);
 	if (err < 0) {
 		err = REFTABLE_IO_ERROR;
 		goto done;
@@ -858,8 +868,10 @@ int reftable_addition_add(struct reftable_addition *add,
 	struct reftable_buf next_name = REFTABLE_BUF_INIT;
 	struct reftable_writer *wr = NULL;
 	struct tempfile *tab_file = NULL;
+	struct fd_writer writer = {
+		.stack = add->stack,
+	};
 	int err = 0;
-	int tab_fd;
 
 	reftable_buf_reset(&next_name);
 
@@ -887,10 +899,10 @@ int reftable_addition_add(struct reftable_addition *add,
 			goto done;
 		}
 	}
-	tab_fd = get_tempfile_fd(tab_file);
 
-	err = reftable_writer_new(&wr, reftable_fd_write, reftable_fd_flush,
-				  &tab_fd, &add->stack->opts);
+	writer.fd = get_tempfile_fd(tab_file);
+	err = reftable_writer_new(&wr, fd_writer_write, fd_writer_flush,
+				  &writer, &add->stack->opts);
 	if (err < 0)
 		goto done;
 
@@ -973,8 +985,11 @@ static int stack_compact_locked(struct reftable_stack *st,
 	struct reftable_buf next_name = REFTABLE_BUF_INIT;
 	struct reftable_buf tab_file_path = REFTABLE_BUF_INIT;
 	struct reftable_writer *wr = NULL;
+	struct fd_writer writer=  {
+		.stack = st,
+	};
 	struct tempfile *tab_file;
-	int tab_fd, err = 0;
+	int err = 0;
 
 	err = format_name(&next_name, reftable_reader_min_update_index(st->readers[first]),
 			  reftable_reader_max_update_index(st->readers[last]));
@@ -994,7 +1009,6 @@ static int stack_compact_locked(struct reftable_stack *st,
 		err = REFTABLE_IO_ERROR;
 		goto done;
 	}
-	tab_fd = get_tempfile_fd(tab_file);
 
 	if (st->opts.default_permissions &&
 	    chmod(get_tempfile_path(tab_file), st->opts.default_permissions) < 0) {
@@ -1002,8 +1016,9 @@ static int stack_compact_locked(struct reftable_stack *st,
 		goto done;
 	}
 
-	err = reftable_writer_new(&wr, reftable_fd_write, reftable_fd_flush,
-				  &tab_fd, &st->opts);
+	writer.fd = get_tempfile_fd(tab_file);
+	err = reftable_writer_new(&wr, fd_writer_write, fd_writer_flush,
+				  &writer, &st->opts);
 	if (err < 0)
 		goto done;
 
@@ -1460,7 +1475,7 @@ static int stack_compact_range(struct reftable_stack *st,
 		goto done;
 	}
 
-	err = fsync_component(FSYNC_COMPONENT_REFERENCE, get_lock_file_fd(&tables_list_lock));
+	err = stack_fsync(st, get_lock_file_fd(&tables_list_lock));
 	if (err < 0) {
 		err = REFTABLE_IO_ERROR;
 		unlink(new_table_path.buf);

From 3f5e8d23cceb83eee92702c2139c9e7c7943b9eb Mon Sep 17 00:00:00 2001
From: Patrick Steinhardt <ps@pks.im>
Date: Wed, 23 Oct 2024 11:56:07 +0200
Subject: [PATCH 48/69] reftable/system: provide thin wrapper for tempfile
 subsystem

We use the tempfile subsystem to write temporary tables, but given that
we're in the process of converting the reftable library to become
standalone we cannot use this subsystem directly anymore. While we could
in theory convert the code to use mkstemp(3p) instead, we'd lose access
to our infrastructure that automatically prunes tempfiles via atexit(3p)
or signal handlers.

Provide a thin wrapper for the tempfile subsystem instead. This way,
the compatibility shim is fully self-contained in "reftable/system.c".
Downstream users of the reftable library would have to implement their
own tempfile shims by replacing "system.c" with a custom version.

Signed-off-by: Patrick Steinhardt <ps@pks.im>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 Makefile          |  1 +
 reftable/stack.c  | 57 +++++++++++++++++++----------------------------
 reftable/system.c | 49 ++++++++++++++++++++++++++++++++++++++++
 reftable/system.h | 41 +++++++++++++++++++++++++++++++++-
 4 files changed, 113 insertions(+), 35 deletions(-)
 create mode 100644 reftable/system.c

diff --git a/Makefile b/Makefile
index feeed6f9321a50..50a79ad83fc0e4 100644
--- a/Makefile
+++ b/Makefile
@@ -2722,6 +2722,7 @@ REFTABLE_OBJS += reftable/pq.o
 REFTABLE_OBJS += reftable/reader.o
 REFTABLE_OBJS += reftable/record.o
 REFTABLE_OBJS += reftable/stack.o
+REFTABLE_OBJS += reftable/system.o
 REFTABLE_OBJS += reftable/tree.o
 REFTABLE_OBJS += reftable/writer.o
 
diff --git a/reftable/stack.c b/reftable/stack.c
index df4f3237007308..67b2117a1129cb 100644
--- a/reftable/stack.c
+++ b/reftable/stack.c
@@ -16,7 +16,6 @@ license that can be found in the LICENSE file or at
 #include "reftable-record.h"
 #include "reftable-merged.h"
 #include "writer.h"
-#include "tempfile.h"
 
 static int stack_try_add(struct reftable_stack *st,
 			 int (*write_table)(struct reftable_writer *wr,
@@ -867,7 +866,7 @@ int reftable_addition_add(struct reftable_addition *add,
 	struct reftable_buf tab_file_name = REFTABLE_BUF_INIT;
 	struct reftable_buf next_name = REFTABLE_BUF_INIT;
 	struct reftable_writer *wr = NULL;
-	struct tempfile *tab_file = NULL;
+	struct reftable_tmpfile tab_file = REFTABLE_TMPFILE_INIT;
 	struct fd_writer writer = {
 		.stack = add->stack,
 	};
@@ -887,20 +886,18 @@ int reftable_addition_add(struct reftable_addition *add,
 	if (err < 0)
 		goto done;
 
-	tab_file = mks_tempfile(temp_tab_file_name.buf);
-	if (!tab_file) {
-		err = REFTABLE_IO_ERROR;
+	err = tmpfile_from_pattern(&tab_file, temp_tab_file_name.buf);
+	if (err < 0)
 		goto done;
-	}
 	if (add->stack->opts.default_permissions) {
-		if (chmod(get_tempfile_path(tab_file),
+		if (chmod(tab_file.path,
 			  add->stack->opts.default_permissions)) {
 			err = REFTABLE_IO_ERROR;
 			goto done;
 		}
 	}
 
-	writer.fd = get_tempfile_fd(tab_file);
+	writer.fd = tab_file.fd;
 	err = reftable_writer_new(&wr, fd_writer_write, fd_writer_flush,
 				  &writer, &add->stack->opts);
 	if (err < 0)
@@ -918,11 +915,9 @@ int reftable_addition_add(struct reftable_addition *add,
 	if (err < 0)
 		goto done;
 
-	err = close_tempfile_gently(tab_file);
-	if (err < 0) {
-		err = REFTABLE_IO_ERROR;
+	err = tmpfile_close(&tab_file);
+	if (err < 0)
 		goto done;
-	}
 
 	if (wr->min_update_index < add->next_update_index) {
 		err = REFTABLE_API_ERROR;
@@ -945,11 +940,9 @@ int reftable_addition_add(struct reftable_addition *add,
 	  On windows, this relies on rand() picking a unique destination name.
 	  Maybe we should do retry loop as well?
 	 */
-	err = rename_tempfile(&tab_file, tab_file_name.buf);
-	if (err < 0) {
-		err = REFTABLE_IO_ERROR;
+	err = tmpfile_rename(&tab_file, tab_file_name.buf);
+	if (err < 0)
 		goto done;
-	}
 
 	REFTABLE_ALLOC_GROW(add->new_tables, add->new_tables_len + 1,
 			    add->new_tables_cap);
@@ -960,7 +953,7 @@ int reftable_addition_add(struct reftable_addition *add,
 	add->new_tables[add->new_tables_len++] = reftable_buf_detach(&next_name);
 
 done:
-	delete_tempfile(&tab_file);
+	tmpfile_delete(&tab_file);
 	reftable_buf_release(&temp_tab_file_name);
 	reftable_buf_release(&tab_file_name);
 	reftable_buf_release(&next_name);
@@ -980,7 +973,7 @@ uint64_t reftable_stack_next_update_index(struct reftable_stack *st)
 static int stack_compact_locked(struct reftable_stack *st,
 				size_t first, size_t last,
 				struct reftable_log_expiry_config *config,
-				struct tempfile **tab_file_out)
+				struct reftable_tmpfile *tab_file_out)
 {
 	struct reftable_buf next_name = REFTABLE_BUF_INIT;
 	struct reftable_buf tab_file_path = REFTABLE_BUF_INIT;
@@ -988,7 +981,7 @@ static int stack_compact_locked(struct reftable_stack *st,
 	struct fd_writer writer=  {
 		.stack = st,
 	};
-	struct tempfile *tab_file;
+	struct reftable_tmpfile tab_file = REFTABLE_TMPFILE_INIT;
 	int err = 0;
 
 	err = format_name(&next_name, reftable_reader_min_update_index(st->readers[first]),
@@ -1004,19 +997,17 @@ static int stack_compact_locked(struct reftable_stack *st,
 	if (err < 0)
 		goto done;
 
-	tab_file = mks_tempfile(tab_file_path.buf);
-	if (!tab_file) {
-		err = REFTABLE_IO_ERROR;
+	err = tmpfile_from_pattern(&tab_file, tab_file_path.buf);
+	if (err < 0)
 		goto done;
-	}
 
 	if (st->opts.default_permissions &&
-	    chmod(get_tempfile_path(tab_file), st->opts.default_permissions) < 0) {
+	    chmod(tab_file.path, st->opts.default_permissions) < 0) {
 		err = REFTABLE_IO_ERROR;
 		goto done;
 	}
 
-	writer.fd = get_tempfile_fd(tab_file);
+	writer.fd = tab_file.fd;
 	err = reftable_writer_new(&wr, fd_writer_write, fd_writer_flush,
 				  &writer, &st->opts);
 	if (err < 0)
@@ -1030,15 +1021,15 @@ static int stack_compact_locked(struct reftable_stack *st,
 	if (err < 0)
 		goto done;
 
-	err = close_tempfile_gently(tab_file);
+	err = tmpfile_close(&tab_file);
 	if (err < 0)
 		goto done;
 
 	*tab_file_out = tab_file;
-	tab_file = NULL;
+	tab_file = REFTABLE_TMPFILE_INIT;
 
 done:
-	delete_tempfile(&tab_file);
+	tmpfile_delete(&tab_file);
 	reftable_writer_free(wr);
 	reftable_buf_release(&next_name);
 	reftable_buf_release(&tab_file_path);
@@ -1171,7 +1162,7 @@ static int stack_compact_range(struct reftable_stack *st,
 	struct reftable_buf table_name = REFTABLE_BUF_INIT;
 	struct lock_file tables_list_lock = LOCK_INIT;
 	struct lock_file *table_locks = NULL;
-	struct tempfile *new_table = NULL;
+	struct reftable_tmpfile new_table = REFTABLE_TMPFILE_INIT;
 	int is_empty_table = 0, err = 0;
 	size_t first_to_replace, last_to_replace;
 	size_t i, nlocks = 0;
@@ -1439,11 +1430,9 @@ static int stack_compact_range(struct reftable_stack *st,
 		if (err < 0)
 			goto done;
 
-		err = rename_tempfile(&new_table, new_table_path.buf);
-		if (err < 0) {
-			err = REFTABLE_IO_ERROR;
+		err = tmpfile_rename(&new_table, new_table_path.buf);
+		if (err < 0)
 			goto done;
-		}
 	}
 
 	/*
@@ -1515,7 +1504,7 @@ static int stack_compact_range(struct reftable_stack *st,
 		rollback_lock_file(&table_locks[i]);
 	reftable_free(table_locks);
 
-	delete_tempfile(&new_table);
+	tmpfile_delete(&new_table);
 	reftable_buf_release(&new_table_name);
 	reftable_buf_release(&new_table_path);
 	reftable_buf_release(&tables_list_buf);
diff --git a/reftable/system.c b/reftable/system.c
new file mode 100644
index 00000000000000..01f96f03d8493d
--- /dev/null
+++ b/reftable/system.c
@@ -0,0 +1,49 @@
+#include "system.h"
+#include "basics.h"
+#include "reftable-error.h"
+#include "../tempfile.h"
+
+int tmpfile_from_pattern(struct reftable_tmpfile *out, const char *pattern)
+{
+	struct tempfile *tempfile;
+
+	tempfile = mks_tempfile(pattern);
+	if (!tempfile)
+		return REFTABLE_IO_ERROR;
+
+	out->path = tempfile->filename.buf;
+	out->fd = tempfile->fd;
+	out->priv = tempfile;
+
+	return 0;
+}
+
+int tmpfile_close(struct reftable_tmpfile *t)
+{
+	struct tempfile *tempfile = t->priv;
+	int ret = close_tempfile_gently(tempfile);
+	t->fd = -1;
+	if (ret < 0)
+		return REFTABLE_IO_ERROR;
+	return 0;
+}
+
+int tmpfile_delete(struct reftable_tmpfile *t)
+{
+	struct tempfile *tempfile = t->priv;
+	int ret = delete_tempfile(&tempfile);
+	*t = REFTABLE_TMPFILE_INIT;
+	if (ret < 0)
+		return REFTABLE_IO_ERROR;
+	return 0;
+}
+
+int tmpfile_rename(struct reftable_tmpfile *t, const char *path)
+{
+	struct tempfile *tempfile = t->priv;
+	int ret = rename_tempfile(&tempfile, path);
+	*t = REFTABLE_TMPFILE_INIT;
+	if (ret < 0)
+		return REFTABLE_IO_ERROR;
+	return 0;
+}
diff --git a/reftable/system.h b/reftable/system.h
index 38d3534620e88e..e7595800907615 100644
--- a/reftable/system.h
+++ b/reftable/system.h
@@ -13,6 +13,45 @@ license that can be found in the LICENSE file or at
 
 #include "git-compat-util.h"
 #include "lockfile.h"
-#include "tempfile.h"
+
+/*
+ * An implementation-specific temporary file. By making this specific to the
+ * implementation it becomes possible to tie temporary files into any kind of
+ * signal or atexit handlers for cleanup on abnormal situations.
+ */
+struct reftable_tmpfile {
+	const char *path;
+	int fd;
+	void *priv;
+};
+#define REFTABLE_TMPFILE_INIT ((struct reftable_tmpfile) { .fd = -1, })
+
+/*
+ * Create a temporary file from a pattern similar to how mkstemp(3p) would.
+ * The `pattern` shall not be modified. On success, the structure at `out` has
+ * been initialized such that it is ready for use. Returns 0 on success, a
+ * reftable error code on error.
+ */
+int tmpfile_from_pattern(struct reftable_tmpfile *out, const char *pattern);
+
+/*
+ * Close the temporary file's file descriptor without removing the file itself.
+ * This is a no-op in case the file has already been closed beforehand. Returns
+ * 0 on success, a reftable error code on error.
+ */
+int tmpfile_close(struct reftable_tmpfile *t);
+
+/*
+ * Close the temporary file and delete it. This is a no-op in case the file has
+ * already been deleted or renamed beforehand. Returns 0 on success, a reftable
+ * error code on error.
+ */
+int tmpfile_delete(struct reftable_tmpfile *t);
+
+/*
+ * Rename the temporary file to the provided path. The temporary file must be
+ * active. Return 0 on success, a reftable error code on error.
+ */
+int tmpfile_rename(struct reftable_tmpfile *t, const char *path);
 
 #endif

From ec282c1e2df3e22d1687a17d26dc87a33c6ae8f3 Mon Sep 17 00:00:00 2001
From: Patrick Steinhardt <ps@pks.im>
Date: Wed, 23 Oct 2024 11:56:10 +0200
Subject: [PATCH 49/69] reftable/stack: drop only use of
 `get_locked_file_path()`

We've got a single callsite where we call `get_locked_file_path()`. As
we're about to convert our usage of the lockfile subsystem to instead go
through a compatibility shim, we'd have to implement more logic for this
single callsite. While that would be okay if Git was the only supposed
user of the reftable library, it's a bit more awkward when considering
that we have to reimplement this functionality for every user of the
library eventually.

Refactor the code such that we don't call `get_locked_file_path()`
anymore.

Signed-off-by: Patrick Steinhardt <ps@pks.im>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 reftable/stack.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/reftable/stack.c b/reftable/stack.c
index 67b2117a1129cb..c1a4e25e3a2fdb 100644
--- a/reftable/stack.c
+++ b/reftable/stack.c
@@ -1493,9 +1493,15 @@ static int stack_compact_range(struct reftable_stack *st,
 	 */
 	for (i = 0; i < nlocks; i++) {
 		struct lock_file *table_lock = &table_locks[i];
-		char *table_path = get_locked_file_path(table_lock);
-		unlink(table_path);
-		reftable_free(table_path);
+		const char *lock_path = get_lock_file_path(table_lock);
+
+		reftable_buf_reset(&table_name);
+		err = reftable_buf_add(&table_name, lock_path,
+				       strlen(lock_path) - strlen(".lock"));
+		if (err)
+			continue;
+
+		unlink(table_name.buf);
 	}
 
 done:

From 37403254727dc44640e34e415528bd7bcb1379f2 Mon Sep 17 00:00:00 2001
From: Patrick Steinhardt <ps@pks.im>
Date: Wed, 23 Oct 2024 11:56:15 +0200
Subject: [PATCH 50/69] reftable/system: provide thin wrapper for lockfile
 subsystem

We use the lockfile subsystem to write lockfiles for "tables.list". As
with the tempfile subsystem, the lockfile subsystem also hooks into our
infrastructure to prune stale locks via atexit(3p) or signal handlers.

Furthermore, the lockfile subsystem also handles locking timeouts, which
do add quite a bit of logic. Having to reimplement that in the context
of Git wouldn't make a whole lot of sense, and it is quite likely that
downstream users of the reftable library may have a better idea for how
exactly to implement timeouts.

So again, provide a thin wrapper for the lockfile subsystem instead such
that the compatibility shim is fully self-contained.

Signed-off-by: Patrick Steinhardt <ps@pks.im>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 reftable/stack.c                    | 63 ++++++++++-------------
 reftable/system.c                   | 77 +++++++++++++++++++++++++++++
 reftable/system.h                   | 45 ++++++++++++++++-
 t/unit-tests/lib-reftable.c         |  1 +
 t/unit-tests/t-reftable-block.c     |  1 +
 t/unit-tests/t-reftable-pq.c        |  1 +
 t/unit-tests/t-reftable-readwrite.c |  1 +
 t/unit-tests/t-reftable-stack.c     |  2 +
 8 files changed, 154 insertions(+), 37 deletions(-)

diff --git a/reftable/stack.c b/reftable/stack.c
index c1a4e25e3a2fdb..1fffd75630266c 100644
--- a/reftable/stack.c
+++ b/reftable/stack.c
@@ -657,7 +657,7 @@ static int format_name(struct reftable_buf *dest, uint64_t min, uint64_t max)
 }
 
 struct reftable_addition {
-	struct lock_file tables_list_lock;
+	struct reftable_flock tables_list_lock;
 	struct reftable_stack *stack;
 
 	char **new_tables;
@@ -676,10 +676,8 @@ static int reftable_stack_init_addition(struct reftable_addition *add,
 
 	add->stack = st;
 
-	err = hold_lock_file_for_update_timeout(&add->tables_list_lock,
-						st->list_file,
-						LOCK_NO_DEREF,
-						st->opts.lock_timeout_ms);
+	err = flock_acquire(&add->tables_list_lock, st->list_file,
+			    st->opts.lock_timeout_ms);
 	if (err < 0) {
 		if (errno == EEXIST) {
 			err = REFTABLE_LOCK_ERROR;
@@ -689,7 +687,7 @@ static int reftable_stack_init_addition(struct reftable_addition *add,
 		goto done;
 	}
 	if (st->opts.default_permissions) {
-		if (chmod(get_lock_file_path(&add->tables_list_lock),
+		if (chmod(add->tables_list_lock.path,
 			  st->opts.default_permissions) < 0) {
 			err = REFTABLE_IO_ERROR;
 			goto done;
@@ -733,7 +731,7 @@ static void reftable_addition_close(struct reftable_addition *add)
 	add->new_tables_len = 0;
 	add->new_tables_cap = 0;
 
-	rollback_lock_file(&add->tables_list_lock);
+	flock_release(&add->tables_list_lock);
 	reftable_buf_release(&nm);
 }
 
@@ -749,7 +747,6 @@ void reftable_addition_destroy(struct reftable_addition *add)
 int reftable_addition_commit(struct reftable_addition *add)
 {
 	struct reftable_buf table_list = REFTABLE_BUF_INIT;
-	int lock_file_fd = get_lock_file_fd(&add->tables_list_lock);
 	int err = 0;
 	size_t i;
 
@@ -767,20 +764,20 @@ int reftable_addition_commit(struct reftable_addition *add)
 			goto done;
 	}
 
-	err = write_in_full(lock_file_fd, table_list.buf, table_list.len);
+	err = write_in_full(add->tables_list_lock.fd, table_list.buf, table_list.len);
 	reftable_buf_release(&table_list);
 	if (err < 0) {
 		err = REFTABLE_IO_ERROR;
 		goto done;
 	}
 
-	err = stack_fsync(add->stack, lock_file_fd);
+	err = stack_fsync(add->stack, add->tables_list_lock.fd);
 	if (err < 0) {
 		err = REFTABLE_IO_ERROR;
 		goto done;
 	}
 
-	err = commit_lock_file(&add->tables_list_lock);
+	err = flock_commit(&add->tables_list_lock);
 	if (err < 0) {
 		err = REFTABLE_IO_ERROR;
 		goto done;
@@ -1160,8 +1157,8 @@ static int stack_compact_range(struct reftable_stack *st,
 	struct reftable_buf new_table_name = REFTABLE_BUF_INIT;
 	struct reftable_buf new_table_path = REFTABLE_BUF_INIT;
 	struct reftable_buf table_name = REFTABLE_BUF_INIT;
-	struct lock_file tables_list_lock = LOCK_INIT;
-	struct lock_file *table_locks = NULL;
+	struct reftable_flock tables_list_lock = REFTABLE_FLOCK_INIT;
+	struct reftable_flock *table_locks = NULL;
 	struct reftable_tmpfile new_table = REFTABLE_TMPFILE_INIT;
 	int is_empty_table = 0, err = 0;
 	size_t first_to_replace, last_to_replace;
@@ -1179,10 +1176,7 @@ static int stack_compact_range(struct reftable_stack *st,
 	 * Hold the lock so that we can read "tables.list" and lock all tables
 	 * which are part of the user-specified range.
 	 */
-	err = hold_lock_file_for_update_timeout(&tables_list_lock,
-						st->list_file,
-						LOCK_NO_DEREF,
-						st->opts.lock_timeout_ms);
+	err = flock_acquire(&tables_list_lock, st->list_file, st->opts.lock_timeout_ms);
 	if (err < 0) {
 		if (errno == EEXIST)
 			err = REFTABLE_LOCK_ERROR;
@@ -1205,19 +1199,20 @@ static int stack_compact_range(struct reftable_stack *st,
 	 * older process is still busy compacting tables which are preexisting
 	 * from the point of view of the newer process.
 	 */
-	REFTABLE_CALLOC_ARRAY(table_locks, last - first + 1);
+	REFTABLE_ALLOC_ARRAY(table_locks, last - first + 1);
 	if (!table_locks) {
 		err = REFTABLE_OUT_OF_MEMORY_ERROR;
 		goto done;
 	}
+	for (i = 0; i < last - first + 1; i++)
+		table_locks[i] = REFTABLE_FLOCK_INIT;
 
 	for (i = last + 1; i > first; i--) {
 		err = stack_filename(&table_name, st, reader_name(st->readers[i - 1]));
 		if (err < 0)
 			goto done;
 
-		err = hold_lock_file_for_update(&table_locks[nlocks],
-						table_name.buf, LOCK_NO_DEREF);
+		err = flock_acquire(&table_locks[nlocks], table_name.buf, 0);
 		if (err < 0) {
 			/*
 			 * When the table is locked already we may do a
@@ -1253,7 +1248,7 @@ static int stack_compact_range(struct reftable_stack *st,
 		 * run into file descriptor exhaustion when we compress a lot
 		 * of tables.
 		 */
-		err = close_lock_file_gently(&table_locks[nlocks++]);
+		err = flock_close(&table_locks[nlocks++]);
 		if (err < 0) {
 			err = REFTABLE_IO_ERROR;
 			goto done;
@@ -1265,7 +1260,7 @@ static int stack_compact_range(struct reftable_stack *st,
 	 * "tables.list" lock while compacting the locked tables. This allows
 	 * concurrent updates to the stack to proceed.
 	 */
-	err = rollback_lock_file(&tables_list_lock);
+	err = flock_release(&tables_list_lock);
 	if (err < 0) {
 		err = REFTABLE_IO_ERROR;
 		goto done;
@@ -1288,10 +1283,7 @@ static int stack_compact_range(struct reftable_stack *st,
 	 * "tables.list". We'll then replace the compacted range of tables with
 	 * the new table.
 	 */
-	err = hold_lock_file_for_update_timeout(&tables_list_lock,
-						st->list_file,
-						LOCK_NO_DEREF,
-						st->opts.lock_timeout_ms);
+	err = flock_acquire(&tables_list_lock, st->list_file, st->opts.lock_timeout_ms);
 	if (err < 0) {
 		if (errno == EEXIST)
 			err = REFTABLE_LOCK_ERROR;
@@ -1301,7 +1293,7 @@ static int stack_compact_range(struct reftable_stack *st,
 	}
 
 	if (st->opts.default_permissions) {
-		if (chmod(get_lock_file_path(&tables_list_lock),
+		if (chmod(tables_list_lock.path,
 			  st->opts.default_permissions) < 0) {
 			err = REFTABLE_IO_ERROR;
 			goto done;
@@ -1456,7 +1448,7 @@ static int stack_compact_range(struct reftable_stack *st,
 			goto done;
 	}
 
-	err = write_in_full(get_lock_file_fd(&tables_list_lock),
+	err = write_in_full(tables_list_lock.fd,
 			    tables_list_buf.buf, tables_list_buf.len);
 	if (err < 0) {
 		err = REFTABLE_IO_ERROR;
@@ -1464,14 +1456,14 @@ static int stack_compact_range(struct reftable_stack *st,
 		goto done;
 	}
 
-	err = stack_fsync(st, get_lock_file_fd(&tables_list_lock));
+	err = stack_fsync(st, tables_list_lock.fd);
 	if (err < 0) {
 		err = REFTABLE_IO_ERROR;
 		unlink(new_table_path.buf);
 		goto done;
 	}
 
-	err = commit_lock_file(&tables_list_lock);
+	err = flock_commit(&tables_list_lock);
 	if (err < 0) {
 		err = REFTABLE_IO_ERROR;
 		unlink(new_table_path.buf);
@@ -1492,12 +1484,11 @@ static int stack_compact_range(struct reftable_stack *st,
 	 * readers, so it is expected that unlinking tables may fail.
 	 */
 	for (i = 0; i < nlocks; i++) {
-		struct lock_file *table_lock = &table_locks[i];
-		const char *lock_path = get_lock_file_path(table_lock);
+		struct reftable_flock *table_lock = &table_locks[i];
 
 		reftable_buf_reset(&table_name);
-		err = reftable_buf_add(&table_name, lock_path,
-				       strlen(lock_path) - strlen(".lock"));
+		err = reftable_buf_add(&table_name, table_lock->path,
+				       strlen(table_lock->path) - strlen(".lock"));
 		if (err)
 			continue;
 
@@ -1505,9 +1496,9 @@ static int stack_compact_range(struct reftable_stack *st,
 	}
 
 done:
-	rollback_lock_file(&tables_list_lock);
+	flock_release(&tables_list_lock);
 	for (i = 0; table_locks && i < nlocks; i++)
-		rollback_lock_file(&table_locks[i]);
+		flock_release(&table_locks[i]);
 	reftable_free(table_locks);
 
 	tmpfile_delete(&new_table);
diff --git a/reftable/system.c b/reftable/system.c
index 01f96f03d8493d..adf8e4d30b823c 100644
--- a/reftable/system.c
+++ b/reftable/system.c
@@ -1,6 +1,7 @@
 #include "system.h"
 #include "basics.h"
 #include "reftable-error.h"
+#include "../lockfile.h"
 #include "../tempfile.h"
 
 int tmpfile_from_pattern(struct reftable_tmpfile *out, const char *pattern)
@@ -47,3 +48,79 @@ int tmpfile_rename(struct reftable_tmpfile *t, const char *path)
 		return REFTABLE_IO_ERROR;
 	return 0;
 }
+
+int flock_acquire(struct reftable_flock *l, const char *target_path,
+		  long timeout_ms)
+{
+	struct lock_file *lockfile;
+	int err;
+
+	lockfile = reftable_malloc(sizeof(*lockfile));
+	if (!lockfile)
+		return REFTABLE_OUT_OF_MEMORY_ERROR;
+
+	err = hold_lock_file_for_update_timeout(lockfile, target_path, LOCK_NO_DEREF,
+						timeout_ms);
+	if (err < 0) {
+		reftable_free(lockfile);
+		if (errno == EEXIST)
+			return REFTABLE_LOCK_ERROR;
+		return -1;
+	}
+
+	l->fd = get_lock_file_fd(lockfile);
+	l->path = get_lock_file_path(lockfile);
+	l->priv = lockfile;
+
+	return 0;
+}
+
+int flock_close(struct reftable_flock *l)
+{
+	struct lock_file *lockfile = l->priv;
+	int ret;
+
+	if (!lockfile)
+		return REFTABLE_API_ERROR;
+
+	ret = close_lock_file_gently(lockfile);
+	l->fd = -1;
+	if (ret < 0)
+		return REFTABLE_IO_ERROR;
+
+	return 0;
+}
+
+int flock_release(struct reftable_flock *l)
+{
+	struct lock_file *lockfile = l->priv;
+	int ret;
+
+	if (!lockfile)
+		return 0;
+
+	ret = rollback_lock_file(lockfile);
+	reftable_free(lockfile);
+	*l = REFTABLE_FLOCK_INIT;
+	if (ret < 0)
+		return REFTABLE_IO_ERROR;
+
+	return 0;
+}
+
+int flock_commit(struct reftable_flock *l)
+{
+	struct lock_file *lockfile = l->priv;
+	int ret;
+
+	if (!lockfile)
+		return REFTABLE_API_ERROR;
+
+	ret = commit_lock_file(lockfile);
+	reftable_free(lockfile);
+	*l = REFTABLE_FLOCK_INIT;
+	if (ret < 0)
+		return REFTABLE_IO_ERROR;
+
+	return 0;
+}
diff --git a/reftable/system.h b/reftable/system.h
index e7595800907615..0859c3539c6286 100644
--- a/reftable/system.h
+++ b/reftable/system.h
@@ -12,7 +12,6 @@ license that can be found in the LICENSE file or at
 /* This header glues the reftable library to the rest of Git */
 
 #include "git-compat-util.h"
-#include "lockfile.h"
 
 /*
  * An implementation-specific temporary file. By making this specific to the
@@ -54,4 +53,48 @@ int tmpfile_delete(struct reftable_tmpfile *t);
  */
 int tmpfile_rename(struct reftable_tmpfile *t, const char *path);
 
+/*
+ * An implementation-specific file lock. Same as with `reftable_tmpfile`,
+ * making this specific to the implementation makes it possible to tie this
+ * into signal or atexit handlers such that we know to clean up stale locks on
+ * abnormal exits.
+ */
+struct reftable_flock {
+	const char *path;
+	int fd;
+	void *priv;
+};
+#define REFTABLE_FLOCK_INIT ((struct reftable_flock){ .fd = -1, })
+
+/*
+ * Acquire the lock for the given target path by exclusively creating a file
+ * with ".lock" appended to it. If that lock exists, we wait up to `timeout_ms`
+ * to acquire the lock. If `timeout_ms` is 0 we don't wait, if it is negative
+ * we block indefinitely.
+ *
+ * Return 0 on success, a reftable error code on error.
+ */
+int flock_acquire(struct reftable_flock *l, const char *target_path,
+		  long timeout_ms);
+
+/*
+ * Close the lockfile's file descriptor without removing the lock itself. This
+ * is a no-op in case the lockfile has already been closed beforehand. Returns
+ * 0 on success, a reftable error code on error.
+ */
+int flock_close(struct reftable_flock *l);
+
+/*
+ * Release the lock by unlinking the lockfile. This is a no-op in case the
+ * lockfile has already been released or committed beforehand. Returns 0 on
+ * success, a reftable error code on error.
+ */
+int flock_release(struct reftable_flock *l);
+
+/*
+ * Commit the lock by renaming the lockfile into place. Returns 0 on success, a
+ * reftable error code on error.
+ */
+int flock_commit(struct reftable_flock *l);
+
 #endif
diff --git a/t/unit-tests/lib-reftable.c b/t/unit-tests/lib-reftable.c
index c1631f45275407..d795dfb7c9974a 100644
--- a/t/unit-tests/lib-reftable.c
+++ b/t/unit-tests/lib-reftable.c
@@ -2,6 +2,7 @@
 #include "test-lib.h"
 #include "reftable/constants.h"
 #include "reftable/writer.h"
+#include "strbuf.h"
 
 void t_reftable_set_hash(uint8_t *p, int i, enum reftable_hash id)
 {
diff --git a/t/unit-tests/t-reftable-block.c b/t/unit-tests/t-reftable-block.c
index 13e10807daed6f..22040aeefa528c 100644
--- a/t/unit-tests/t-reftable-block.c
+++ b/t/unit-tests/t-reftable-block.c
@@ -11,6 +11,7 @@ license that can be found in the LICENSE file or at
 #include "reftable/blocksource.h"
 #include "reftable/constants.h"
 #include "reftable/reftable-error.h"
+#include "strbuf.h"
 
 static void t_ref_block_read_write(void)
 {
diff --git a/t/unit-tests/t-reftable-pq.c b/t/unit-tests/t-reftable-pq.c
index 272da05bea679a..f3f8a0cdf38579 100644
--- a/t/unit-tests/t-reftable-pq.c
+++ b/t/unit-tests/t-reftable-pq.c
@@ -9,6 +9,7 @@ license that can be found in the LICENSE file or at
 #include "test-lib.h"
 #include "reftable/constants.h"
 #include "reftable/pq.h"
+#include "strbuf.h"
 
 static void merged_iter_pqueue_check(const struct merged_iter_pqueue *pq)
 {
diff --git a/t/unit-tests/t-reftable-readwrite.c b/t/unit-tests/t-reftable-readwrite.c
index 57896922eb1854..91c881aedfa29c 100644
--- a/t/unit-tests/t-reftable-readwrite.c
+++ b/t/unit-tests/t-reftable-readwrite.c
@@ -13,6 +13,7 @@ license that can be found in the LICENSE file or at
 #include "reftable/reader.h"
 #include "reftable/reftable-error.h"
 #include "reftable/reftable-writer.h"
+#include "strbuf.h"
 
 static const int update_index = 5;
 
diff --git a/t/unit-tests/t-reftable-stack.c b/t/unit-tests/t-reftable-stack.c
index 13fd8d8f941fde..b2f6c1c37e9733 100644
--- a/t/unit-tests/t-reftable-stack.c
+++ b/t/unit-tests/t-reftable-stack.c
@@ -13,6 +13,8 @@ license that can be found in the LICENSE file or at
 #include "reftable/reader.h"
 #include "reftable/reftable-error.h"
 #include "reftable/stack.h"
+#include "strbuf.h"
+#include "tempfile.h"
 #include <dirent.h>
 
 static void clear_dir(const char *dirname)

From ae0f757d2fecea5fd4d17b8c49aab1b80e0446e8 Mon Sep 17 00:00:00 2001
From: Patrick Steinhardt <ps@pks.im>
Date: Sun, 27 Oct 2024 16:39:43 +0100
Subject: [PATCH 51/69] compat/mingw: support POSIX semantics for atomic
 renames

By default, Windows restricts access to files when those files have been
opened by another process. As explained in the preceding commits, these
restrictions can be loosened such that reads, writes and/or deletes of
files with open handles _are_ allowed.

While we set up those sharing flags in most relevant code paths now, we
still don't properly handle POSIX-style atomic renames in case the
target path is open. This failure is demonstrated by t0610, where one of
our tests spawns concurrent writes in a reftable-enabled repository and
expects all of them to succeed. This test fails most of the time because
the process that has acquired the "tables.list" lock is unable to rename
it into place while other processes are busy reading that file.

Windows 10 has introduced the `FILE_RENAME_FLAG_POSIX_SEMANTICS` flag
that allows us to fix this usecase [1]. When set, it is possible to
rename a file over a preexisting file even when the target file still
has handles open. Those handles must have been opened with the
`FILE_SHARE_DELETE` flag, which we have ensured in the preceding
commits.

Careful readers might have noticed that [1] does not mention the above
flag, but instead mentions `FILE_RENAME_POSIX_SEMANTICS`. This flag is
not for use with `SetFileInformationByHandle()` though, which is what we
use. And while the `FILE_RENAME_FLAG_POSIX_SEMANTICS` flag exists, it is
not documented on [2] or anywhere else as far as I can tell.

Unfortunately, we still support Windows systems older than Windows 10
that do not yet have this new flag. Our `_WIN32_WINNT` SDK version still
targets 0x0600, which is Windows Vista and later. And even though that
Windows version is out-of-support, bumping the SDK version all the way
to 0x0A00, which is Windows 10 and later, is not an option as it would
make it impossible to compile on Windows 8.1, which is still supported.
Instead, we have to manually declare the relevant infrastructure to make
this feature available and have fallback logic in place in case we run
on a Windows version that does not yet have this flag.

On another note: `mingw_rename()` has a retry loop that is used in case
deleting a file failed because it's still open in another process. One
might be pressed to not use this loop anymore when we can use POSIX
semantics. But unfortunately, we have to keep it around due to our
dependence on the `FILE_SHARE_DELETE` flag. While we know to set that
sharing flag now, other applications may not do so and may thus still
cause sharing violations when we try to rename a file.

This fixes concurrent writes in the reftable backend as demonstrated in
t0610, but may also end up fixing other usecases where Git wants to
perform renames.

[1]: https://learn.microsoft.com/en-us/windows-hardware/drivers/ddi/ntifs/ns-ntifs-_file_rename_information
[2]: https://learn.microsoft.com/en-us/windows/win32/api/winbase/ns-winbase-file_rename_info

Signed-off-by: Patrick Steinhardt <ps@pks.im>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 compat/mingw.c             | 87 ++++++++++++++++++++++++++++++++++++--
 t/t0610-reftable-basics.sh |  8 ++--
 2 files changed, 88 insertions(+), 7 deletions(-)

diff --git a/compat/mingw.c b/compat/mingw.c
index 0d9600543cb85d..c4320769db654d 100644
--- a/compat/mingw.c
+++ b/compat/mingw.c
@@ -2217,10 +2217,16 @@ int mingw_accept(int sockfd1, struct sockaddr *sa, socklen_t *sz)
 #undef rename
 int mingw_rename(const char *pold, const char *pnew)
 {
+	static int supports_file_rename_info_ex = 1;
 	DWORD attrs, gle;
 	int tries = 0;
 	wchar_t wpold[MAX_PATH], wpnew[MAX_PATH];
-	if (xutftowcs_path(wpold, pold) < 0 || xutftowcs_path(wpnew, pnew) < 0)
+	int wpnew_len;
+
+	if (xutftowcs_path(wpold, pold) < 0)
+		return -1;
+	wpnew_len = xutftowcs_path(wpnew, pnew);
+	if (wpnew_len < 0)
 		return -1;
 
 	/*
@@ -2231,11 +2237,84 @@ int mingw_rename(const char *pold, const char *pnew)
 		return 0;
 	if (errno != EEXIST)
 		return -1;
+
 repeat:
-	if (MoveFileExW(wpold, wpnew, MOVEFILE_REPLACE_EXISTING))
-		return 0;
+	if (supports_file_rename_info_ex) {
+		/*
+		 * Our minimum required Windows version is still set to Windows
+		 * Vista. We thus have to declare required infrastructure for
+		 * FileRenameInfoEx ourselves until we bump _WIN32_WINNT to
+		 * 0x0A00. Furthermore, we have to handle cases where the
+		 * FileRenameInfoEx call isn't supported yet.
+		 */
+#define FILE_RENAME_FLAG_REPLACE_IF_EXISTS                  0x00000001
+#define FILE_RENAME_FLAG_POSIX_SEMANTICS                    0x00000002
+		FILE_INFO_BY_HANDLE_CLASS FileRenameInfoEx = 22;
+		struct {
+			/*
+			 * This is usually an unnamed union, but that is not
+			 * part of ISO C99. We thus inline the field, as we
+			 * really only care for the Flags field anyway.
+			 */
+			DWORD Flags;
+			HANDLE RootDirectory;
+			DWORD FileNameLength;
+			/*
+			 * The actual structure is defined with a single-character
+			 * flex array so that the structure has to be allocated on
+			 * the heap. As we declare this structure ourselves though
+			 * we can avoid the allocation and define FileName to have
+			 * MAX_PATH characters.
+			 */
+			WCHAR FileName[MAX_PATH];
+		} rename_info = { 0 };
+		HANDLE old_handle = INVALID_HANDLE_VALUE;
+		BOOL success;
+
+		old_handle = CreateFileW(wpold, DELETE,
+					 FILE_SHARE_WRITE | FILE_SHARE_READ | FILE_SHARE_DELETE,
+					 NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
+		if (old_handle == INVALID_HANDLE_VALUE) {
+			errno = err_win_to_posix(GetLastError());
+			return -1;
+		}
+
+		rename_info.Flags = FILE_RENAME_FLAG_REPLACE_IF_EXISTS |
+				    FILE_RENAME_FLAG_POSIX_SEMANTICS;
+		rename_info.FileNameLength = wpnew_len * sizeof(WCHAR);
+		memcpy(rename_info.FileName, wpnew, wpnew_len * sizeof(WCHAR));
+
+		success = SetFileInformationByHandle(old_handle, FileRenameInfoEx,
+						     &rename_info, sizeof(rename_info));
+		gle = GetLastError();
+		CloseHandle(old_handle);
+		if (success)
+			return 0;
+
+		/*
+		 * When we see ERROR_INVALID_PARAMETER we can assume that the
+		 * current system doesn't support FileRenameInfoEx. Keep us
+		 * from using it in future calls and retry.
+		 */
+		if (gle == ERROR_INVALID_PARAMETER) {
+			supports_file_rename_info_ex = 0;
+			goto repeat;
+		}
+
+		/*
+		 * In theory, we shouldn't get ERROR_ACCESS_DENIED because we
+		 * always open files with FILE_SHARE_DELETE. But in practice we
+		 * cannot assume that Git is the only one accessing files, and
+		 * other applications may not set FILE_SHARE_DELETE. So we have
+		 * to retry.
+		 */
+	} else {
+		if (MoveFileExW(wpold, wpnew, MOVEFILE_REPLACE_EXISTING))
+			return 0;
+		gle = GetLastError();
+	}
+
 	/* TODO: translate more errors */
-	gle = GetLastError();
 	if (gle == ERROR_ACCESS_DENIED &&
 	    (attrs = GetFileAttributesW(wpnew)) != INVALID_FILE_ATTRIBUTES) {
 		if (attrs & FILE_ATTRIBUTE_DIRECTORY) {
diff --git a/t/t0610-reftable-basics.sh b/t/t0610-reftable-basics.sh
index babec7993e3f3e..eaf6fab6d29f01 100755
--- a/t/t0610-reftable-basics.sh
+++ b/t/t0610-reftable-basics.sh
@@ -450,10 +450,12 @@ test_expect_success 'ref transaction: retry acquiring tables.list lock' '
 	)
 '
 
-# This test fails most of the time on Windows systems. The root cause is
+# This test fails most of the time on Cygwin systems. The root cause is
 # that Windows does not allow us to rename the "tables.list.lock" file into
-# place when "tables.list" is open for reading by a concurrent process.
-test_expect_success !WINDOWS 'ref transaction: many concurrent writers' '
+# place when "tables.list" is open for reading by a concurrent process. We have
+# worked around that in our MinGW-based rename emulation, but the Cygwin
+# emulation seems to be insufficient.
+test_expect_success !CYGWIN 'ref transaction: many concurrent writers' '
 	test_when_finished "rm -rf repo" &&
 	git init repo &&
 	(

From 014299066b475cecada06a950858504fa200156c Mon Sep 17 00:00:00 2001
From: Abhijeet Sonar <abhijeet.nkt@gmail.com>
Date: Sat, 26 Oct 2024 17:39:50 +0530
Subject: [PATCH 52/69] show-index: fix uninitialized hash function

As stated in the docs, show-index should use SHA1 as the default hash algorithm
when run outside of a repository.  However, 'the_hash_algo' is currently left
uninitialized if we are not in a repository and no explicit hash function is
specified, causing a crash.  Fix it by falling back to SHA1 when it is found
uninitialized. Also add a test that verifies this behaviour.

Signed-off-by: Abhijeet Sonar <abhijeet.nkt@gmail.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 builtin/show-index.c   | 3 +++
 t/t5300-pack-object.sh | 4 ++++
 2 files changed, 7 insertions(+)

diff --git a/builtin/show-index.c b/builtin/show-index.c
index f164c01bbea400..978ae704702b1f 100644
--- a/builtin/show-index.c
+++ b/builtin/show-index.c
@@ -38,6 +38,9 @@ int cmd_show_index(int argc,
 		repo_set_hash_algo(the_repository, hash_algo);
 	}
 
+	if (!the_hash_algo)
+		repo_set_hash_algo(the_repository, GIT_HASH_SHA1);
+
 	hashsz = the_hash_algo->rawsz;
 
 	if (fread(top_index, 2 * 4, 1, stdin) != 1)
diff --git a/t/t5300-pack-object.sh b/t/t5300-pack-object.sh
index 3b9dae331a5ea9..51fed26cc4c941 100755
--- a/t/t5300-pack-object.sh
+++ b/t/t5300-pack-object.sh
@@ -523,6 +523,10 @@ test_expect_success 'index-pack --strict <pack> works in non-repo' '
 	test_path_is_file foo.idx
 '
 
+test_expect_success SHA1 'show-index works OK outside a repository' '
+	nongit git show-index <foo.idx
+'
+
 test_expect_success !PTHREADS,!FAIL_PREREQS \
 	'index-pack --threads=N or pack.threads=N warns when no pthreads' '
 	test_must_fail git index-pack --threads=2 2>err &&

From 2be71276fbcc5dded2ce1f31ad5c3b31e1a086d1 Mon Sep 17 00:00:00 2001
From: Kristoffer Haugsbakk <code@khaugsbakk.name>
Date: Tue, 29 Oct 2024 21:41:44 +0100
Subject: [PATCH 53/69] Documentation/git-bundle.txt: mention full backup
 example
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Tell the user how to make a full backup of the repository right at the
start of the doc.

This is a requested use-case.[1]  But the doc is a bit unassuming
about it:

  “ If you want to match `git clone --mirror`, which would include your
    refs such as `refs/remotes/*`, use `--all`.

The user cannot be expected to formulate “I want a full backup” as “I
want to match `git clone --mirror`” for a bundle file or something.
Let’s drop this mention of `--all` later in the doc and frontload it.

† 1: E.g.:

    • https://stackoverflow.com/questions/5578270/fully-backup-a-git-repo
    • https://stackoverflow.com/questions/11792671/how-to-git-bundle-a-complete-repo

Signed-off-by: Kristoffer Haugsbakk <code@khaugsbakk.name>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 Documentation/git-bundle.txt | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/Documentation/git-bundle.txt b/Documentation/git-bundle.txt
index 3ab42a19cae4ac..0fa181c749d3f3 100644
--- a/Documentation/git-bundle.txt
+++ b/Documentation/git-bundle.txt
@@ -23,8 +23,8 @@ the "offline" transfer of Git objects without an active "server"
 sitting on the other side of the network connection.
 
 They can be used to create both incremental and full backups of a
-repository, and to relay the state of the references in one repository
-to another.
+repository (`git bundle create <file> --all`), and to relay the state of
+the references in one repository to another.
 
 Git commands that fetch or otherwise "read" via protocols such as
 `ssh://` and `https://` can also operate on bundle files. It is
@@ -203,8 +203,6 @@ It is okay to err on the side of caution, causing the bundle file
 to contain objects already in the destination, as these are ignored
 when unpacking at the destination.
 
-If you want to match `git clone --mirror`, which would include your
-refs such as `refs/remotes/*`, use `--all`.
 If you want to provide the same set of refs that a clone directly
 from the source repository would get, use `--branches --tags` for
 the `<git-rev-list-args>`.

From 4f68a403a79f1b549c656fecb678447ec059ba4a Mon Sep 17 00:00:00 2001
From: Kristoffer Haugsbakk <code@khaugsbakk.name>
Date: Tue, 29 Oct 2024 21:41:45 +0100
Subject: [PATCH 54/69] Documentation/git-bundle.txt: mention --all in spec.
 refs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Mention `--all` as an alternative in “Specifying References”.

Signed-off-by: Kristoffer Haugsbakk <code@khaugsbakk.name>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 Documentation/git-bundle.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/git-bundle.txt b/Documentation/git-bundle.txt
index 0fa181c749d3f3..bce62bc8309625 100644
--- a/Documentation/git-bundle.txt
+++ b/Documentation/git-bundle.txt
@@ -132,7 +132,7 @@ SPECIFYING REFERENCES
 ---------------------
 
 Revisions must be accompanied by reference names to be packaged in a
-bundle.
+bundle.  Alternatively `--all` can be used to package all refs.
 
 More than one reference may be packaged, and more than one set of prerequisite objects can
 be specified.  The objects packaged are those not contained in the

From c4ac5fa725545ebde488b1ec4a0ab3890dc68a40 Mon Sep 17 00:00:00 2001
From: Kristoffer Haugsbakk <code@khaugsbakk.name>
Date: Tue, 29 Oct 2024 21:41:46 +0100
Subject: [PATCH 55/69] =?UTF-8?q?Documentation/git-bundle.txt:=20discuss?=
 =?UTF-8?q?=20na=C3=AFve=20backups?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It might be naïve to think that those who need this education would end
up here in the first place.  But I think it’s good to mention this
high-level concept here on a command which provides a backup strategy.

Signed-off-by: Kristoffer Haugsbakk <code@khaugsbakk.name>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 Documentation/git-bundle.txt | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/Documentation/git-bundle.txt b/Documentation/git-bundle.txt
index bce62bc8309625..7bffd2e4a0547c 100644
--- a/Documentation/git-bundle.txt
+++ b/Documentation/git-bundle.txt
@@ -319,6 +319,20 @@ You can also see what references it offers:
 $ git ls-remote mybundle
 ----------------
 
+DISCUSSION
+----------
+
+A naive way to make a full backup of a repository is to use something to
+the effect of `cp -a <repo> <destination>`.  This is discouraged since
+the repository could be written to during the copy operation.  In turn
+some files at `<destination>` could be corrupted.
+
+This is why it is recommended to use Git tooling for making repository
+backups, either with this command or with e.g. linkgit:git-clone[1].
+
+See also linkgit:gitfaq[7], section "TRANSFERS" for a discussion of the
+problems associated with file syncing across systems.
+
 FILE FORMAT
 -----------
 

From fcae55aa53c0d6bb05b3010d4746ac5d898c214d Mon Sep 17 00:00:00 2001
From: Calvin Wan <calvinwan@google.com>
Date: Mon, 28 Oct 2024 16:34:52 -0400
Subject: [PATCH 56/69] fetch-pack: refactor packet writing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Refactor write_fetch_command_and_capabilities() to be a more general
purpose function write_command_and_capabilities(), so that it can be
used by both fetch and future commands.

Here "command" means the "operations" supported by Git’s wire protocol
https://git-scm.com/docs/protocol-v2. An example would be a
Git subcommand, such as git-fetch(1); or an operation supported by
the server side such as "object-info" implemented in "a2ba162cda
(object-info: support for retrieving object info, 2021-04-20)".

The new write_command_and_capabilities() function is also moved to
connect.c, so that it becomes accessible to other commands.

Helped-by: Jonathan Tan <jonathantanmy@google.com>
Helped-by: Christian Couder <chriscool@tuxfamily.org>
Signed-off-by: Calvin Wan <calvinwan@google.com>
Signed-off-by: Eric Ju  <eric.peijian@gmail.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 connect.c    | 34 ++++++++++++++++++++++++++++++++++
 connect.h    |  4 ++++
 fetch-pack.c | 36 ++----------------------------------
 3 files changed, 40 insertions(+), 34 deletions(-)

diff --git a/connect.c b/connect.c
index 58f53d8dcbb87d..bb4e4eec44c216 100644
--- a/connect.c
+++ b/connect.c
@@ -688,6 +688,40 @@ int server_supports(const char *feature)
 	return !!server_feature_value(feature, NULL);
 }
 
+void write_command_and_capabilities(struct strbuf *req_buf, const char *command,
+									const struct string_list *server_options)
+{
+	const char *hash_name;
+	int advertise_sid;
+
+	git_config_get_bool("transfer.advertisesid", &advertise_sid);
+
+	ensure_server_supports_v2(command);
+	packet_buf_write(req_buf, "command=%s", command);
+	if (server_supports_v2("agent"))
+		packet_buf_write(req_buf, "agent=%s", git_user_agent_sanitized());
+	if (advertise_sid && server_supports_v2("session-id"))
+		packet_buf_write(req_buf, "session-id=%s", trace2_session_id());
+	if (server_options && server_options->nr) {
+		ensure_server_supports_v2("server-option");
+		for (int i = 0; i < server_options->nr; i++)
+			packet_buf_write(req_buf, "server-option=%s",
+					 server_options->items[i].string);
+	}
+
+	if (server_feature_v2("object-format", &hash_name)) {
+		const int hash_algo = hash_algo_by_name(hash_name);
+		if (hash_algo_by_ptr(the_hash_algo) != hash_algo)
+			die(_("mismatched algorithms: client %s; server %s"),
+				the_hash_algo->name, hash_name);
+		packet_buf_write(req_buf, "object-format=%s", the_hash_algo->name);
+	} else if (hash_algo_by_ptr(the_hash_algo) != GIT_HASH_SHA1) {
+		die(_("the server does not support algorithm '%s'"),
+			the_hash_algo->name);
+	}
+	packet_buf_delim(req_buf);
+}
+
 enum protocol {
 	PROTO_LOCAL = 1,
 	PROTO_FILE,
diff --git a/connect.h b/connect.h
index 1645126c17f889..2ed009066e89d2 100644
--- a/connect.h
+++ b/connect.h
@@ -1,6 +1,7 @@
 #ifndef CONNECT_H
 #define CONNECT_H
 
+#include "string-list.h"
 #include "protocol.h"
 
 #define CONNECT_VERBOSE       (1u << 0)
@@ -30,4 +31,7 @@ void check_stateless_delimiter(int stateless_rpc,
 			       struct packet_reader *reader,
 			       const char *error);
 
+void write_command_and_capabilities(struct strbuf *req_buf, const char *command,
+									const struct string_list *server_options);
+
 #endif
diff --git a/fetch-pack.c b/fetch-pack.c
index f752da93a80b25..533fb76f9548ae 100644
--- a/fetch-pack.c
+++ b/fetch-pack.c
@@ -1314,38 +1314,6 @@ static int add_haves(struct fetch_negotiator *negotiator,
 	return haves_added;
 }
 
-static void write_fetch_command_and_capabilities(struct strbuf *req_buf,
-						 const struct string_list *server_options)
-{
-	const char *hash_name;
-
-	ensure_server_supports_v2("fetch");
-	packet_buf_write(req_buf, "command=fetch");
-	if (server_supports_v2("agent"))
-		packet_buf_write(req_buf, "agent=%s", git_user_agent_sanitized());
-	if (advertise_sid && server_supports_v2("session-id"))
-		packet_buf_write(req_buf, "session-id=%s", trace2_session_id());
-	if (server_options && server_options->nr) {
-		int i;
-		ensure_server_supports_v2("server-option");
-		for (i = 0; i < server_options->nr; i++)
-			packet_buf_write(req_buf, "server-option=%s",
-					 server_options->items[i].string);
-	}
-
-	if (server_feature_v2("object-format", &hash_name)) {
-		int hash_algo = hash_algo_by_name(hash_name);
-		if (hash_algo_by_ptr(the_hash_algo) != hash_algo)
-			die(_("mismatched algorithms: client %s; server %s"),
-			    the_hash_algo->name, hash_name);
-		packet_buf_write(req_buf, "object-format=%s", the_hash_algo->name);
-	} else if (hash_algo_by_ptr(the_hash_algo) != GIT_HASH_SHA1) {
-		die(_("the server does not support algorithm '%s'"),
-		    the_hash_algo->name);
-	}
-	packet_buf_delim(req_buf);
-}
-
 static int send_fetch_request(struct fetch_negotiator *negotiator, int fd_out,
 			      struct fetch_pack_args *args,
 			      const struct ref *wants, struct oidset *common,
@@ -1356,7 +1324,7 @@ static int send_fetch_request(struct fetch_negotiator *negotiator, int fd_out,
 	int done_sent = 0;
 	struct strbuf req_buf = STRBUF_INIT;
 
-	write_fetch_command_and_capabilities(&req_buf, args->server_options);
+	write_command_and_capabilities(&req_buf, "fetch", args->server_options);
 
 	if (args->use_thin_pack)
 		packet_buf_write(&req_buf, "thin-pack");
@@ -2174,7 +2142,7 @@ void negotiate_using_fetch(const struct oid_array *negotiation_tips,
 					   the_repository, "%d",
 					   negotiation_round);
 		strbuf_reset(&req_buf);
-		write_fetch_command_and_capabilities(&req_buf, server_options);
+		write_command_and_capabilities(&req_buf, "fetch", server_options);
 
 		packet_buf_write(&req_buf, "wait-for-done");
 

From 9ddf78d08d936f26d1c2703089c528bd4c1c831c Mon Sep 17 00:00:00 2001
From: Calvin Wan <calvinwan@google.com>
Date: Mon, 28 Oct 2024 16:34:53 -0400
Subject: [PATCH 57/69] fetch-pack: move fetch initialization

There are some variables initialized at the start of the
do_fetch_pack_v2() state machine. Currently, they are initialized
in FETCH_CHECK_LOCAL, which is the initial state set at the beginning
of the function.

However, a subsequent patch will allow for another initial state,
while still requiring these initialized variables.
Move the initialization to be before the state machine,
so that they are set regardless of the initial state.

Note that there is no change in behavior, because we're moving code
from the beginning of the first state to just before the execution of
the state machine.

Helped-by: Jonathan Tan <jonathantanmy@google.com>
Helped-by: Christian Couder <chriscool@tuxfamily.org>
Signed-off-by: Calvin Wan <calvinwan@google.com>
Signed-off-by: Eric Ju  <eric.peijian@gmail.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 fetch-pack.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/fetch-pack.c b/fetch-pack.c
index 533fb76f9548ae..afffbcaafc9d25 100644
--- a/fetch-pack.c
+++ b/fetch-pack.c
@@ -1645,18 +1645,18 @@ static struct ref *do_fetch_pack_v2(struct fetch_pack_args *args,
 		reader.me = "fetch-pack";
 	}
 
+	/* v2 supports these by default */
+	allow_unadvertised_object_request |= ALLOW_REACHABLE_SHA1;
+	use_sideband = 2;
+	if (args->depth > 0 || args->deepen_since || args->deepen_not)
+		args->deepen = 1;
+
 	while (state != FETCH_DONE) {
 		switch (state) {
 		case FETCH_CHECK_LOCAL:
 			sort_ref_list(&ref, ref_compare_name);
 			QSORT(sought, nr_sought, cmp_ref_by_name);
 
-			/* v2 supports these by default */
-			allow_unadvertised_object_request |= ALLOW_REACHABLE_SHA1;
-			use_sideband = 2;
-			if (args->depth > 0 || args->deepen_since || args->deepen_not)
-				args->deepen = 1;
-
 			/* Filter 'ref' by 'sought' and those that aren't local */
 			mark_complete_and_common_ref(negotiator, args, &ref);
 			filter_refs(args, &ref, sought, nr_sought);

From bc529169803d546d98c11221310035e0e9ebc71d Mon Sep 17 00:00:00 2001
From: Calvin Wan <calvinwan@google.com>
Date: Mon, 28 Oct 2024 16:34:54 -0400
Subject: [PATCH 58/69] serve: advertise object-info feature

In order for a client to know what object-info components a server can
provide, advertise supported object-info features. This will allow a
client to decide whether to query the server for object-info or fetch
as a fallback.

Helped-by: Jonathan Tan <jonathantanmy@google.com>
Helped-by: Christian Couder <chriscool@tuxfamily.org>
Signed-off-by: Calvin Wan <calvinwan@google.com>
Signed-off-by: Eric Ju  <eric.peijian@gmail.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 serve.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/serve.c b/serve.c
index d674764a25d4cd..c3d80986420da6 100644
--- a/serve.c
+++ b/serve.c
@@ -70,7 +70,7 @@ static void session_id_receive(struct repository *r UNUSED,
 	trace2_data_string("transfer", NULL, "client-sid", client_sid);
 }
 
-static int object_info_advertise(struct repository *r, struct strbuf *value UNUSED)
+static int object_info_advertise(struct repository *r, struct strbuf *value)
 {
 	if (advertise_object_info == -1 &&
 	    repo_config_get_bool(r, "transfer.advertiseobjectinfo",
@@ -78,6 +78,8 @@ static int object_info_advertise(struct repository *r, struct strbuf *value UNUS
 		/* disabled by default */
 		advertise_object_info = 0;
 	}
+	if (value && advertise_object_info)
+		strbuf_addstr(value, "size");
 	return advertise_object_info;
 }
 

From 8c4368da08c39176479a751fcc2119b50c65c101 Mon Sep 17 00:00:00 2001
From: Calvin Wan <calvinwan@google.com>
Date: Mon, 28 Oct 2024 16:34:55 -0400
Subject: [PATCH 59/69] transport: add client support for object-info
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Sometimes it is useful to get information about an object without having
to download it completely. The server logic has already been implemented
in “a2ba162cda (object-info: support for retrieving object info,
2021-04-20)”.

Add client functions to communicate with the server.

The client currently supports requesting a list of object ids with
feature 'size' from a v2 server. If a server does not
advertise the feature, then the client falls back
to making the request through 'fetch'.

Helped-by: Jonathan Tan <jonathantanmy@google.com>
Helped-by: Christian Couder <chriscool@tuxfamily.org>
Signed-off-by: Calvin Wan <calvinwan@google.com>
Signed-off-by: Eric Ju  <eric.peijian@gmail.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 Makefile            |  1 +
 fetch-object-info.c | 95 +++++++++++++++++++++++++++++++++++++++++++++
 fetch-object-info.h | 18 +++++++++
 fetch-pack.c        |  3 ++
 fetch-pack.h        |  2 +
 transport-helper.c  | 11 +++++-
 transport.c         | 80 ++++++++++++++++++++++++++++++++++++--
 transport.h         | 11 ++++++
 8 files changed, 216 insertions(+), 5 deletions(-)
 create mode 100644 fetch-object-info.c
 create mode 100644 fetch-object-info.h

diff --git a/Makefile b/Makefile
index 8d8cc6ab90928b..3969ddcaa82196 100644
--- a/Makefile
+++ b/Makefile
@@ -1024,6 +1024,7 @@ LIB_OBJS += ewah/ewah_rlw.o
 LIB_OBJS += exec-cmd.o
 LIB_OBJS += fetch-negotiator.o
 LIB_OBJS += fetch-pack.o
+LIB_OBJS += fetch-object-info.o
 LIB_OBJS += fmt-merge-msg.o
 LIB_OBJS += fsck.o
 LIB_OBJS += fsmonitor.o
diff --git a/fetch-object-info.c b/fetch-object-info.c
new file mode 100644
index 00000000000000..4d7c2d265fb863
--- /dev/null
+++ b/fetch-object-info.c
@@ -0,0 +1,95 @@
+#include "git-compat-util.h"
+#include "gettext.h"
+#include "hex.h"
+#include "pkt-line.h"
+#include "connect.h"
+#include "oid-array.h"
+#include "object-store-ll.h"
+#include "fetch-object-info.h"
+#include "string-list.h"
+
+/**
+ * send_object_info_request sends git-cat-file object-info command and its
+ * arguments into the request buffer.
+ */
+static void send_object_info_request(const int fd_out, struct object_info_args *args)
+{
+	struct strbuf req_buf = STRBUF_INIT;
+
+	write_command_and_capabilities(&req_buf, "object-info", args->server_options);
+
+	if (unsorted_string_list_has_string(args->object_info_options, "size"))
+		packet_buf_write(&req_buf, "size");
+
+	if (args->oids) {
+		for (size_t i = 0; i < args->oids->nr; i++)
+			packet_buf_write(&req_buf, "oid %s", oid_to_hex(&args->oids->oid[i]));
+	}
+
+	packet_buf_flush(&req_buf);
+	if (write_in_full(fd_out, req_buf.buf, req_buf.len) < 0)
+		die_errno(_("unable to write request to remote"));
+
+	strbuf_release(&req_buf);
+}
+
+/**
+ * fetch_object_info sends git-cat-file object-info command into the request buf
+ * and reads the results from packets.
+ */
+int fetch_object_info(const enum protocol_version version, struct object_info_args *args,
+					  struct packet_reader *reader, struct object_info *object_info_data,
+					  const int stateless_rpc, const int fd_out)
+{
+	int size_index = -1;
+
+	switch (version) {
+	case protocol_v2:
+		if (!server_supports_v2("object-info"))
+			return -1;
+		if (unsorted_string_list_has_string(args->object_info_options, "size")
+				&& !server_supports_feature("object-info", "size", 0))
+			return -1;
+		send_object_info_request(fd_out, args);
+		break;
+	case protocol_v1:
+	case protocol_v0:
+		die(_("wrong protocol version. expected v2"));
+	case protocol_unknown_version:
+		BUG("unknown protocol version");
+	}
+
+	for (size_t i = 0; i < args->object_info_options->nr; i++) {
+		if (packet_reader_read(reader) != PACKET_READ_NORMAL) {
+			check_stateless_delimiter(stateless_rpc, reader, "stateless delimiter expected");
+			return -1;
+		}
+		if (unsorted_string_list_has_string(args->object_info_options, reader->line)) {
+			if (!strcmp(reader->line, "size")) {
+				size_index = i;
+				for (size_t j = 0; j < args->oids->nr; j++)
+					object_info_data[j].sizep = xcalloc(1, sizeof(long));
+			}
+			continue;
+		}
+		return -1;
+	}
+
+	for (size_t i = 0; packet_reader_read(reader) == PACKET_READ_NORMAL && i < args->oids->nr; i++){
+		struct string_list object_info_values = STRING_LIST_INIT_DUP;
+
+		string_list_split(&object_info_values, reader->line, ' ', -1);
+		if (0 <= size_index) {
+			if (!strcmp(object_info_values.items[1 + size_index].string, ""))
+				die("object-info: not our ref %s",
+					object_info_values.items[0].string);
+
+			*object_info_data[i].sizep = strtoul(object_info_values.items[1 + size_index].string, NULL, 10);
+		}
+
+		string_list_clear(&object_info_values, 0);
+	}
+	check_stateless_delimiter(stateless_rpc, reader, "stateless delimiter expected");
+
+	return 0;
+}
diff --git a/fetch-object-info.h b/fetch-object-info.h
new file mode 100644
index 00000000000000..b1e545532f679f
--- /dev/null
+++ b/fetch-object-info.h
@@ -0,0 +1,18 @@
+#ifndef FETCH_OBJECT_INFO_H
+#define FETCH_OBJECT_INFO_H
+
+#include "pkt-line.h"
+#include "protocol.h"
+#include "object-store-ll.h"
+
+struct object_info_args {
+	struct string_list *object_info_options;
+	const struct string_list *server_options;
+	struct oid_array *oids;
+};
+
+int fetch_object_info(enum protocol_version version, struct object_info_args *args,
+					  struct packet_reader *reader, struct object_info *object_info_data,
+					  int stateless_rpc, int fd_out);
+
+#endif /* FETCH_OBJECT_INFO_H */
diff --git a/fetch-pack.c b/fetch-pack.c
index afffbcaafc9d25..8b4143d7523613 100644
--- a/fetch-pack.c
+++ b/fetch-pack.c
@@ -1651,6 +1651,9 @@ static struct ref *do_fetch_pack_v2(struct fetch_pack_args *args,
 	if (args->depth > 0 || args->deepen_since || args->deepen_not)
 		args->deepen = 1;
 
+	if (args->object_info)
+		state = FETCH_SEND_REQUEST;
+
 	while (state != FETCH_DONE) {
 		switch (state) {
 		case FETCH_CHECK_LOCAL:
diff --git a/fetch-pack.h b/fetch-pack.h
index b5c579cdae2508..cf7cedf1618379 100644
--- a/fetch-pack.h
+++ b/fetch-pack.h
@@ -16,6 +16,7 @@ struct fetch_pack_args {
 	const struct string_list *deepen_not;
 	struct list_objects_filter_options filter_options;
 	const struct string_list *server_options;
+	struct object_info *object_info_data;
 
 	/*
 	 * If not NULL, during packfile negotiation, fetch-pack will send "have"
@@ -42,6 +43,7 @@ struct fetch_pack_args {
 	unsigned reject_shallow_remote:1;
 	unsigned deepen:1;
 	unsigned refetch:1;
+	unsigned object_info:1;
 
 	/*
 	 * Indicate that the remote of this request is a promisor remote. The
diff --git a/transport-helper.c b/transport-helper.c
index 013ec79dc9cdc5..334b35174e3537 100644
--- a/transport-helper.c
+++ b/transport-helper.c
@@ -709,8 +709,8 @@ static int fetch_refs(struct transport *transport,
 
 	/*
 	 * If we reach here, then the server, the client, and/or the transport
-	 * helper does not support protocol v2. --negotiate-only requires
-	 * protocol v2.
+	 * helper does not support protocol v2. --negotiate-only and cat-file
+	 * remote-object-info require protocol v2.
 	 */
 	if (data->transport_options.acked_commits) {
 		warning(_("--negotiate-only requires protocol v2"));
@@ -726,6 +726,13 @@ static int fetch_refs(struct transport *transport,
 		free_refs(dummy);
 	}
 
+	/* fail the command explicitly to avoid further commands input. */
+	if (transport->smart_options->object_info)
+		die(_("remote-object-info requires protocol v2"));
+
+	if (!data->get_refs_list_called)
+		get_refs_list_using_list(transport, 0);
+
 	count = 0;
 	for (i = 0; i < nr_heads; i++)
 		if (!(to_fetch[i]->status & REF_STATUS_UPTODATE))
diff --git a/transport.c b/transport.c
index 47fda6a7732f4b..41e157d73d3242 100644
--- a/transport.c
+++ b/transport.c
@@ -9,6 +9,7 @@
 #include "hook.h"
 #include "pkt-line.h"
 #include "fetch-pack.h"
+#include "fetch-object-info.h"
 #include "remote.h"
 #include "connect.h"
 #include "send-pack.h"
@@ -418,6 +419,7 @@ static int fetch_refs_via_pack(struct transport *transport,
 	struct ref *refs = NULL;
 	struct fetch_pack_args args;
 	struct ref *refs_tmp = NULL, **to_fetch_dup = NULL;
+	struct ref *object_info_refs = NULL;
 
 	memset(&args, 0, sizeof(args));
 	args.uploadpack = data->options.uploadpack;
@@ -444,11 +446,71 @@ static int fetch_refs_via_pack(struct transport *transport,
 	args.server_options = transport->server_options;
 	args.negotiation_tips = data->options.negotiation_tips;
 	args.reject_shallow_remote = transport->smart_options->reject_shallow;
+	args.object_info = transport->smart_options->object_info;
+
+	if (transport->smart_options
+		&& transport->smart_options->object_info
+		&& transport->smart_options->object_info_oids->nr > 0) {
+		struct ref *ref_itr = object_info_refs = alloc_ref("");
+		struct packet_reader reader;
+		struct object_info_args obj_info_args = { 0 };
+
+		obj_info_args.server_options = transport->server_options;
+		obj_info_args.object_info_options = transport->smart_options->object_info_options;
+		obj_info_args.oids = transport->smart_options->object_info_oids;
+
+		connect_setup(transport, 0);
+		packet_reader_init(&reader, data->fd[0], NULL, 0,
+				PACKET_READ_CHOMP_NEWLINE |
+				PACKET_READ_GENTLE_ON_EOF |
+				PACKET_READ_DIE_ON_ERR_PACKET);
+
+		data->version = discover_version(&reader);
+		transport->hash_algo = reader.hash_algo;
+
+		if (!fetch_object_info(data->version, &obj_info_args, &reader,
+			data->options.object_info_data, transport->stateless_rpc,
+			data->fd[1])) {
+			/*
+			 * If the code reaches here, fetch_object_info is successful and
+			 * remote object info are retrieved from packets (i.e. without
+			 * downloading the objects).
+			 */
+			goto cleanup;
+		}
 
-	if (!data->finished_handshake) {
-		int i;
+		/*
+		 * If the code reaches here, it means we can't retrieve object info from
+		 * packets, and we will fall back to downloading the pack files.
+		 * We set quiet and no_progress to be true, so that the internal call to
+		 * fetch-pack is less verbose.
+		 */
+		args.object_info_data = data->options.object_info_data;
+		args.quiet = 1;
+		args.no_progress = 1;
+
+		/*
+		 * Build one exact-oid ref per requested oid so that the objects
+		 * themselves get fetched. The actual results will be retrieved
+		 * later from the downloaded pack files.
+		 */
+		for (size_t i = 0; i < transport->smart_options->object_info_oids->nr; i++) {
+			ref_itr->old_oid = transport->smart_options->object_info_oids->oid[i];
+			ref_itr->exact_oid = 1;
+			if (i == transport->smart_options->object_info_oids->nr - 1)
+				/* last element, no need to allocate to next */
+				ref_itr->next = NULL;
+			else
+				ref_itr->next = alloc_ref("");
+
+			ref_itr = ref_itr->next;
+		}
+
+		transport->remote_refs = object_info_refs;
+
+	} else if (!data->finished_handshake) {
 		int must_list_refs = 0;
-		for (i = 0; i < nr_heads; i++) {
+		for (int i = 0; i < nr_heads; i++) {
 			if (!to_fetch[i]->exact_oid) {
 				must_list_refs = 1;
 				break;
@@ -494,6 +556,17 @@ static int fetch_refs_via_pack(struct transport *transport,
 			  &transport->pack_lockfiles, data->version);
 
 	data->finished_handshake = 0;
+
+	/* Retrieve object info data from the downloaded pack files */
+	if (args.object_info) {
+		struct ref *ref_cpy_reader = object_info_refs;
+		for (int i = 0; ref_cpy_reader; i++) {
+			oid_object_info_extended(the_repository, &ref_cpy_reader->old_oid,
+				&args.object_info_data[i], OBJECT_INFO_LOOKUP_REPLACE);
+			ref_cpy_reader = ref_cpy_reader->next;
+		}
+	}
+
 	data->options.self_contained_and_connected =
 		args.self_contained_and_connected;
 	data->options.connectivity_checked = args.connectivity_checked;
@@ -504,6 +577,7 @@ static int fetch_refs_via_pack(struct transport *transport,
 		ret = -1;
 
 cleanup:
+	free_refs(object_info_refs);
 	close(data->fd[0]);
 	if (data->fd[1] >= 0)
 		close(data->fd[1]);
diff --git a/transport.h b/transport.h
index 44100fa9b7fdd6..42b8ee125113dc 100644
--- a/transport.h
+++ b/transport.h
@@ -5,6 +5,7 @@
 #include "remote.h"
 #include "list-objects-filter-options.h"
 #include "string-list.h"
+#include "object-store.h"
 
 struct git_transport_options {
 	unsigned thin : 1;
@@ -30,6 +31,12 @@ struct git_transport_options {
 	 */
 	unsigned connectivity_checked:1;
 
+	/*
+	 * Transport will attempt to pull only object-info. Falls back
+	 * to pulling the entire object if object-info is not supported.
+	 */
+	unsigned object_info : 1;
+
 	int depth;
 	const char *deepen_since;
 	const struct string_list *deepen_not;
@@ -53,6 +60,10 @@ struct git_transport_options {
 	 * common commits to this oidset instead of fetching any packfiles.
 	 */
 	struct oidset *acked_commits;
+
+	struct oid_array *object_info_oids;
+	struct object_info *object_info_data;
+	struct string_list *object_info_options;
 };
 
 enum transport_family {

From f1ffe744a6c321ce63507edf870f9dc42b5d0029 Mon Sep 17 00:00:00 2001
From: Eric Ju <eric.peijian@gmail.com>
Date: Mon, 28 Oct 2024 16:34:56 -0400
Subject: [PATCH 60/69] cat-file: add declaration of variable i inside its for
 loop

Some code declares variable i and only uses it
in a for loop, not in any other logic outside the loop.

Change the declaration of i to be inside the for loop for readability.

Helped-by: Christian Couder <chriscool@tuxfamily.org>
Signed-off-by: Eric Ju <eric.peijian@gmail.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 builtin/cat-file.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/builtin/cat-file.c b/builtin/cat-file.c
index bfdfb51c7cb7b3..5db55fabc4c39c 100644
--- a/builtin/cat-file.c
+++ b/builtin/cat-file.c
@@ -673,12 +673,10 @@ static void dispatch_calls(struct batch_options *opt,
 		struct queued_cmd *cmd,
 		int nr)
 {
-	int i;
-
 	if (!opt->buffer_output)
 		die(_("flush is only for --buffer mode"));
 
-	for (i = 0; i < nr; i++)
+	for (size_t i = 0; i < nr; i++)
 		cmd[i].fn(opt, cmd[i].line, output, data);
 
 	fflush(stdout);
@@ -686,9 +684,7 @@ static void dispatch_calls(struct batch_options *opt,
 
 static void free_cmds(struct queued_cmd *cmd, size_t *nr)
 {
-	size_t i;
-
-	for (i = 0; i < *nr; i++)
+	for (size_t i = 0; i < *nr; i++)
 		FREE_AND_NULL(cmd[i].line);
 
 	*nr = 0;
@@ -714,7 +710,6 @@ static void batch_objects_command(struct batch_options *opt,
 	size_t alloc = 0, nr = 0;
 
 	while (strbuf_getdelim_strip_crlf(&input, stdin, opt->input_delim) != EOF) {
-		int i;
 		const struct parse_cmd *cmd = NULL;
 		const char *p = NULL, *cmd_end;
 		struct queued_cmd call = {0};
@@ -724,7 +719,7 @@ static void batch_objects_command(struct batch_options *opt,
 		if (isspace(*input.buf))
 			die(_("whitespace before command: '%s'"), input.buf);
 
-		for (i = 0; i < ARRAY_SIZE(commands); i++) {
+		for (size_t i = 0; i < ARRAY_SIZE(commands); i++) {
 			if (!skip_prefix(input.buf, commands[i].name, &cmd_end))
 				continue;
 

From 999bed85bdb3d76bec5100a4b8373e592960c818 Mon Sep 17 00:00:00 2001
From: Eric Ju <eric.peijian@gmail.com>
Date: Mon, 28 Oct 2024 16:34:57 -0400
Subject: [PATCH 61/69] cat-file: add remote-object-info to batch-command

Since the `info` command in cat-file --batch-command prints object info
for a given object, it is natural to add another command in cat-file
--batch-command to print object info for a given object from a remote.
Add `remote-object-info` to cat-file --batch-command.

While `info` takes object ids one at a time, this creates overhead when
making requests to a server so `remote-object-info` instead can take
multiple object ids at once.

cat-file --batch-command is generally implemented in the following
manner:

 - Receive and parse input from user
 - Call respective function attached to command
 - Get object info, print object info

In --buffer mode, this changes to:

 - Receive and parse input from user
 - Store respective function attached to command in a queue
 - After flush, loop through commands in queue
    - Call respective function attached to command
    - Get object info, print object info

Notice how the getting and printing of object info is accomplished one
at a time. As described above, this creates a problem for making
requests to a server. Therefore, `remote-object-info` is implemented in
the following manner:

 - Receive and parse input from user
 If command is `remote-object-info`:
    - Get object info from remote
    - Loop through and print each object info
 Else:
    - Call respective function attached to command
    - Parse input, get object info, print object info

And finally for --buffer mode `remote-object-info`:
 - Receive and parse input from user
 - Store respective function attached to command in a queue
 - After flush, loop through commands in queue:
    If command is `remote-object-info`:
        - Get object info from remote
        - Loop through and print each object info
    Else:
        - Call respective function attached to command
        - Get object info, print object info

To summarize, `remote-object-info` gets object info from the remote and
then loops through the object info passed in, printing the info.

In order for remote-object-info to avoid remote communication overhead
in the non-buffer mode, the objects are passed in as such:

remote-object-info <remote> <oid> <oid> ... <oid>

rather than

remote-object-info <remote> <oid>
remote-object-info <remote> <oid>
...
remote-object-info <remote> <oid>

Helped-by: Jonathan Tan <jonathantanmy@google.com>
Helped-by: Christian Couder <chriscool@tuxfamily.org>
Signed-off-by: Calvin Wan <calvinwan@google.com>
Signed-off-by: Eric Ju  <eric.peijian@gmail.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 Documentation/git-cat-file.txt         |  24 +-
 builtin/cat-file.c                     | 108 +++-
 object-file.c                          |  11 +
 object-store-ll.h                      |   3 +
 t/lib-cat-file.sh                      |  16 +
 t/t1006-cat-file.sh                    |  13 +-
 t/t1017-cat-file-remote-object-info.sh | 739 +++++++++++++++++++++++++
 7 files changed, 897 insertions(+), 17 deletions(-)
 create mode 100644 t/lib-cat-file.sh
 create mode 100755 t/t1017-cat-file-remote-object-info.sh

diff --git a/Documentation/git-cat-file.txt b/Documentation/git-cat-file.txt
index d5890ae3686f6b..f2be00b59955ad 100644
--- a/Documentation/git-cat-file.txt
+++ b/Documentation/git-cat-file.txt
@@ -149,6 +149,13 @@ info <object>::
 	Print object info for object reference `<object>`. This corresponds to the
 	output of `--batch-check`.
 
+remote-object-info <remote> <object>...::
+	Print object info for object references `<object>` at the specified `<remote>` without
+	downloading objects from remote. If the object-info capability is not
+	supported by the server, the objects will be downloaded instead.
+	Error when no object references are provided.
+	This command may be combined with `--buffer`.
+
 flush::
 	Used with `--buffer` to execute all preceding commands that were issued
 	since the beginning or since the last flush was issued. When `--buffer`
@@ -290,7 +297,8 @@ newline. The available atoms are:
 	The full hex representation of the object name.
 
 `objecttype`::
-	The type of the object (the same as `cat-file -t` reports).
+	The type of the object (the same as `cat-file -t` reports). See
+	`CAVEATS` below. Not supported by `remote-object-info`.
 
 `objectsize`::
 	The size, in bytes, of the object (the same as `cat-file -s`
@@ -298,13 +306,14 @@ newline. The available atoms are:
 
 `objectsize:disk`::
 	The size, in bytes, that the object takes up on disk. See the
-	note about on-disk sizes in the `CAVEATS` section below.
+	note about on-disk sizes in the `CAVEATS` section below. Not
+	supported by `remote-object-info`.
 
 `deltabase`::
 	If the object is stored as a delta on-disk, this expands to the
 	full hex representation of the delta base object name.
 	Otherwise, expands to the null OID (all zeroes). See `CAVEATS`
-	below.
+	below. Not supported by `remote-object-info`.
 
 `rest`::
 	If this atom is used in the output string, input lines are split
@@ -314,7 +323,10 @@ newline. The available atoms are:
 	line) are output in place of the `%(rest)` atom.
 
 If no format is specified, the default format is `%(objectname)
-%(objecttype) %(objectsize)`.
+%(objecttype) %(objectsize)`, except for `remote-object-info` commands which use
+`%(objectname) %(objectsize)` for now because "%(objecttype)" is not supported yet.
+WARNING: When "%(objecttype)" is supported, the default format WILL be unified, so
+DO NOT RELY on the current default format to stay the same!!!
 
 If `--batch` is specified, or if `--batch-command` is used with the `contents`
 command, the object information is followed by the object contents (consisting
@@ -396,6 +408,10 @@ scripting purposes.
 CAVEATS
 -------
 
+Note that since %(objecttype), %(objectsize:disk) and %(deltabase) are
+currently not supported by the `remote-object-info` command, we will error
+and exit when they are in the format string.
+
 Note that the sizes of objects on disk are reported accurately, but care
 should be taken in drawing conclusions about which refs or objects are
 responsible for disk usage. The size of a packed non-delta object may be
diff --git a/builtin/cat-file.c b/builtin/cat-file.c
index 5db55fabc4c39c..714c182f3960fa 100644
--- a/builtin/cat-file.c
+++ b/builtin/cat-file.c
@@ -24,6 +24,9 @@
 #include "promisor-remote.h"
 #include "mailmap.h"
 #include "write-or-die.h"
+#include "alias.h"
+#include "remote.h"
+#include "transport.h"
 
 enum batch_mode {
 	BATCH_MODE_CONTENTS,
@@ -42,9 +45,12 @@ struct batch_options {
 	char input_delim;
 	char output_delim;
 	const char *format;
+	int use_remote_info;
 };
 
 static const char *force_path;
+static struct object_info *remote_object_info;
+static struct oid_array object_info_oids = OID_ARRAY_INIT;
 
 static struct string_list mailmap = STRING_LIST_INIT_NODUP;
 static int use_mailmap;
@@ -528,7 +534,7 @@ static void batch_one_object(const char *obj_name,
 	enum get_oid_result result;
 
 	result = get_oid_with_context(the_repository, obj_name,
-				      flags, &data->oid, &ctx);
+								  flags, &data->oid, &ctx);
 	if (result != FOUND) {
 		switch (result) {
 		case MISSING_OBJECT:
@@ -576,6 +582,59 @@ static void batch_one_object(const char *obj_name,
 	object_context_release(&ctx);
 }
 
+static int get_remote_info(struct batch_options *opt, int argc, const char **argv)
+{
+	int retval = 0;
+	struct remote *remote = NULL;
+	struct object_id oid;
+	struct string_list object_info_options = STRING_LIST_INIT_NODUP;
+	static struct transport *gtransport;
+
+	/*
+	 * Change the format to "%(objectname) %(objectsize)" when
+	 * remote-object-info command is used. Once we start supporting objecttype
+	 * the default format should change to DEFAULT_FORMAT
+	 */
+	if (!opt->format)
+		opt->format = "%(objectname) %(objectsize)";
+	/* argv[0] names the remote; never read it when no argument was given. */
+	remote = argc > 0 ? remote_get(argv[0]) : NULL;
+	if (!remote)
+		die(_("must supply valid remote when using remote-object-info"));
+
+	oid_array_clear(&object_info_oids);
+	for (int i = 1; i < argc; i++) {
+		if (get_oid_hex(argv[i], &oid))
+			die(_("Not a valid object name %s"), argv[i]);
+		oid_array_append(&object_info_oids, &oid);
+	}
+
+	gtransport = transport_get(remote, NULL);
+	if (gtransport->smart_options) {
+		CALLOC_ARRAY(remote_object_info, object_info_oids.nr);
+		gtransport->smart_options->object_info = 1;
+		gtransport->smart_options->object_info_oids = &object_info_oids;
+
+		/* 'objectsize' is the only option currently supported */
+		if (!strstr(opt->format, "%(objectsize)"))
+			die(_("%s is currently not supported with remote-object-info"), opt->format);
+
+		string_list_append(&object_info_options, "size");
+
+		if (object_info_options.nr > 0) {
+			gtransport->smart_options->object_info_options = &object_info_options;
+			gtransport->smart_options->object_info_data = remote_object_info;
+			retval = transport_fetch_refs(gtransport, NULL);
+		}
+	} else {
+		retval = -1;
+	}
+
+	string_list_clear(&object_info_options, 0);
+	transport_disconnect(gtransport);
+	return retval;
+}
+
 struct object_cb_data {
 	struct batch_options *opt;
 	struct expand_data *expand;
@@ -667,6 +726,52 @@ static void parse_cmd_info(struct batch_options *opt,
 	batch_one_object(line, output, opt, data);
 }
 
+static void parse_cmd_remote_object_info(struct batch_options *opt,
+			   const char *line,
+			   struct strbuf *output,
+			   struct expand_data *data)
+{
+	const char **argv = NULL;
+	char *line_to_split = xstrdup_or_null(line);
+	int count = split_cmdline(line_to_split, &argv);
+
+	/* A line that cannot be split (e.g. unclosed quote) has no usable argv. */
+	if (count < 0)
+		die(_("remote-object-info: %s"), split_cmdline_strerror(count));
+	if (get_remote_info(opt, count, argv))
+		goto cleanup;
+
+	opt->use_remote_info = 1;
+	data->skip_object_info = 1;
+	for (size_t i = 0; i < object_info_oids.nr; i++) {
+		data->oid = object_info_oids.oid[i];
+
+		if (remote_object_info[i].sizep) {
+			data->size = *remote_object_info[i].sizep;
+		} else {
+			/*
+			 * When reaching here, it means remote-object-info can't retrieve
+			 * information from server without downloading them, and the objects
+			 * have been fetched to client already.
+			 * Print the information using the logic for local objects.
+			 */
+			data->skip_object_info = 0;
+		}
+
+		opt->batch_mode = BATCH_MODE_INFO;
+		batch_object_write(argv[i + 1], output, opt, data, NULL, 0);
+	}
+	opt->use_remote_info = 0;
+	data->skip_object_info = 0;
+
+cleanup:
+	for (size_t i = 0; remote_object_info && i < object_info_oids.nr; i++)
+		free_object_info_contents(&remote_object_info[i]);
+	free(line_to_split);
+	free(argv);
+	FREE_AND_NULL(remote_object_info);
+}
+
 static void dispatch_calls(struct batch_options *opt,
 		struct strbuf *output,
 		struct expand_data *data,
@@ -698,6 +803,7 @@ static const struct parse_cmd {
 } commands[] = {
 	{ "contents", parse_cmd_contents, 1},
 	{ "info", parse_cmd_info, 1},
+	{ "remote-object-info", parse_cmd_remote_object_info, 1},
 	{ "flush", NULL, 0},
 };
 
diff --git a/object-file.c b/object-file.c
index b1a3463852c451..181cde98e1282b 100644
--- a/object-file.c
+++ b/object-file.c
@@ -3132,3 +3132,14 @@ int read_loose_object(const char *path,
 		munmap(map, mapsize);
 	return ret;
 }
+
+void free_object_info_contents(struct object_info *object_info)
+{
+	if (!object_info)
+		return;
+	free(object_info->typep);
+	free(object_info->sizep);
+	free(object_info->disk_sizep);
+	free(object_info->delta_base_oid);
+	free(object_info->type_name);
+}
diff --git a/object-store-ll.h b/object-store-ll.h
index 53b8e693b1b74f..611e2ca7089624 100644
--- a/object-store-ll.h
+++ b/object-store-ll.h
@@ -548,4 +548,7 @@ int for_each_object_in_pack(struct packed_git *p,
 int for_each_packed_object(each_packed_object_fn, void *,
 			   enum for_each_object_flags flags);
 
+/* Free pointers inside of object_info, but not object_info itself */
+void free_object_info_contents(struct object_info *object_info);
+
 #endif /* OBJECT_STORE_LL_H */
diff --git a/t/lib-cat-file.sh b/t/lib-cat-file.sh
new file mode 100644
index 00000000000000..9fb20be3083f2a
--- /dev/null
+++ b/t/lib-cat-file.sh
@@ -0,0 +1,16 @@
+# Library of git-cat-file related tests.
+
+# Print a string without a trailing newline
+echo_without_newline () {
+	printf '%s' "$*"
+}
+
+# Print a string without a trailing newline, with inner newlines turned into NUL (\0).
+echo_without_newline_nul () {
+	echo_without_newline "$@" | tr '\n' '\0'
+}
+
+# Print the length of a string in bytes, stripping the padding that wc may emit.
+strlen () {
+	echo_without_newline "$1" | wc -c | sed -e 's/^ *//'
+}
diff --git a/t/t1006-cat-file.sh b/t/t1006-cat-file.sh
index d36cd7c0863591..d8a851c427dfaa 100755
--- a/t/t1006-cat-file.sh
+++ b/t/t1006-cat-file.sh
@@ -4,6 +4,7 @@ test_description='git cat-file'
 
 TEST_PASSES_SANITIZE_LEAK=true
 . ./test-lib.sh
+. "$TEST_DIRECTORY"/lib-cat-file.sh
 
 test_cmdmode_usage () {
 	test_expect_code 129 "$@" 2>err &&
@@ -99,18 +100,6 @@ do
 	'
 done
 
-echo_without_newline () {
-    printf '%s' "$*"
-}
-
-echo_without_newline_nul () {
-	echo_without_newline "$@" | tr '\n' '\0'
-}
-
-strlen () {
-    echo_without_newline "$1" | wc -c | sed -e 's/^ *//'
-}
-
 run_tests () {
     type=$1
     oid=$2
diff --git a/t/t1017-cat-file-remote-object-info.sh b/t/t1017-cat-file-remote-object-info.sh
new file mode 100755
index 00000000000000..f4bff07311d7cc
--- /dev/null
+++ b/t/t1017-cat-file-remote-object-info.sh
@@ -0,0 +1,739 @@
+#!/bin/sh
+
+test_description='git cat-file --batch-command with remote-object-info command'
+
+GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
+export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
+
+. ./test-lib.sh
+. "$TEST_DIRECTORY"/lib-cat-file.sh
+
+hello_content="Hello World"
+hello_size=$(strlen "$hello_content")
+hello_oid=$(echo_without_newline "$hello_content" | git hash-object --stdin)
+
+# This is how we get 13:
+# 13 = <file mode> + <a_space> + <file name> + <a_null>, where
+# file mode is 100644, which is 6 characters;
+# file name is hello, which is 5 characters
+# a space is 1 character and a null is 1 character
+tree_size=$(($(test_oid rawsz) + 13))
+
+commit_message="Initial commit"
+
+# This is how we get 137:
+# 137 = <tree header> + <a_space> + <a newline> +
+# <Author line> + <a newline> +
+# <Committer line> + <a newline> +
+# <a newline> +
+# <commit message length>
+# An easier way to calculate is: 1. use `git cat-file commit <commit hash> | wc -c`,
+# to get 177, 2. then deduct 40 hex characters to get 137
+commit_size=$(($(test_oid hexsz) + 137))
+
+tag_header_without_oid="type blob
+tag hellotag
+tagger $GIT_COMMITTER_NAME <$GIT_COMMITTER_EMAIL>"
+tag_header_without_timestamp="object $hello_oid
+$tag_header_without_oid"
+tag_description="This is a tag"
+tag_content="$tag_header_without_timestamp 0 +0000
+
+$tag_description"
+
+tag_oid=$(echo_without_newline "$tag_content" | git hash-object -t tag --stdin -w)
+tag_size=$(strlen "$tag_content")
+
+set_transport_variables () {
+	hello_oid=$(echo_without_newline "$hello_content" | git hash-object --stdin)
+	tree_oid=$(git -C "$1" write-tree)
+	commit_oid=$(echo_without_newline "$commit_message" | git -C "$1" commit-tree $tree_oid)
+	tag_oid=$(echo_without_newline "$tag_content" | git -C "$1" hash-object -t tag --stdin -w)
+	tag_size=$(strlen "$tag_content")
+}
+
+# This section tests --batch-command with remote-object-info command
+# Since "%(objecttype)" is currently not supported by the command remote-object-info,
+# the filters are set to "%(objectname) %(objectsize)" in some test cases.
+
+# Test --batch-command remote-object-info with 'git://' transport with
+# transfer.advertiseobjectinfo set to true, i.e. server has object-info capability
+. "$TEST_DIRECTORY"/lib-git-daemon.sh
+start_git_daemon --export-all --enable=receive-pack
+daemon_parent=$GIT_DAEMON_DOCUMENT_ROOT_PATH/parent
+
+test_expect_success 'create repo to be served by git-daemon' '
+	git init "$daemon_parent" &&
+	echo_without_newline "$hello_content" > $daemon_parent/hello &&
+	git -C "$daemon_parent" update-index --add hello &&
+	git -C "$daemon_parent" config transfer.advertiseobjectinfo true &&
+	git clone "$GIT_DAEMON_URL/parent" -n "$daemon_parent/daemon_client_empty"
+'
+
+test_expect_success 'batch-command remote-object-info git://' '
+	(
+		set_transport_variables "$daemon_parent" &&
+		cd "$daemon_parent/daemon_client_empty" &&
+
+		# These results prove remote-object-info can get object info from the remote
+		echo "$hello_oid $hello_size" >expect &&
+		echo "$tree_oid $tree_size" >>expect &&
+		echo "$commit_oid $commit_size" >>expect &&
+		echo "$tag_oid $tag_size" >>expect &&
+
+		# These results prove remote-object-info did not download objects from the remote
+		echo "$hello_oid missing" >>expect &&
+		echo "$tree_oid missing" >>expect &&
+		echo "$commit_oid missing" >>expect &&
+		echo "$tag_oid missing" >>expect &&
+
+		git cat-file --batch-command="%(objectname) %(objectsize)" >actual <<-EOF &&
+		remote-object-info "$GIT_DAEMON_URL/parent" $hello_oid
+		remote-object-info "$GIT_DAEMON_URL/parent" $tree_oid
+		remote-object-info "$GIT_DAEMON_URL/parent" $commit_oid
+		remote-object-info "$GIT_DAEMON_URL/parent" $tag_oid
+		info $hello_oid
+		info $tree_oid
+		info $commit_oid
+		info $tag_oid
+		EOF
+		test_cmp expect actual
+	)
+'
+
+test_expect_success 'batch-command remote-object-info git:// multiple sha1 per line' '
+	(
+		set_transport_variables "$daemon_parent" &&
+		cd "$daemon_parent/daemon_client_empty" &&
+
+		# These results prove remote-object-info can get object info from the remote
+		echo "$hello_oid $hello_size" >expect &&
+		echo "$tree_oid $tree_size" >>expect &&
+		echo "$commit_oid $commit_size" >>expect &&
+		echo "$tag_oid $tag_size" >>expect &&
+
+		# These results prove remote-object-info did not download objects from the remote
+		echo "$hello_oid missing" >>expect &&
+		echo "$tree_oid missing" >>expect &&
+		echo "$commit_oid missing" >>expect &&
+		echo "$tag_oid missing" >>expect &&
+
+		git cat-file --batch-command="%(objectname) %(objectsize)" >actual <<-EOF &&
+		remote-object-info "$GIT_DAEMON_URL/parent" $hello_oid $tree_oid $commit_oid $tag_oid
+		info $hello_oid
+		info $tree_oid
+		info $commit_oid
+		info $tag_oid
+		EOF
+		test_cmp expect actual
+	)
+'
+
+test_expect_success 'batch-command remote-object-info git:// default filter' '
+	(
+		set_transport_variables "$daemon_parent" &&
+		cd "$daemon_parent/daemon_client_empty" &&
+
+		echo "$hello_oid $hello_size" >expect &&
+		echo "$tree_oid $tree_size" >>expect &&
+		echo "$commit_oid $commit_size" >>expect &&
+		echo "$tag_oid $tag_size" >>expect &&
+		GIT_TRACE_PACKET=1 git cat-file --batch-command >actual <<-EOF &&
+		remote-object-info "$GIT_DAEMON_URL/parent" $hello_oid $tree_oid
+		remote-object-info "$GIT_DAEMON_URL/parent" $commit_oid $tag_oid
+		EOF
+		test_cmp expect actual
+	)
+'
+
+test_expect_success 'batch-command --buffer remote-object-info git://' '
+	(
+		set_transport_variables "$daemon_parent" &&
+		cd "$daemon_parent/daemon_client_empty" &&
+
+		# These results prove remote-object-info can get object info from the remote
+		echo "$hello_oid $hello_size" >expect &&
+		echo "$tree_oid $tree_size" >>expect &&
+		echo "$commit_oid $commit_size" >>expect &&
+		echo "$tag_oid $tag_size" >>expect &&
+
+		# These results prove remote-object-info did not download objects from the remote
+		echo "$hello_oid missing" >>expect &&
+		echo "$tree_oid missing" >>expect &&
+		echo "$commit_oid missing" >>expect &&
+		echo "$tag_oid missing" >>expect &&
+
+		git cat-file --batch-command="%(objectname) %(objectsize)" --buffer >actual <<-EOF &&
+		remote-object-info "$GIT_DAEMON_URL/parent" $hello_oid $tree_oid
+		remote-object-info "$GIT_DAEMON_URL/parent" $commit_oid $tag_oid
+		info $hello_oid
+		info $tree_oid
+		info $commit_oid
+		info $tag_oid
+		flush
+		EOF
+		test_cmp expect actual
+	)
+'
+
+test_expect_success 'batch-command -Z remote-object-info git:// default filter' '
+	(
+		set_transport_variables "$daemon_parent" &&
+		cd "$daemon_parent/daemon_client_empty" &&
+
+		printf "%s\0" "$hello_oid $hello_size" >expect &&
+		printf "%s\0" "$tree_oid $tree_size" >>expect &&
+		printf "%s\0" "$commit_oid $commit_size" >>expect &&
+		printf "%s\0" "$tag_oid $tag_size" >>expect &&
+
+		printf "%s\0" "$hello_oid missing" >>expect &&
+		printf "%s\0" "$tree_oid missing" >>expect &&
+		printf "%s\0" "$commit_oid missing" >>expect &&
+		printf "%s\0" "$tag_oid missing" >>expect &&
+
+		batch_input="remote-object-info $GIT_DAEMON_URL/parent $hello_oid $tree_oid
+remote-object-info $GIT_DAEMON_URL/parent $commit_oid $tag_oid
+info $hello_oid
+info $tree_oid
+info $commit_oid
+info $tag_oid
+" &&
+		echo_without_newline_nul "$batch_input" >commands_null_delimited &&
+
+		git cat-file --batch-command -Z < commands_null_delimited >actual &&
+		test_cmp expect actual
+	)
+'
+
+# Test --batch-command remote-object-info with 'git://' and
+# transfer.advertiseobjectinfo set to false, i.e. server does not have object-info capability
+
+test_expect_success 'remote-object-info fallback git://: fetch objects to client' '
+	(
+		git -C "$daemon_parent" config transfer.advertiseobjectinfo false &&
+		set_transport_variables "$daemon_parent" &&
+		cd "$daemon_parent/daemon_client_empty" &&
+
+		# Prove object is not on the client
+		echo "$hello_oid missing" >expect &&
+		echo "$tree_oid missing" >>expect &&
+		echo "$commit_oid missing" >>expect &&
+		echo "$tag_oid missing" >>expect &&
+
+		# These results prove remote-object-info can retrieve object info
+		echo "$hello_oid $hello_size" >>expect &&
+		echo "$tree_oid $tree_size" >>expect &&
+		echo "$commit_oid $commit_size" >>expect &&
+		echo "$tag_oid $tag_size" >>expect &&
+
+		# These results are for the info command
+		# They prove objects are downloaded
+		echo "$hello_oid $hello_size" >>expect &&
+		echo "$tree_oid $tree_size" >>expect &&
+		echo "$commit_oid $commit_size" >>expect &&
+		echo "$tag_oid $tag_size" >>expect &&
+
+		git cat-file --batch-command="%(objectname) %(objectsize)" >actual <<-EOF &&
+		info $hello_oid
+		info $tree_oid
+		info $commit_oid
+		info $tag_oid
+		remote-object-info $GIT_DAEMON_URL/parent $hello_oid $tree_oid $commit_oid $tag_oid
+		info $hello_oid
+		info $tree_oid
+		info $commit_oid
+		info $tag_oid
+		EOF
+
+		# revert server state back
+		git -C "$daemon_parent" config transfer.advertiseobjectinfo true &&
+
+		test_cmp expect actual
+	)
+'
+
+stop_git_daemon
+
+# Test --batch-command remote-object-info with 'file://' transport with
+# transfer.advertiseobjectinfo set to true, i.e. server has object-info capability
+# shellcheck disable=SC2016
+test_expect_success 'create repo to be served by file:// transport' '
+	git init server &&
+	git -C server config protocol.version 2 &&
+	git -C server config transfer.advertiseobjectinfo true &&
+	echo_without_newline "$hello_content" > server/hello &&
+	git -C server update-index --add hello &&
+	git clone -n "file://$(pwd)/server" file_client_empty
+'
+
+test_expect_success 'batch-command remote-object-info file://' '
+	(
+		set_transport_variables "server" &&
+		server_path="$(pwd)/server" &&
+		cd file_client_empty &&
+
+		# These results prove remote-object-info can get object info from the remote
+		echo "$hello_oid $hello_size" >expect &&
+		echo "$tree_oid $tree_size" >>expect &&
+		echo "$commit_oid $commit_size" >>expect &&
+		echo "$tag_oid $tag_size" >>expect &&
+
+		# These results prove remote-object-info did not download objects from the remote
+		echo "$hello_oid missing" >>expect &&
+		echo "$tree_oid missing" >>expect &&
+		echo "$commit_oid missing" >>expect &&
+		echo "$tag_oid missing" >>expect &&
+
+		git cat-file --batch-command="%(objectname) %(objectsize)" >actual <<-EOF &&
+		remote-object-info "file://${server_path}" $hello_oid
+		remote-object-info "file://${server_path}" $tree_oid
+		remote-object-info "file://${server_path}" $commit_oid
+		remote-object-info "file://${server_path}" $tag_oid
+		info $hello_oid
+		info $tree_oid
+		info $commit_oid
+		info $tag_oid
+		EOF
+		test_cmp expect actual
+	)
+'
+
+test_expect_success 'batch-command remote-object-info file:// multiple sha1 per line' '
+	(
+		set_transport_variables "server" &&
+		server_path="$(pwd)/server" &&
+		cd file_client_empty &&
+
+		# These results prove remote-object-info can get object info from the remote
+		echo "$hello_oid $hello_size" >expect &&
+		echo "$tree_oid $tree_size" >>expect &&
+		echo "$commit_oid $commit_size" >>expect &&
+		echo "$tag_oid $tag_size" >>expect &&
+
+		# These results prove remote-object-info did not download objects from the remote
+		echo "$hello_oid missing" >>expect &&
+		echo "$tree_oid missing" >>expect &&
+		echo "$commit_oid missing" >>expect &&
+		echo "$tag_oid missing" >>expect &&
+
+
+		git cat-file --batch-command="%(objectname) %(objectsize)" >actual <<-EOF &&
+		remote-object-info "file://${server_path}" $hello_oid $tree_oid $commit_oid $tag_oid
+		info $hello_oid
+		info $tree_oid
+		info $commit_oid
+		info $tag_oid
+		EOF
+		test_cmp expect actual
+	)
+'
+
+test_expect_success 'batch-command --buffer remote-object-info file://' '
+	(
+		set_transport_variables "server" &&
+		server_path="$(pwd)/server" &&
+		cd file_client_empty &&
+
+		# These results prove remote-object-info can get object info from the remote
+		echo "$hello_oid $hello_size" >expect &&
+		echo "$tree_oid $tree_size" >>expect &&
+		echo "$commit_oid $commit_size" >>expect &&
+		echo "$tag_oid $tag_size" >>expect &&
+
+		# These results prove remote-object-info did not download objects from the remote
+		echo "$hello_oid missing" >>expect &&
+		echo "$tree_oid missing" >>expect &&
+		echo "$commit_oid missing" >>expect &&
+		echo "$tag_oid missing" >>expect &&
+
+		git cat-file --batch-command="%(objectname) %(objectsize)" --buffer >actual <<-EOF &&
+		remote-object-info "file://${server_path}" $hello_oid $tree_oid
+		remote-object-info "file://${server_path}" $commit_oid $tag_oid
+		info $hello_oid
+		info $tree_oid
+		info $commit_oid
+		info $tag_oid
+		flush
+		EOF
+		test_cmp expect actual
+	)
+'
+
+test_expect_success 'batch-command remote-object-info file:// default filter' '
+	(
+		set_transport_variables "server" &&
+		server_path="$(pwd)/server" &&
+		cd file_client_empty &&
+
+		echo "$hello_oid $hello_size" >expect &&
+		echo "$tree_oid $tree_size" >>expect &&
+		echo "$commit_oid $commit_size" >>expect &&
+		echo "$tag_oid $tag_size" >>expect &&
+
+		git cat-file --batch-command >actual <<-EOF &&
+		remote-object-info "file://${server_path}" $hello_oid $tree_oid
+		remote-object-info "file://${server_path}" $commit_oid $tag_oid
+		EOF
+		test_cmp expect actual
+	)
+'
+
+test_expect_success 'batch-command -Z remote-object-info file:// default filter' '
+	(
+		set_transport_variables "server" &&
+		server_path="$(pwd)/server" &&
+		cd file_client_empty &&
+
+		printf "%s\0" "$hello_oid $hello_size" >expect &&
+		printf "%s\0" "$tree_oid $tree_size" >>expect &&
+		printf "%s\0" "$commit_oid $commit_size" >>expect &&
+		printf "%s\0" "$tag_oid $tag_size" >>expect &&
+
+		printf "%s\0" "$hello_oid missing" >>expect &&
+		printf "%s\0" "$tree_oid missing" >>expect &&
+		printf "%s\0" "$commit_oid missing" >>expect &&
+		printf "%s\0" "$tag_oid missing" >>expect &&
+
+		batch_input="remote-object-info \"file://${server_path}\" $hello_oid $tree_oid
+remote-object-info \"file://${server_path}\" $commit_oid $tag_oid
+info $hello_oid
+info $tree_oid
+info $commit_oid
+info $tag_oid
+" &&
+		echo_without_newline_nul "$batch_input" >commands_null_delimited &&
+
+		git cat-file --batch-command -Z < commands_null_delimited >actual &&
+		test_cmp expect actual
+	)
+'
+
+# Test --batch-command remote-object-info with 'file://' and
+# transfer.advertiseobjectinfo set to false, i.e. server does not have object-info capability
+
+test_expect_success 'remote-object-info fallback file://: fetch objects to client' '
+	(
+		set_transport_variables "server" &&
+		server_path="$(pwd)/server" &&
+		git -C "${server_path}" config transfer.advertiseobjectinfo false &&
+		cd file_client_empty &&
+
+		# Prove object is not on the client
+		echo "$hello_oid missing" >expect &&
+		echo "$tree_oid missing" >>expect &&
+		echo "$commit_oid missing" >>expect &&
+		echo "$tag_oid missing" >>expect &&
+
+		# These results prove remote-object-info can retrieve object info
+		echo "$hello_oid $hello_size" >>expect &&
+		echo "$tree_oid $tree_size" >>expect &&
+		echo "$commit_oid $commit_size" >>expect &&
+		echo "$tag_oid $tag_size" >>expect &&
+
+		# These results are for the info command
+		# They prove objects are downloaded
+		echo "$hello_oid $hello_size" >>expect &&
+		echo "$tree_oid $tree_size" >>expect &&
+		echo "$commit_oid $commit_size" >>expect &&
+		echo "$tag_oid $tag_size" >>expect &&
+
+		git cat-file --batch-command="%(objectname) %(objectsize)" >actual <<-EOF &&
+		info $hello_oid
+		info $tree_oid
+		info $commit_oid
+		info $tag_oid
+		remote-object-info "file://${server_path}" $hello_oid $tree_oid $commit_oid $tag_oid
+		info $hello_oid
+		info $tree_oid
+		info $commit_oid
+		info $tag_oid
+		EOF
+
+		# revert server state back
+		git -C "${server_path}" config transfer.advertiseobjectinfo true &&
+		test_cmp expect actual
+	)
+'
+
+# Test --batch-command remote-object-info with 'http://' transport with
+# transfer.advertiseobjectinfo set to true, i.e. server has object-info capability
+
+. "$TEST_DIRECTORY"/lib-httpd.sh
+start_httpd
+
+test_expect_success 'create repo to be served by http:// transport' '
+	git init "$HTTPD_DOCUMENT_ROOT_PATH/http_parent" &&
+	git -C "$HTTPD_DOCUMENT_ROOT_PATH/http_parent" config http.receivepack true &&
+	git -C "$HTTPD_DOCUMENT_ROOT_PATH/http_parent" config transfer.advertiseobjectinfo true &&
+	echo_without_newline "$hello_content" >"$HTTPD_DOCUMENT_ROOT_PATH/http_parent/hello" &&
+	git -C "$HTTPD_DOCUMENT_ROOT_PATH/http_parent" update-index --add hello &&
+	git clone "$HTTPD_URL/smart/http_parent" -n "$HTTPD_DOCUMENT_ROOT_PATH/http_client_empty"
+'
+
+test_expect_success 'batch-command remote-object-info http://' '
+	(
+		set_transport_variables "$HTTPD_DOCUMENT_ROOT_PATH/http_parent" &&
+		cd "$HTTPD_DOCUMENT_ROOT_PATH/http_client_empty" &&
+
+		# These results prove remote-object-info can get object info from the remote
+		echo "$hello_oid $hello_size" >expect &&
+		echo "$tree_oid $tree_size" >>expect &&
+		echo "$commit_oid $commit_size" >>expect &&
+		echo "$tag_oid $tag_size" >>expect &&
+
+		# These results prove remote-object-info did not download objects from the remote
+		echo "$hello_oid missing" >>expect &&
+		echo "$tree_oid missing" >>expect &&
+		echo "$commit_oid missing" >>expect &&
+		echo "$tag_oid missing" >>expect &&
+
+		git cat-file --batch-command="%(objectname) %(objectsize)" >actual <<-EOF &&
+		remote-object-info "$HTTPD_URL/smart/http_parent" $hello_oid
+		remote-object-info "$HTTPD_URL/smart/http_parent" $tree_oid
+		remote-object-info "$HTTPD_URL/smart/http_parent" $commit_oid
+		remote-object-info "$HTTPD_URL/smart/http_parent" $tag_oid
+		info $hello_oid
+		info $tree_oid
+		info $commit_oid
+		info $tag_oid
+		EOF
+		test_cmp expect actual
+	)
+'
+
+test_expect_success 'batch-command remote-object-info http:// one line' '
+	(
+		set_transport_variables "$HTTPD_DOCUMENT_ROOT_PATH/http_parent" &&
+		cd "$HTTPD_DOCUMENT_ROOT_PATH/http_client_empty" &&
+
+		# These results prove remote-object-info can get object info from the remote
+		echo "$hello_oid $hello_size" >expect &&
+		echo "$tree_oid $tree_size" >>expect &&
+		echo "$commit_oid $commit_size" >>expect &&
+		echo "$tag_oid $tag_size" >>expect &&
+
+		# These results prove remote-object-info did not download objects from the remote
+		echo "$hello_oid missing" >>expect &&
+		echo "$tree_oid missing" >>expect &&
+		echo "$commit_oid missing" >>expect &&
+		echo "$tag_oid missing" >>expect &&
+
+		git cat-file --batch-command="%(objectname) %(objectsize)" >actual <<-EOF &&
+		remote-object-info "$HTTPD_URL/smart/http_parent" $hello_oid $tree_oid $commit_oid $tag_oid
+		info $hello_oid
+		info $tree_oid
+		info $commit_oid
+		info $tag_oid
+		EOF
+		test_cmp expect actual
+	)
+'
+
+test_expect_success 'batch-command --buffer remote-object-info http://' '
+	(
+		set_transport_variables "$HTTPD_DOCUMENT_ROOT_PATH/http_parent" &&
+		cd "$HTTPD_DOCUMENT_ROOT_PATH/http_client_empty" &&
+
+		# These results prove remote-object-info can get object info from the remote
+		echo "$hello_oid $hello_size" >expect &&
+		echo "$tree_oid $tree_size" >>expect &&
+		echo "$commit_oid $commit_size" >>expect &&
+		echo "$tag_oid $tag_size" >>expect &&
+
+		# These results prove remote-object-info did not download objects from the remote
+		echo "$hello_oid missing" >>expect &&
+		echo "$tree_oid missing" >>expect &&
+		echo "$commit_oid missing" >>expect &&
+		echo "$tag_oid missing" >>expect &&
+
+		git cat-file --batch-command="%(objectname) %(objectsize)" --buffer >actual <<-EOF &&
+		remote-object-info "$HTTPD_URL/smart/http_parent" $hello_oid $tree_oid
+		remote-object-info "$HTTPD_URL/smart/http_parent" $commit_oid $tag_oid
+		info $hello_oid
+		info $tree_oid
+		info $commit_oid
+		info $tag_oid
+		flush
+		EOF
+		test_cmp expect actual
+	)
+'
+
+test_expect_success 'batch-command remote-object-info http:// default filter' '
+	(
+		set_transport_variables "$HTTPD_DOCUMENT_ROOT_PATH/http_parent" &&
+		cd "$HTTPD_DOCUMENT_ROOT_PATH/http_client_empty" &&
+
+		echo "$hello_oid $hello_size" >expect &&
+		echo "$tree_oid $tree_size" >>expect &&
+		echo "$commit_oid $commit_size" >>expect &&
+		echo "$tag_oid $tag_size" >>expect &&
+
+		git cat-file --batch-command >actual <<-EOF &&
+		remote-object-info "$HTTPD_URL/smart/http_parent" $hello_oid $tree_oid
+		remote-object-info "$HTTPD_URL/smart/http_parent" $commit_oid $tag_oid
+		EOF
+		test_cmp expect actual
+	)
+'
+
+test_expect_success 'batch-command -Z remote-object-info http:// default filter' '
+	(
+		set_transport_variables "$HTTPD_DOCUMENT_ROOT_PATH/http_parent" &&
+		cd "$HTTPD_DOCUMENT_ROOT_PATH/http_client_empty" &&
+
+		printf "%s\0" "$hello_oid $hello_size" >expect &&
+		printf "%s\0" "$tree_oid $tree_size" >>expect &&
+		printf "%s\0" "$commit_oid $commit_size" >>expect &&
+		printf "%s\0" "$tag_oid $tag_size" >>expect &&
+
+		batch_input="remote-object-info $HTTPD_URL/smart/http_parent $hello_oid $tree_oid
+remote-object-info $HTTPD_URL/smart/http_parent $commit_oid $tag_oid
+" &&
+		echo_without_newline_nul "$batch_input" >commands_null_delimited &&
+
+		git cat-file --batch-command -Z < commands_null_delimited >actual &&
+		test_cmp expect actual
+	)
+'
+
+test_expect_success 'remote-object-info fails on unsupported filter option (objectsize:disk)' '
+	(
+		set_transport_variables "$HTTPD_DOCUMENT_ROOT_PATH/http_parent" &&
+		cd "$HTTPD_DOCUMENT_ROOT_PATH/http_parent" &&
+
+		test_must_fail git cat-file --batch-command="%(objectsize:disk)" 2>err <<-EOF &&
+		remote-object-info "$HTTPD_URL/smart/http_parent" $hello_oid
+		EOF
+		test_grep "%(objectsize:disk) is currently not supported with remote-object-info" err
+	)
+'
+
+test_expect_success 'remote-object-info fails on unsupported filter option (deltabase)' '
+	(
+		set_transport_variables "$HTTPD_DOCUMENT_ROOT_PATH/http_parent" &&
+		cd "$HTTPD_DOCUMENT_ROOT_PATH/http_parent" &&
+
+		test_must_fail git cat-file --batch-command="%(deltabase)" 2>err <<-EOF &&
+		remote-object-info "$HTTPD_URL/smart/http_parent" $hello_oid
+		EOF
+		test_grep "%(deltabase) is currently not supported with remote-object-info" err
+	)
+'
+
+test_expect_success 'remote-object-info fails on server with legacy protocol' '
+	(
+		set_transport_variables "$HTTPD_DOCUMENT_ROOT_PATH/http_parent" &&
+		cd "$HTTPD_DOCUMENT_ROOT_PATH/http_parent" &&
+
+		test_must_fail git -c protocol.version=0 cat-file --batch-command="%(objectname) %(objectsize)" 2>err <<-EOF &&
+		remote-object-info "$HTTPD_URL/smart/http_parent" $hello_oid
+		EOF
+		test_grep "remote-object-info requires protocol v2" err
+	)
+'
+
+test_expect_success 'remote-object-info fails on server with legacy protocol fallback' '
+	(
+		set_transport_variables "$HTTPD_DOCUMENT_ROOT_PATH/http_parent" &&
+		cd "$HTTPD_DOCUMENT_ROOT_PATH/http_parent" &&
+
+		test_must_fail git -c protocol.version=0 cat-file --batch-command 2>err <<-EOF &&
+		remote-object-info "$HTTPD_URL/smart/http_parent" $hello_oid
+		EOF
+		test_grep "remote-object-info requires protocol v2" err
+	)
+'
+
+test_expect_success 'remote-object-info fails on malformed OID' '
+	(
+		set_transport_variables "$HTTPD_DOCUMENT_ROOT_PATH/http_parent" &&
+		cd "$HTTPD_DOCUMENT_ROOT_PATH/http_parent" &&
+		malformed_object_id="this_id_is_not_valid" &&
+
+		test_must_fail git cat-file --batch-command="%(objectname) %(objectsize)" 2>err <<-EOF &&
+		remote-object-info "$HTTPD_URL/smart/http_parent" $malformed_object_id
+		EOF
+		test_grep "Not a valid object name .*$malformed_object_id" err
+	)
+'
+
+test_expect_success 'remote-object-info fails on malformed OID fallback' '
+	(
+		set_transport_variables "$HTTPD_DOCUMENT_ROOT_PATH/http_parent" &&
+		cd "$HTTPD_DOCUMENT_ROOT_PATH/http_parent" &&
+		malformed_object_id="this_id_is_not_valid" &&
+
+		test_must_fail git cat-file --batch-command 2>err <<-EOF &&
+		remote-object-info "$HTTPD_URL/smart/http_parent" $malformed_object_id
+		EOF
+		test_grep "Not a valid object name .*$malformed_object_id" err
+	)
+'
+
+test_expect_success 'remote-object-info fails on missing OID' '
+	(
+		set_transport_variables "$HTTPD_DOCUMENT_ROOT_PATH/http_parent" &&
+		git clone "$HTTPD_DOCUMENT_ROOT_PATH/http_parent" missing_oid_repo &&
+		test_commit -C missing_oid_repo message1 c.txt &&
+		cd missing_oid_repo &&
+
+		object_id=$(git rev-parse message1:c.txt) &&
+		test_must_fail git cat-file --batch-command="%(objectname) %(objectsize)" 2>err <<-EOF &&
+		remote-object-info "$HTTPD_URL/smart/http_parent" $object_id
+		EOF
+		test_grep "object-info: not our ref $object_id" err
+	)
+'
+
+# Test --batch-command remote-object-info with 'http://' transport and
+# transfer.advertiseobjectinfo set to false, i.e. server does not have object-info capability
+
+test_expect_success 'remote-object-info fallback http://: fetch objects to client' '
+	(
+		set_transport_variables "$HTTPD_DOCUMENT_ROOT_PATH/http_parent" &&
+		git -C "$HTTPD_DOCUMENT_ROOT_PATH/http_parent" config transfer.advertiseobjectinfo false &&
+		cd "$HTTPD_DOCUMENT_ROOT_PATH/http_client_empty" &&
+
+		# Prove object is not on the client
+		echo "$hello_oid missing" >expect &&
+		echo "$tree_oid missing" >>expect &&
+		echo "$commit_oid missing" >>expect &&
+		echo "$tag_oid missing" >>expect &&
+
+		# These results prove remote-object-info can retrieve object info
+		echo "$hello_oid $hello_size" >>expect &&
+		echo "$tree_oid $tree_size" >>expect &&
+		echo "$commit_oid $commit_size" >>expect &&
+		echo "$tag_oid $tag_size" >>expect &&
+
+		# These results are for the info command
+		# They prove objects are downloaded
+		echo "$hello_oid $hello_size" >>expect &&
+		echo "$tree_oid $tree_size" >>expect &&
+		echo "$commit_oid $commit_size" >>expect &&
+		echo "$tag_oid $tag_size" >>expect &&
+
+		git cat-file --batch-command >actual <<-EOF &&
+		info $hello_oid
+		info $tree_oid
+		info $commit_oid
+		info $tag_oid
+		remote-object-info "$HTTPD_URL/smart/http_parent" $hello_oid $tree_oid $commit_oid $tag_oid
+		info $hello_oid
+		info $tree_oid
+		info $commit_oid
+		info $tag_oid
+		EOF
+
+		# revert server state back
+		git -C "$HTTPD_DOCUMENT_ROOT_PATH/http_parent" config transfer.advertiseobjectinfo true &&
+
+		test_cmp expect actual
+	)
+'
+
+# DO NOT add non-httpd-specific tests here, because the last part of this
+# test script is only executed when httpd is available and enabled.
+
+test_done

From d9b67c373e763ea3426c9a24e1fc7f3985b702cd Mon Sep 17 00:00:00 2001
From: Derrick Stolee <stolee@gmail.com>
Date: Thu, 31 Oct 2024 06:26:58 +0000
Subject: [PATCH 62/69] path-walk: introduce an object walk by path

In anticipation of a few planned applications, introduce the most basic form
of a path-walk API. It currently assumes that there are no UNINTERESTING
objects, and does not include any complicated filters. It calls a function
pointer on groups of tree and blob objects as grouped by path. This only
includes objects the first time they are discovered, so an object that
appears at multiple paths will not be included in two batches.

These batches are collected in 'struct type_and_oid_list' objects, which
store an object type and an oid_array of objects.

The data structures are documented in 'struct path_walk_context', but in
summary the most important are:

  * 'paths_to_lists' is a strmap that connects a path to a
    type_and_oid_list for that path. To avoid conflicts in path names,
    we make sure that tree paths end in "/" (except the root path which
    is an empty string) and blob paths do not end in "/".

  * 'path_stack' is a string list that is added to in an append-only
    way. This stores the stack of our depth-first search on the heap
    instead of using recursion.

  * 'path_stack_pushed' is a strset that stores path names that were
    already added to 'path_stack', to avoid repeating paths in the
    stack. Mostly, this saves us from quadratic lookups from doing
    unsorted checks into the string_list.

The coupling of 'path_stack' and 'path_stack_pushed' is protected by the
push_to_stack() method. Call this instead of inserting into these
structures directly.

The walk_objects_by_path() method initializes these structures and
starts walking commits from the given rev_info struct. The commits are
used to find the list of root trees which populate the start of our
depth-first search.

The core of our depth-first search is in a while loop that continues
while we have not indicated an early exit and our 'path_stack' still has
entries in it. The loop body pops a path off of the stack and "visits"
the path via the walk_path() method.

The walk_path() method gets the list of OIDs from the 'paths_to_lists'
strmap and executes the callback method on that list with the given path
and type. If the OIDs correspond to tree objects, then iterate over all
trees in the list and run add_children() to add the child objects to
their own lists, adding new entries to the stack if necessary.

In testing, this depth-first search approach was the one that used the
least memory while iterating over the object lists. There is still a
chance that repositories with too-wide path patterns could cause memory
pressure issues. Limiting the stack size could be done in the future by
limiting how many objects are being considered in-progress, or by
visiting blob paths earlier than trees.

There are many future adaptations that could be made, but they are left for
future updates when consumers are ready to take advantage of those features.

Signed-off-by: Derrick Stolee <stolee@gmail.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 Documentation/technical/api-path-walk.txt |  45 ++++
 Makefile                                  |   1 +
 path-walk.c                               | 260 ++++++++++++++++++++++
 path-walk.h                               |  43 ++++
 4 files changed, 349 insertions(+)
 create mode 100644 Documentation/technical/api-path-walk.txt
 create mode 100644 path-walk.c
 create mode 100644 path-walk.h

diff --git a/Documentation/technical/api-path-walk.txt b/Documentation/technical/api-path-walk.txt
new file mode 100644
index 00000000000000..c550c77ca30754
--- /dev/null
+++ b/Documentation/technical/api-path-walk.txt
@@ -0,0 +1,45 @@
+Path-Walk API
+=============
+
+The path-walk API is used to walk reachable objects, but to visit objects
+in batches based on a common path they appear in, or by type.
+
+For example, all reachable commits are visited in a group. All tags are
+visited in a group. Then, all root trees are visited. At some point, all
+blobs reachable via a path `my/dir/to/A` are visited. When there are
+multiple paths possible to reach the same object, then only one of those
+paths is used to visit the object.
+
+Basics
+------
+
+To use the path-walk API, include `path-walk.h` and call
+`walk_objects_by_path()` with a customized `path_walk_info` struct. The
+struct is used to set all of the options for how the walk should proceed.
+Let's dig into the different options and their use.
+
+`path_fn` and `path_fn_data`::
+	The most important option is the `path_fn` option, which is a
+	function pointer to the callback that can execute logic on the
+	object IDs for objects grouped by type and path. This function
+	also receives a `data` value that corresponds to the
+	`path_fn_data` member, for providing custom data structures to
+	this callback function.
+
+`revs`::
+	To configure the exact details of the reachable set of objects,
+	use the `revs` member and initialize it using the revision
+	machinery in `revision.h`. Initialize `revs` using calls such as
+	`setup_revisions()` or `parse_revision_opt()`. Do not call
+	`prepare_revision_walk()`, as that will be called within
+	`walk_objects_by_path()`.
++
+It is also important that you do not specify the `--objects` flag for the
+`revs` struct. The revision walk should only be used to walk commits, and
+the objects will be walked in a separate way based on those starting
+commits.
+
+Examples
+--------
+
+See example usages in future changes.
diff --git a/Makefile b/Makefile
index 8d8cc6ab90928b..1b2abad3b4a9ee 100644
--- a/Makefile
+++ b/Makefile
@@ -1098,6 +1098,7 @@ LIB_OBJS += parse-options.o
 LIB_OBJS += patch-delta.o
 LIB_OBJS += patch-ids.o
 LIB_OBJS += path.o
+LIB_OBJS += path-walk.o
 LIB_OBJS += pathspec.o
 LIB_OBJS += pkt-line.o
 LIB_OBJS += preload-index.o
diff --git a/path-walk.c b/path-walk.c
new file mode 100644
index 00000000000000..9dc56aff88c297
--- /dev/null
+++ b/path-walk.c
@@ -0,0 +1,260 @@
+/*
+ * path-walk.c: implementation for path-based walks of the object graph.
+ */
+#include "git-compat-util.h"
+#include "path-walk.h"
+#include "blob.h"
+#include "commit.h"
+#include "dir.h"
+#include "hashmap.h"
+#include "hex.h"
+#include "object.h"
+#include "oid-array.h"
+#include "revision.h"
+#include "string-list.h"
+#include "strmap.h"
+#include "trace2.h"
+#include "tree.h"
+#include "tree-walk.h"
+
+struct type_and_oid_list
+{
+	enum object_type type;
+	struct oid_array oids;
+};
+
+#define TYPE_AND_OID_LIST_INIT { \
+	.type = OBJ_NONE, 	 \
+	.oids = OID_ARRAY_INIT	 \
+}
+
+struct path_walk_context {
+	/**
+	 * Repeats of data in 'struct path_walk_info' for
+	 * access with fewer characters.
+	 */
+	struct repository *repo;
+	struct rev_info *revs;
+	struct path_walk_info *info;
+
+	/**
+	 * Map a path to a 'struct type_and_oid_list'
+	 * containing the objects discovered at that
+	 * path.
+	 */
+	struct strmap paths_to_lists;
+
+	/**
+	 * Store the current list of paths in a stack, to
+	 * facilitate depth-first-search without recursion.
+	 *
+	 * Use path_stack_pushed to indicate whether a path
+	 * was previously added to path_stack.
+	 */
+	struct string_list path_stack;
+	struct strset path_stack_pushed;
+};
+
+static void push_to_stack(struct path_walk_context *ctx,
+			  const char *path)
+{
+	if (strset_contains(&ctx->path_stack_pushed, path))
+		return;
+
+	strset_add(&ctx->path_stack_pushed, path);
+	string_list_append(&ctx->path_stack, path);
+}
+
+static int add_children(struct path_walk_context *ctx,
+			const char *base_path,
+			struct object_id *oid)
+{
+	struct tree_desc desc;
+	struct name_entry entry;
+	struct strbuf path = STRBUF_INIT;
+	size_t base_len;
+	struct tree *tree = lookup_tree(ctx->repo, oid);
+
+	if (!tree) {
+		error(_("failed to walk children of tree %s: not found"),
+		      oid_to_hex(oid));
+		return -1;
+	} else if (parse_tree_gently(tree, 1)) {
+		die("bad tree object %s", oid_to_hex(oid));
+	}
+
+	strbuf_addstr(&path, base_path);
+	base_len = path.len;
+
+	parse_tree(tree);
+	init_tree_desc(&desc, &tree->object.oid, tree->buffer, tree->size);
+	while (tree_entry(&desc, &entry)) {
+		struct type_and_oid_list *list;
+		struct object *o;
+		/* Not actually true, but we will ignore submodules later. */
+		enum object_type type = S_ISDIR(entry.mode) ? OBJ_TREE : OBJ_BLOB;
+
+		/* Skip submodules. */
+		if (S_ISGITLINK(entry.mode))
+			continue;
+
+		if (type == OBJ_TREE) {
+			struct tree *child = lookup_tree(ctx->repo, &entry.oid);
+			o = child ? &child->object : NULL;
+		} else if (type == OBJ_BLOB) {
+			struct blob *child = lookup_blob(ctx->repo, &entry.oid);
+			o = child ? &child->object : NULL;
+		} else {
+			/* Wrong type? */
+			continue;
+		}
+
+		if (!o) /* report error?*/
+			continue;
+
+		strbuf_setlen(&path, base_len);
+		strbuf_add(&path, entry.path, entry.pathlen);
+
+		/*
+		 * Trees will end with "/" for concatenation and distinction
+		 * from blobs at the same path.
+		 */
+		if (type == OBJ_TREE)
+			strbuf_addch(&path, '/');
+
+		if (!(list = strmap_get(&ctx->paths_to_lists, path.buf))) {
+			CALLOC_ARRAY(list, 1);
+			list->type = type;
+			strmap_put(&ctx->paths_to_lists, path.buf, list);
+		}
+		push_to_stack(ctx, path.buf);
+
+		/* Skip this object if already seen. */
+		if (o->flags & SEEN)
+			continue;
+		o->flags |= SEEN;
+		oid_array_append(&list->oids, &entry.oid);
+	}
+
+	free_tree_buffer(tree);
+	strbuf_release(&path);
+	return 0;
+}
+
+/*
+ * Visit one path: run the callback on the objects collected for 'path'
+ * and, if they are trees, add their children to the per-path lists.
+ * The list for 'path' is released before returning.
+ */
+static int walk_path(struct path_walk_context *ctx,
+		     const char *path)
+{
+	struct type_and_oid_list *list;
+	int ret = 0;
+
+	list = strmap_get(&ctx->paths_to_lists, path);
+
+	/* Evaluate function pointer on this data. */
+	ret = ctx->info->path_fn(path, &list->oids, list->type,
+				 ctx->info->path_fn_data);
+
+	/* Expand data for children. */
+	if (list->type == OBJ_TREE) {
+		for (size_t i = 0; i < list->oids.nr; i++) {
+			ret |= add_children(ctx,
+					    path,
+					    &list->oids.oid[i]);
+		}
+	}
+
+	oid_array_clear(&list->oids);
+	strmap_remove(&ctx->paths_to_lists, path, 1);
+	return ret;
+}
+
+static void clear_strmap(struct strmap *map)
+{
+	struct hashmap_iter iter;
+	struct strmap_entry *e;
+
+	hashmap_for_each_entry(&map->map, &iter, e, ent) {
+		struct type_and_oid_list *list = e->value;
+		oid_array_clear(&list->oids);
+	}
+	strmap_clear(map, 1);
+	strmap_init(map);
+}
+
+/**
+ * Given the configuration of 'info', walk the commits based on 'info->revs' and
+ * call 'info->path_fn' on each discovered path.
+ *
+ * Returns nonzero on an error.
+ */
+int walk_objects_by_path(struct path_walk_info *info)
+{
+	const char *root_path = "";
+	int ret = 0;
+	size_t commits_nr = 0, paths_nr = 0;
+	struct commit *c;
+	struct type_and_oid_list *root_tree_list;
+	struct path_walk_context ctx = {
+		.repo = info->revs->repo,
+		.revs = info->revs,
+		.info = info,
+		.path_stack = STRING_LIST_INIT_DUP,
+		.path_stack_pushed = STRSET_INIT,
+		.paths_to_lists = STRMAP_INIT
+	};
+
+	trace2_region_enter("path-walk", "commit-walk", info->revs->repo);
+
+	/* Insert a single list for the root tree into the paths. */
+	CALLOC_ARRAY(root_tree_list, 1);
+	root_tree_list->type = OBJ_TREE;
+	strmap_put(&ctx.paths_to_lists, root_path, root_tree_list);
+	push_to_stack(&ctx, root_path);
+
+	if (prepare_revision_walk(info->revs))
+		die(_("failed to setup revision walk"));
+
+	while ((c = get_revision(info->revs))) {
+		struct object_id *oid = get_commit_tree_oid(c);
+		struct tree *t;
+		commits_nr++;
+
+		oid = get_commit_tree_oid(c);
+		t = lookup_tree(info->revs->repo, oid);
+
+		if (!t) {
+			warning("could not find tree %s", oid_to_hex(oid));
+			continue;
+		}
+
+		if (t->object.flags & SEEN)
+			continue;
+		t->object.flags |= SEEN;
+		oid_array_append(&root_tree_list->oids, oid);
+	}
+
+	trace2_data_intmax("path-walk", ctx.repo, "commits", commits_nr);
+	trace2_region_leave("path-walk", "commit-walk", info->revs->repo);
+
+	trace2_region_enter("path-walk", "path-walk", info->revs->repo);
+	while (!ret && ctx.path_stack.nr) {
+		char *path = ctx.path_stack.items[ctx.path_stack.nr - 1].string;
+		ctx.path_stack.nr--;
+		paths_nr++;
+
+		ret = walk_path(&ctx, path);
+
+		free(path);
+	}
+	trace2_data_intmax("path-walk", ctx.repo, "paths", paths_nr);
+	trace2_region_leave("path-walk", "path-walk", info->revs->repo);
+
+	clear_strmap(&ctx.paths_to_lists);
+	strset_clear(&ctx.path_stack_pushed);
+	string_list_clear(&ctx.path_stack, 0);
+	return ret;
+}
diff --git a/path-walk.h b/path-walk.h
new file mode 100644
index 00000000000000..c9e94a98bc8f6b
--- /dev/null
+++ b/path-walk.h
@@ -0,0 +1,43 @@
+/*
+ * path-walk.h : Methods and structures for walking the object graph in batches
+ * by the paths that can reach those objects.
+ */
+#include "object.h" /* Required for 'enum object_type'. */
+
+struct rev_info;
+struct oid_array;
+
+/**
+ * The type of a function pointer for the method that is called on a list of
+ * objects reachable at a given path.
+ */
+typedef int (*path_fn)(const char *path,
+		       struct oid_array *oids,
+		       enum object_type type,
+		       void *data);
+
+struct path_walk_info {
+	/**
+	 * revs provides the definitions for the commit walk, including
+	 * which commits are UNINTERESTING or not.
+	 */
+	struct rev_info *revs;
+
+	/**
+	 * The caller wishes to execute custom logic on objects reachable at a
+	 * given path. Every reachable object will be visited exactly once, and
+	 * the first path to see an object wins. This may not be a stable choice.
+	 */
+	path_fn path_fn;
+	void *path_fn_data;
+};
+
+#define PATH_WALK_INFO_INIT { 0 }
+
+/**
+ * Given the configuration of 'info', walk the commits based on 'info->revs' and
+ * call 'info->path_fn' on each discovered path.
+ *
+ * Returns nonzero on an error.
+ */
+int walk_objects_by_path(struct path_walk_info *info);

From 44425c47ef76d227a71a9a2f9f2e90ff7e929209 Mon Sep 17 00:00:00 2001
From: Derrick Stolee <stolee@gmail.com>
Date: Thu, 31 Oct 2024 06:26:59 +0000
Subject: [PATCH 63/69] test-lib-functions: add test_cmp_sorted

This test helper will be helpful to reduce repeated logic in
t6601-path-walk.sh, but may be helpful elsewhere, too.

Signed-off-by: Derrick Stolee <stolee@gmail.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 t/test-lib-functions.sh | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/t/test-lib-functions.sh b/t/test-lib-functions.sh
index 78e054ab503a65..c17f6b05c9b4af 100644
--- a/t/test-lib-functions.sh
+++ b/t/test-lib-functions.sh
@@ -1268,6 +1268,16 @@ test_cmp () {
 	eval "$GIT_TEST_CMP" '"$@"'
 }
 
+# test_cmp_sorted runs test_cmp on sorted copies of the two input files.
+# The copies, "$1.sorted" and "$2.sorted", are removed only on success.
+
+test_cmp_sorted () {
+	sort <"$1" >"$1.sorted" &&
+	sort <"$2" >"$2.sorted" &&
+	test_cmp "$1.sorted" "$2.sorted" &&
+	rm "$1.sorted" "$2.sorted"
+}
+
 # Check that the given config key has the expected value.
 #
 #    test_cmp_config [-C <dir>] <expected-value>

From 35e6cf99d94115fc3380a5e1d6e1c3a17e181f9c Mon Sep 17 00:00:00 2001
From: Derrick Stolee <stolee@gmail.com>
Date: Thu, 31 Oct 2024 06:27:00 +0000
Subject: [PATCH 64/69] t6601: add helper for testing path-walk API

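Add the 'test-tool path-walk' helper, which prints every batch of objects
handed to the path_fn callback along with per-type totals. Use it to add
some tests based on the current behavior, doing interesting checks for
different sets of branches, ranges, and the --boundary option. This sets
a baseline for the behavior and we can extend it as new options are
introduced.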

Note that the behavior of the API will change soon, as we start to handle
UNINTERESTING objects differently; updating these tests at that point
will demonstrate the change in behavior.

Signed-off-by: Derrick Stolee <stolee@gmail.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 Documentation/technical/api-path-walk.txt |   3 +-
 Makefile                                  |   1 +
 t/helper/test-path-walk.c                 |  86 ++++++++++++++++
 t/helper/test-tool.c                      |   1 +
 t/helper/test-tool.h                      |   1 +
 t/t6601-path-walk.sh                      | 118 ++++++++++++++++++++++
 6 files changed, 209 insertions(+), 1 deletion(-)
 create mode 100644 t/helper/test-path-walk.c
 create mode 100755 t/t6601-path-walk.sh

diff --git a/Documentation/technical/api-path-walk.txt b/Documentation/technical/api-path-walk.txt
index c550c77ca30754..662162ec70b38b 100644
--- a/Documentation/technical/api-path-walk.txt
+++ b/Documentation/technical/api-path-walk.txt
@@ -42,4 +42,5 @@ commits.
 Examples
 --------
 
-See example usages in future changes.
+See example usages in:
+	`t/helper/test-path-walk.c`
diff --git a/Makefile b/Makefile
index 1b2abad3b4a9ee..e0b9e14a683832 100644
--- a/Makefile
+++ b/Makefile
@@ -822,6 +822,7 @@ TEST_BUILTINS_OBJS += test-parse-options.o
 TEST_BUILTINS_OBJS += test-parse-pathspec-file.o
 TEST_BUILTINS_OBJS += test-partial-clone.o
 TEST_BUILTINS_OBJS += test-path-utils.o
+TEST_BUILTINS_OBJS += test-path-walk.o
 TEST_BUILTINS_OBJS += test-pcre2-config.o
 TEST_BUILTINS_OBJS += test-pkt-line.o
 TEST_BUILTINS_OBJS += test-proc-receive.o
diff --git a/t/helper/test-path-walk.c b/t/helper/test-path-walk.c
new file mode 100644
index 00000000000000..3c48f017fa0b06
--- /dev/null
+++ b/t/helper/test-path-walk.c
@@ -0,0 +1,86 @@
+#define USE_THE_REPOSITORY_VARIABLE
+
+#include "test-tool.h"
+#include "environment.h"
+#include "hex.h"
+#include "object-name.h"
+#include "object.h"
+#include "pretty.h"
+#include "revision.h"
+#include "setup.h"
+#include "parse-options.h"
+#include "path-walk.h"
+#include "oid-array.h"
+
+static const char * const path_walk_usage[] = {
+	N_("test-tool path-walk <options> -- <revision-options>"),
+	NULL
+};
+
+struct path_walk_test_data {
+	uintmax_t tree_nr;
+	uintmax_t blob_nr;
+};
+
+static int emit_block(const char *path, struct oid_array *oids,
+		      enum object_type type, void *data)
+{
+	struct path_walk_test_data *tdata = data;
+	const char *typestr;
+
+	switch (type) {
+	case OBJ_TREE:
+		typestr = "TREE";
+		tdata->tree_nr += oids->nr;
+		break;
+
+	case OBJ_BLOB:
+		typestr = "BLOB";
+		tdata->blob_nr += oids->nr;
+		break;
+
+	default:
+		BUG("we do not understand this type");
+	}
+
+	for (size_t i = 0; i < oids->nr; i++)
+		printf("%s:%s:%s\n", typestr, path, oid_to_hex(&oids->oid[i]));
+
+	return 0;
+}
+
+int cmd__path_walk(int argc, const char **argv)
+{
+	int res;
+	struct rev_info revs = REV_INFO_INIT;
+	struct path_walk_info info = PATH_WALK_INFO_INIT;
+	struct path_walk_test_data data = { 0 };
+	struct option options[] = {
+		OPT_END(),
+	};
+
+	initialize_repository(the_repository);
+	setup_git_directory();
+	revs.repo = the_repository;
+
+	argc = parse_options(argc, argv, NULL,
+			     options, path_walk_usage,
+			     PARSE_OPT_KEEP_UNKNOWN_OPT | PARSE_OPT_KEEP_ARGV0);
+
+	if (argc > 1)
+		setup_revisions(argc, argv, &revs, NULL);
+	else
+		usage(path_walk_usage[0]);
+
+	info.revs = &revs;
+	info.path_fn = emit_block;
+	info.path_fn_data = &data;
+
+	res = walk_objects_by_path(&info);
+
+	printf("trees:%" PRIuMAX "\n"
+	       "blobs:%" PRIuMAX "\n",
+	       data.tree_nr, data.blob_nr);
+
+	return res;
+}
diff --git a/t/helper/test-tool.c b/t/helper/test-tool.c
index 1ebb69a5dc4c17..43676e7b93a43f 100644
--- a/t/helper/test-tool.c
+++ b/t/helper/test-tool.c
@@ -52,6 +52,7 @@ static struct test_cmd cmds[] = {
 	{ "parse-subcommand", cmd__parse_subcommand },
 	{ "partial-clone", cmd__partial_clone },
 	{ "path-utils", cmd__path_utils },
+	{ "path-walk", cmd__path_walk },
 	{ "pcre2-config", cmd__pcre2_config },
 	{ "pkt-line", cmd__pkt_line },
 	{ "proc-receive", cmd__proc_receive },
diff --git a/t/helper/test-tool.h b/t/helper/test-tool.h
index 21802ac27da37f..9cfc5da6e57b00 100644
--- a/t/helper/test-tool.h
+++ b/t/helper/test-tool.h
@@ -45,6 +45,7 @@ int cmd__parse_pathspec_file(int argc, const char** argv);
 int cmd__parse_subcommand(int argc, const char **argv);
 int cmd__partial_clone(int argc, const char **argv);
 int cmd__path_utils(int argc, const char **argv);
+int cmd__path_walk(int argc, const char **argv);
 int cmd__pcre2_config(int argc, const char **argv);
 int cmd__pkt_line(int argc, const char **argv);
 int cmd__proc_receive(int argc, const char **argv);
diff --git a/t/t6601-path-walk.sh b/t/t6601-path-walk.sh
new file mode 100755
index 00000000000000..1f277b8829139a
--- /dev/null
+++ b/t/t6601-path-walk.sh
@@ -0,0 +1,118 @@
+#!/bin/sh
+
+test_description='direct path-walk API tests'
+
+. ./test-lib.sh
+
+test_expect_success 'setup test repository' '
+	git checkout -b base &&
+
+	mkdir left &&
+	mkdir right &&
+	echo a >a &&
+	echo b >left/b &&
+	echo c >right/c &&
+	git add . &&
+	git commit -m "first" &&
+
+	echo d >right/d &&
+	git add right &&
+	git commit -m "second" &&
+
+	echo bb >left/b &&
+	git commit -a -m "third" &&
+
+	git checkout -b topic HEAD~1 &&
+	echo cc >right/c &&
+	git commit -a -m "topic"
+'
+
+test_expect_success 'all' '
+	test-tool path-walk -- --all >out &&
+
+	cat >expect <<-EOF &&
+	TREE::$(git rev-parse topic^{tree})
+	TREE::$(git rev-parse base^{tree})
+	TREE::$(git rev-parse base~1^{tree})
+	TREE::$(git rev-parse base~2^{tree})
+	TREE:left/:$(git rev-parse base:left)
+	TREE:left/:$(git rev-parse base~2:left)
+	TREE:right/:$(git rev-parse topic:right)
+	TREE:right/:$(git rev-parse base~1:right)
+	TREE:right/:$(git rev-parse base~2:right)
+	trees:9
+	BLOB:a:$(git rev-parse base~2:a)
+	BLOB:left/b:$(git rev-parse base~2:left/b)
+	BLOB:left/b:$(git rev-parse base:left/b)
+	BLOB:right/c:$(git rev-parse base~2:right/c)
+	BLOB:right/c:$(git rev-parse topic:right/c)
+	BLOB:right/d:$(git rev-parse base~1:right/d)
+	blobs:6
+	EOF
+
+	test_cmp_sorted expect out
+'
+
+test_expect_success 'topic only' '
+	test-tool path-walk -- topic >out &&
+
+	cat >expect <<-EOF &&
+	TREE::$(git rev-parse topic^{tree})
+	TREE::$(git rev-parse base~1^{tree})
+	TREE::$(git rev-parse base~2^{tree})
+	TREE:left/:$(git rev-parse base~2:left)
+	TREE:right/:$(git rev-parse topic:right)
+	TREE:right/:$(git rev-parse base~1:right)
+	TREE:right/:$(git rev-parse base~2:right)
+	trees:7
+	BLOB:a:$(git rev-parse base~2:a)
+	BLOB:left/b:$(git rev-parse base~2:left/b)
+	BLOB:right/c:$(git rev-parse base~2:right/c)
+	BLOB:right/c:$(git rev-parse topic:right/c)
+	BLOB:right/d:$(git rev-parse base~1:right/d)
+	blobs:5
+	EOF
+
+	test_cmp_sorted expect out
+'
+
+test_expect_success 'topic, not base' '
+	test-tool path-walk -- topic --not base >out &&
+
+	cat >expect <<-EOF &&
+	TREE::$(git rev-parse topic^{tree})
+	TREE:left/:$(git rev-parse topic:left)
+	TREE:right/:$(git rev-parse topic:right)
+	trees:3
+	BLOB:a:$(git rev-parse topic:a)
+	BLOB:left/b:$(git rev-parse topic:left/b)
+	BLOB:right/c:$(git rev-parse topic:right/c)
+	BLOB:right/d:$(git rev-parse topic:right/d)
+	blobs:4
+	EOF
+
+	test_cmp_sorted expect out
+'
+
+test_expect_success 'topic, not base, boundary' '
+	test-tool path-walk -- --boundary topic --not base >out &&
+
+	cat >expect <<-EOF &&
+	TREE::$(git rev-parse topic^{tree})
+	TREE::$(git rev-parse base~1^{tree})
+	TREE:left/:$(git rev-parse base~1:left)
+	TREE:right/:$(git rev-parse topic:right)
+	TREE:right/:$(git rev-parse base~1:right)
+	trees:5
+	BLOB:a:$(git rev-parse base~1:a)
+	BLOB:left/b:$(git rev-parse base~1:left/b)
+	BLOB:right/c:$(git rev-parse base~1:right/c)
+	BLOB:right/c:$(git rev-parse topic:right/c)
+	BLOB:right/d:$(git rev-parse base~1:right/d)
+	blobs:5
+	EOF
+
+	test_cmp_sorted expect out
+'
+
+test_done

From ca51c2d6d231eed6c89294197dabe62593a9849e Mon Sep 17 00:00:00 2001
From: Derrick Stolee <derrickstolee@github.com>
Date: Thu, 31 Oct 2024 06:27:01 +0000
Subject: [PATCH 65/69] path-walk: allow consumer to specify object types

We add the ability to filter the object types in the path-walk API so
the callback function is called fewer times.

This adds the ability to ask for the commits in a list, as well. We
re-use the empty string as the path for this set of objects because
they are passed directly to the callback function instead of being
tracked as part of the 'path_stack'.

Future changes will add the ability to visit annotated tags.

Signed-off-by: Derrick Stolee <stolee@gmail.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 Documentation/technical/api-path-walk.txt |  9 ++++
 path-walk.c                               | 33 ++++++++++--
 path-walk.h                               | 14 ++++-
 t/helper/test-path-walk.c                 | 17 +++++-
 t/t6601-path-walk.sh                      | 63 +++++++++++++++++++++++
 5 files changed, 129 insertions(+), 7 deletions(-)

diff --git a/Documentation/technical/api-path-walk.txt b/Documentation/technical/api-path-walk.txt
index 662162ec70b38b..dce553b6114e1c 100644
--- a/Documentation/technical/api-path-walk.txt
+++ b/Documentation/technical/api-path-walk.txt
@@ -39,6 +39,15 @@ It is also important that you do not specify the `--objects` flag for the
 the objects will be walked in a separate way based on those starting
 commits.
 
+`commits`, `blobs`, `trees`::
+	By default, these members are enabled and signal that the path-walk
+	API should call the `path_fn` on objects of these types. Specialized
+	applications could disable some options to make it simpler to walk
+	the objects or to have fewer calls to `path_fn`.
++
+While it is possible to walk only commits in this way, consumers would be
+better off using the revision walk API instead.
+
 Examples
 --------
 
diff --git a/path-walk.c b/path-walk.c
index 9dc56aff88c297..14ad322bdd2470 100644
--- a/path-walk.c
+++ b/path-walk.c
@@ -98,6 +98,10 @@ static int add_children(struct path_walk_context *ctx,
 		if (S_ISGITLINK(entry.mode))
 			continue;
 
+		/* If the caller doesn't want blobs, then don't bother. */
+		if (!ctx->info->blobs && type == OBJ_BLOB)
+			continue;
+
 		if (type == OBJ_TREE) {
 			struct tree *child = lookup_tree(ctx->repo, &entry.oid);
 			o = child ? &child->object : NULL;
@@ -154,9 +158,11 @@ static int walk_path(struct path_walk_context *ctx,
 
 	list = strmap_get(&ctx->paths_to_lists, path);
 
-	/* Evaluate function pointer on this data. */
-	ret = ctx->info->path_fn(path, &list->oids, list->type,
-				 ctx->info->path_fn_data);
+	/* Evaluate function pointer on this data, if requested. */
+	if ((list->type == OBJ_TREE && ctx->info->trees) ||
+	    (list->type == OBJ_BLOB && ctx->info->blobs))
+		ret = ctx->info->path_fn(path, &list->oids, list->type,
+					ctx->info->path_fn_data);
 
 	/* Expand data for children. */
 	if (list->type == OBJ_TREE) {
@@ -198,6 +204,7 @@ int walk_objects_by_path(struct path_walk_info *info)
 	size_t commits_nr = 0, paths_nr = 0;
 	struct commit *c;
 	struct type_and_oid_list *root_tree_list;
+	struct type_and_oid_list *commit_list;
 	struct path_walk_context ctx = {
 		.repo = info->revs->repo,
 		.revs = info->revs,
@@ -209,6 +216,9 @@ int walk_objects_by_path(struct path_walk_info *info)
 
 	trace2_region_enter("path-walk", "commit-walk", info->revs->repo);
 
+	CALLOC_ARRAY(commit_list, 1);
+	commit_list->type = OBJ_COMMIT;
+
 	/* Insert a single list for the root tree into the paths. */
 	CALLOC_ARRAY(root_tree_list, 1);
 	root_tree_list->type = OBJ_TREE;
@@ -219,10 +229,18 @@ int walk_objects_by_path(struct path_walk_info *info)
 		die(_("failed to setup revision walk"));
 
 	while ((c = get_revision(info->revs))) {
-		struct object_id *oid = get_commit_tree_oid(c);
+		struct object_id *oid;
 		struct tree *t;
 		commits_nr++;
 
+		if (info->commits)
+			oid_array_append(&commit_list->oids,
+					 &c->object.oid);
+
+		/* If we only care about commits, then skip trees. */
+		if (!info->trees && !info->blobs)
+			continue;
+
 		oid = get_commit_tree_oid(c);
 		t = lookup_tree(info->revs->repo, oid);
 
@@ -240,6 +258,13 @@ int walk_objects_by_path(struct path_walk_info *info)
 	trace2_data_intmax("path-walk", ctx.repo, "commits", commits_nr);
 	trace2_region_leave("path-walk", "commit-walk", info->revs->repo);
 
+	/* Track all commits. */
+	if (info->commits)
+		ret = info->path_fn("", &commit_list->oids, OBJ_COMMIT,
+				    info->path_fn_data);
+	oid_array_clear(&commit_list->oids);
+	free(commit_list);
+
 	trace2_region_enter("path-walk", "path-walk", info->revs->repo);
 	while (!ret && ctx.path_stack.nr) {
 		char *path = ctx.path_stack.items[ctx.path_stack.nr - 1].string;
diff --git a/path-walk.h b/path-walk.h
index c9e94a98bc8f6b..2d2afc29b47d58 100644
--- a/path-walk.h
+++ b/path-walk.h
@@ -30,9 +30,21 @@ struct path_walk_info {
 	 */
 	path_fn path_fn;
 	void *path_fn_data;
+
+	/**
+	 * Initialize which object types the path_fn should be called on. This
+	 * could also limit the walk to skip blobs if not set.
+	 */
+	int commits;
+	int trees;
+	int blobs;
 };
 
-#define PATH_WALK_INFO_INIT { 0 }
+#define PATH_WALK_INFO_INIT {   \
+	.blobs = 1,		\
+	.trees = 1,		\
+	.commits = 1,		\
+}
 
 /**
  * Given the configuration of 'info', walk the commits based on 'info->revs' and
diff --git a/t/helper/test-path-walk.c b/t/helper/test-path-walk.c
index 3c48f017fa0b06..37c5e3e31e813b 100644
--- a/t/helper/test-path-walk.c
+++ b/t/helper/test-path-walk.c
@@ -18,6 +18,7 @@ static const char * const path_walk_usage[] = {
 };
 
 struct path_walk_test_data {
+	uintmax_t commit_nr;
 	uintmax_t tree_nr;
 	uintmax_t blob_nr;
 };
@@ -29,6 +30,11 @@ static int emit_block(const char *path, struct oid_array *oids,
 	const char *typestr;
 
 	switch (type) {
+	case OBJ_COMMIT:
+		typestr = "COMMIT";
+		tdata->commit_nr += oids->nr;
+		break;
+
 	case OBJ_TREE:
 		typestr = "TREE";
 		tdata->tree_nr += oids->nr;
@@ -56,6 +62,12 @@ int cmd__path_walk(int argc, const char **argv)
 	struct path_walk_info info = PATH_WALK_INFO_INIT;
 	struct path_walk_test_data data = { 0 };
 	struct option options[] = {
+		OPT_BOOL(0, "blobs", &info.blobs,
+			 N_("toggle inclusion of blob objects")),
+		OPT_BOOL(0, "commits", &info.commits,
+			 N_("toggle inclusion of commit objects")),
+		OPT_BOOL(0, "trees", &info.trees,
+			 N_("toggle inclusion of tree objects")),
 		OPT_END(),
 	};
 
@@ -78,9 +90,10 @@ int cmd__path_walk(int argc, const char **argv)
 
 	res = walk_objects_by_path(&info);
 
-	printf("trees:%" PRIuMAX "\n"
+	printf("commits:%" PRIuMAX "\n"
+	       "trees:%" PRIuMAX "\n"
 	       "blobs:%" PRIuMAX "\n",
-	       data.tree_nr, data.blob_nr);
+	       data.commit_nr, data.tree_nr, data.blob_nr);
 
 	return res;
 }
diff --git a/t/t6601-path-walk.sh b/t/t6601-path-walk.sh
index 1f277b8829139a..4b16a0a3c80512 100755
--- a/t/t6601-path-walk.sh
+++ b/t/t6601-path-walk.sh
@@ -31,6 +31,11 @@ test_expect_success 'all' '
 	test-tool path-walk -- --all >out &&
 
 	cat >expect <<-EOF &&
+	COMMIT::$(git rev-parse topic)
+	COMMIT::$(git rev-parse base)
+	COMMIT::$(git rev-parse base~1)
+	COMMIT::$(git rev-parse base~2)
+	commits:4
 	TREE::$(git rev-parse topic^{tree})
 	TREE::$(git rev-parse base^{tree})
 	TREE::$(git rev-parse base~1^{tree})
@@ -57,6 +62,10 @@ test_expect_success 'topic only' '
 	test-tool path-walk -- topic >out &&
 
 	cat >expect <<-EOF &&
+	COMMIT::$(git rev-parse topic)
+	COMMIT::$(git rev-parse base~1)
+	COMMIT::$(git rev-parse base~2)
+	commits:3
 	TREE::$(git rev-parse topic^{tree})
 	TREE::$(git rev-parse base~1^{tree})
 	TREE::$(git rev-parse base~2^{tree})
@@ -80,6 +89,8 @@ test_expect_success 'topic, not base' '
 	test-tool path-walk -- topic --not base >out &&
 
 	cat >expect <<-EOF &&
+	COMMIT::$(git rev-parse topic)
+	commits:1
 	TREE::$(git rev-parse topic^{tree})
 	TREE:left/:$(git rev-parse topic:left)
 	TREE:right/:$(git rev-parse topic:right)
@@ -94,10 +105,62 @@ test_expect_success 'topic, not base' '
 	test_cmp_sorted expect out
 '
 
+test_expect_success 'topic, not base, only blobs' '
+	test-tool path-walk --no-trees --no-commits \
+		-- topic --not base >out &&
+
+	cat >expect <<-EOF &&
+	commits:0
+	trees:0
+	BLOB:a:$(git rev-parse topic:a)
+	BLOB:left/b:$(git rev-parse topic:left/b)
+	BLOB:right/c:$(git rev-parse topic:right/c)
+	BLOB:right/d:$(git rev-parse topic:right/d)
+	blobs:4
+	EOF
+
+	test_cmp_sorted expect out
+'
+
+# No, this doesn't make a lot of sense for the path-walk API,
+# but it is possible to do.
+test_expect_success 'topic, not base, only commits' '
+	test-tool path-walk --no-blobs --no-trees \
+		-- topic --not base >out &&
+
+	cat >expect <<-EOF &&
+	COMMIT::$(git rev-parse topic)
+	commits:1
+	trees:0
+	blobs:0
+	EOF
+
+	test_cmp_sorted expect out
+'
+
+test_expect_success 'topic, not base, only trees' '
+	test-tool path-walk --no-blobs --no-commits \
+		-- topic --not base >out &&
+
+	cat >expect <<-EOF &&
+	commits:0
+	TREE::$(git rev-parse topic^{tree})
+	TREE:left/:$(git rev-parse topic:left)
+	TREE:right/:$(git rev-parse topic:right)
+	trees:3
+	blobs:0
+	EOF
+
+	test_cmp_sorted expect out
+'
+
 test_expect_success 'topic, not base, boundary' '
 	test-tool path-walk -- --boundary topic --not base >out &&
 
 	cat >expect <<-EOF &&
+	COMMIT::$(git rev-parse topic)
+	COMMIT::$(git rev-parse base~1)
+	commits:2
 	TREE::$(git rev-parse topic^{tree})
 	TREE::$(git rev-parse base~1^{tree})
 	TREE:left/:$(git rev-parse base~1:left)

From 8fa5e62bac3896a626442969ea8de0975f6a607c Mon Sep 17 00:00:00 2001
From: Derrick Stolee <stolee@gmail.com>
Date: Thu, 31 Oct 2024 06:27:02 +0000
Subject: [PATCH 66/69] path-walk: visit tags and cached objects

The rev_info that is specified for a path-walk traversal may specify
visiting tag refs (both lightweight and annotated) and also may specify
indexed objects (blobs and trees). Update the path-walk API to walk
these objects as well.

When walking tags, we need to peel the annotated objects until reaching
a non-tag object. If we reach a commit, then we can add it to the
pending objects to make sure we visit it in the commit walk portion. If
we reach a tree, then we will assume that it is a root tree. If we reach
a blob, then we have no good path name and so add it to a new list of
"tagged blobs".

When the rev_info includes the "--indexed-objects" flag, then the
pending set includes blobs and trees found in the cache entries and
cache-tree. The cache entries are usually blobs, though they could be
trees in the case of a sparse index. The cache-tree stores
previously-hashed tree objects but these are cleared out when staging
objects below those paths. We add tests that demonstrate this.

The indexed objects come with a non-NULL 'path' value in the pending
item. This allows us to prepopulate the 'paths_to_lists' strmap with
lists for these paths.

The tricky thing about this walk is that we will want to combine the
indexed objects walk with the commit walk, especially in the future case
of walking objects during a command like 'git repack'.

Whenever possible, we want the objects from the index to be grouped with
similar objects in history. We don't want to miss any paths that appear
only in the index and not in the commit history.

Thus, we need to be careful to let the path stack be populated initially
with only the root tree path (and possibly tags and tagged blobs) and go
through the normal depth-first search. Afterwards, if there are other
paths remaining in the paths_to_lists strmap, we push those paths onto
the stack and visit their objects recursively in the same way.

Signed-off-by: Derrick Stolee <stolee@gmail.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 Documentation/technical/api-path-walk.txt |   2 +-
 path-walk.c                               | 175 +++++++++++++++++++++-
 path-walk.h                               |   2 +
 t/helper/test-path-walk.c                 |  13 +-
 t/t6601-path-walk.sh                      | 154 ++++++++++++++++++-
 5 files changed, 336 insertions(+), 10 deletions(-)

diff --git a/Documentation/technical/api-path-walk.txt b/Documentation/technical/api-path-walk.txt
index dce553b6114e1c..6022c381b7c3f0 100644
--- a/Documentation/technical/api-path-walk.txt
+++ b/Documentation/technical/api-path-walk.txt
@@ -39,7 +39,7 @@ It is also important that you do not specify the `--objects` flag for the
 the objects will be walked in a separate way based on those starting
 commits.
 
-`commits`, `blobs`, `trees`::
+`commits`, `blobs`, `trees`, `tags`::
 	By default, these members are enabled and signal that the path-walk
 	API should call the `path_fn` on objects of these types. Specialized
 	applications could disable some options to make it simpler to walk
diff --git a/path-walk.c b/path-walk.c
index 14ad322bdd2470..eca0e5f3d5b28c 100644
--- a/path-walk.c
+++ b/path-walk.c
@@ -13,10 +13,13 @@
 #include "revision.h"
 #include "string-list.h"
 #include "strmap.h"
+#include "tag.h"
 #include "trace2.h"
 #include "tree.h"
 #include "tree-walk.h"
 
+static const char *root_path = "";
+
 struct type_and_oid_list
 {
 	enum object_type type;
@@ -158,9 +161,13 @@ static int walk_path(struct path_walk_context *ctx,
 
 	list = strmap_get(&ctx->paths_to_lists, path);
 
+	if (!list)
+		BUG("provided path '%s' that had no associated list", path);
+
 	/* Evaluate function pointer on this data, if requested. */
 	if ((list->type == OBJ_TREE && ctx->info->trees) ||
-	    (list->type == OBJ_BLOB && ctx->info->blobs))
+	    (list->type == OBJ_BLOB && ctx->info->blobs) ||
+	    (list->type == OBJ_TAG && ctx->info->tags))
 		ret = ctx->info->path_fn(path, &list->oids, list->type,
 					ctx->info->path_fn_data);
 
@@ -191,6 +198,134 @@ static void clear_strmap(struct strmap *map)
 	strmap_init(map);
 }
 
+static void setup_pending_objects(struct path_walk_info *info,
+				  struct path_walk_context *ctx)
+{
+	struct type_and_oid_list *tags = NULL;
+	struct type_and_oid_list *tagged_blobs = NULL;
+	struct type_and_oid_list *root_tree_list = NULL;
+
+	if (info->tags)
+		CALLOC_ARRAY(tags, 1);
+	if (info->blobs)
+		CALLOC_ARRAY(tagged_blobs, 1);
+	if (info->trees)
+		root_tree_list = strmap_get(&ctx->paths_to_lists, root_path);
+
+	/*
+	 * Pending objects include:
+	 * * Commits at branch tips.
+	 * * Annotated tags at tag tips.
+	 * * Any kind of object at lightweight tag tips.
+	 * * Trees and blobs in the index (with an associated path).
+	 */
+	for (size_t i = 0; i < info->revs->pending.nr; i++) {
+		struct object_array_entry *pending = info->revs->pending.objects + i;
+		struct object *obj = pending->item;
+
+		/* Commits will be picked up by revision walk. */
+		if (obj->type == OBJ_COMMIT)
+			continue;
+
+		/* Navigate annotated tag object chains. */
+		while (obj->type == OBJ_TAG) {
+			struct tag *tag = lookup_tag(info->revs->repo, &obj->oid);
+			if (!tag)
+				break;
+			if (tag->object.flags & SEEN)
+				break;
+			tag->object.flags |= SEEN;
+
+			if (tags)
+				oid_array_append(&tags->oids, &obj->oid);
+			obj = tag->tagged;
+		}
+
+		if (obj->type == OBJ_TAG)
+			continue;
+
+		/* We are now at a non-tag object. */
+		if (obj->flags & SEEN)
+			continue;
+		obj->flags |= SEEN;
+
+		switch (obj->type) {
+		case OBJ_TREE:
+			if (!info->trees)
+				continue;
+			if (pending->path) {
+				struct type_and_oid_list *list;
+				char *path = *pending->path ? xstrfmt("%s/", pending->path)
+							    : xstrdup("");
+				if (!(list = strmap_get(&ctx->paths_to_lists, path))) {
+					CALLOC_ARRAY(list, 1);
+					list->type = OBJ_TREE;
+					strmap_put(&ctx->paths_to_lists, path, list);
+				}
+				oid_array_append(&list->oids, &obj->oid);
+				free(path);
+			} else {
+				/* assume a root tree, such as a lightweight tag. */
+				oid_array_append(&root_tree_list->oids, &obj->oid);
+			}
+			break;
+
+		case OBJ_BLOB:
+			if (!info->blobs)
+				continue;
+			if (pending->path) {
+				struct type_and_oid_list *list;
+				char *path = pending->path;
+				if (!(list = strmap_get(&ctx->paths_to_lists, path))) {
+					CALLOC_ARRAY(list, 1);
+					list->type = OBJ_BLOB;
+					strmap_put(&ctx->paths_to_lists, path, list);
+				}
+				oid_array_append(&list->oids, &obj->oid);
+			} else {
+				/* assume a root tree, such as a lightweight tag. */
+				oid_array_append(&tagged_blobs->oids, &obj->oid);
+			}
+			break;
+
+		case OBJ_COMMIT:
+			/* Make sure it is in the object walk */
+			if (obj != pending->item)
+				add_pending_object(info->revs, obj, "");
+			break;
+
+		default:
+			BUG("should not see any other type here");
+		}
+	}
+
+	/*
+	 * Add tag objects and tagged blobs if they exist.
+	 */
+	if (tagged_blobs) {
+		if (tagged_blobs->oids.nr) {
+			const char *tagged_blob_path = "/tagged-blobs";
+			tagged_blobs->type = OBJ_BLOB;
+			push_to_stack(ctx, tagged_blob_path);
+			strmap_put(&ctx->paths_to_lists, tagged_blob_path, tagged_blobs);
+		} else {
+			oid_array_clear(&tagged_blobs->oids);
+			free(tagged_blobs);
+		}
+	}
+	if (tags) {
+		if (tags->oids.nr) {
+			const char *tag_path = "/tags";
+			tags->type = OBJ_TAG;
+			push_to_stack(ctx, tag_path);
+			strmap_put(&ctx->paths_to_lists, tag_path, tags);
+		} else {
+			oid_array_clear(&tags->oids);
+			free(tags);
+		}
+	}
+}
+
 /**
  * Given the configuration of 'info', walk the commits based on 'info->revs' and
  * call 'info->path_fn' on each discovered path.
@@ -199,7 +334,6 @@ static void clear_strmap(struct strmap *map)
  */
 int walk_objects_by_path(struct path_walk_info *info)
 {
-	const char *root_path = "";
 	int ret = 0;
 	size_t commits_nr = 0, paths_nr = 0;
 	struct commit *c;
@@ -219,15 +353,31 @@ int walk_objects_by_path(struct path_walk_info *info)
 	CALLOC_ARRAY(commit_list, 1);
 	commit_list->type = OBJ_COMMIT;
 
+	if (info->tags)
+		info->revs->tag_objects = 1;
+
 	/* Insert a single list for the root tree into the paths. */
 	CALLOC_ARRAY(root_tree_list, 1);
 	root_tree_list->type = OBJ_TREE;
 	strmap_put(&ctx.paths_to_lists, root_path, root_tree_list);
 	push_to_stack(&ctx, root_path);
 
+	/*
+	 * Set these values before preparing the walk to catch
+	 * lightweight tags pointing to non-commits and indexed objects.
+	 */
+	info->revs->blob_objects = info->blobs;
+	info->revs->tree_objects = info->trees;
+
 	if (prepare_revision_walk(info->revs))
 		die(_("failed to setup revision walk"));
 
+	info->revs->blob_objects = info->revs->tree_objects = 0;
+
+	trace2_region_enter("path-walk", "pending-walk", info->revs->repo);
+	setup_pending_objects(info, &ctx);
+	trace2_region_leave("path-walk", "pending-walk", info->revs->repo);
+
 	while ((c = get_revision(info->revs))) {
 		struct object_id *oid;
 		struct tree *t;
@@ -275,6 +425,27 @@ int walk_objects_by_path(struct path_walk_info *info)
 
 		free(path);
 	}
+
+	/* Are there paths remaining? Likely they are from indexed objects. */
+	if (!strmap_empty(&ctx.paths_to_lists)) {
+		struct hashmap_iter iter;
+		struct strmap_entry *entry;
+
+		strmap_for_each_entry(&ctx.paths_to_lists, &iter, entry) {
+			push_to_stack(&ctx, entry->key);
+		}
+
+		while (!ret && ctx.path_stack.nr) {
+			char *path = ctx.path_stack.items[ctx.path_stack.nr - 1].string;
+			ctx.path_stack.nr--;
+			paths_nr++;
+
+			ret = walk_path(&ctx, path);
+
+			free(path);
+		}
+	}
+
 	trace2_data_intmax("path-walk", ctx.repo, "paths", paths_nr);
 	trace2_region_leave("path-walk", "path-walk", info->revs->repo);
 
diff --git a/path-walk.h b/path-walk.h
index 2d2afc29b47d58..ca839f873e4dea 100644
--- a/path-walk.h
+++ b/path-walk.h
@@ -38,12 +38,14 @@ struct path_walk_info {
 	int commits;
 	int trees;
 	int blobs;
+	int tags;
 };
 
 #define PATH_WALK_INFO_INIT {   \
 	.blobs = 1,		\
 	.trees = 1,		\
 	.commits = 1,		\
+	.tags = 1,		\
 }
 
 /**
diff --git a/t/helper/test-path-walk.c b/t/helper/test-path-walk.c
index 37c5e3e31e813b..c6c60d68749dae 100644
--- a/t/helper/test-path-walk.c
+++ b/t/helper/test-path-walk.c
@@ -21,6 +21,7 @@ struct path_walk_test_data {
 	uintmax_t commit_nr;
 	uintmax_t tree_nr;
 	uintmax_t blob_nr;
+	uintmax_t tag_nr;
 };
 
 static int emit_block(const char *path, struct oid_array *oids,
@@ -45,6 +46,11 @@ static int emit_block(const char *path, struct oid_array *oids,
 		tdata->blob_nr += oids->nr;
 		break;
 
+	case OBJ_TAG:
+		typestr = "TAG";
+		tdata->tag_nr += oids->nr;
+		break;
+
 	default:
 		BUG("we do not understand this type");
 	}
@@ -66,6 +72,8 @@ int cmd__path_walk(int argc, const char **argv)
 			 N_("toggle inclusion of blob objects")),
 		OPT_BOOL(0, "commits", &info.commits,
 			 N_("toggle inclusion of commit objects")),
+		OPT_BOOL(0, "tags", &info.tags,
+			 N_("toggle inclusion of tag objects")),
 		OPT_BOOL(0, "trees", &info.trees,
 			 N_("toggle inclusion of tree objects")),
 		OPT_END(),
@@ -92,8 +100,9 @@ int cmd__path_walk(int argc, const char **argv)
 
 	printf("commits:%" PRIuMAX "\n"
 	       "trees:%" PRIuMAX "\n"
-	       "blobs:%" PRIuMAX "\n",
-	       data.commit_nr, data.tree_nr, data.blob_nr);
+	       "blobs:%" PRIuMAX "\n"
+	       "tags:%" PRIuMAX "\n",
+	       data.commit_nr, data.tree_nr, data.blob_nr, data.tag_nr);
 
 	return res;
 }
diff --git a/t/t6601-path-walk.sh b/t/t6601-path-walk.sh
index 4b16a0a3c80512..5ed6c79fbd1f18 100755
--- a/t/t6601-path-walk.sh
+++ b/t/t6601-path-walk.sh
@@ -7,24 +7,55 @@ test_description='direct path-walk API tests'
 test_expect_success 'setup test repository' '
 	git checkout -b base &&
 
+	# Make some objects that will only be reachable
+	# via non-commit tags.
+	mkdir child &&
+	echo file >child/file &&
+	git add child &&
+	git commit -m "will abandon" &&
+	git tag -a -m "tree" tree-tag HEAD^{tree} &&
+	echo file2 >file2 &&
+	git add file2 &&
+	git commit --amend -m "will abandon" &&
+	git tag tree-tag2 HEAD^{tree} &&
+
+	echo blob >file &&
+	blob_oid=$(git hash-object -t blob -w --stdin <file) &&
+	git tag -a -m "blob" blob-tag "$blob_oid" &&
+	echo blob2 >file2 &&
+	blob2_oid=$(git hash-object -t blob -w --stdin <file2) &&
+	git tag blob-tag2 "$blob2_oid" &&
+
+	rm -fr child file file2 &&
+
 	mkdir left &&
 	mkdir right &&
 	echo a >a &&
 	echo b >left/b &&
 	echo c >right/c &&
 	git add . &&
-	git commit -m "first" &&
+	git commit --amend -m "first" &&
+	git tag -m "first" first HEAD &&
 
 	echo d >right/d &&
 	git add right &&
 	git commit -m "second" &&
+	git tag -a -m "second (under)" second.1 HEAD &&
+	git tag -a -m "second (top)" second.2 second.1 &&
 
+	# Set up file/dir collision in history.
+	rm a &&
+	mkdir a &&
+	echo a >a/a &&
 	echo bb >left/b &&
-	git commit -a -m "third" &&
+	git add a left &&
+	git commit -m "third" &&
+	git tag -a -m "third" third &&
 
 	git checkout -b topic HEAD~1 &&
 	echo cc >right/c &&
-	git commit -a -m "topic"
+	git commit -a -m "topic" &&
+	git tag -a -m "fourth" fourth
 '
 
 test_expect_success 'all' '
@@ -40,19 +71,104 @@ test_expect_success 'all' '
 	TREE::$(git rev-parse base^{tree})
 	TREE::$(git rev-parse base~1^{tree})
 	TREE::$(git rev-parse base~2^{tree})
+	TREE::$(git rev-parse refs/tags/tree-tag^{})
+	TREE::$(git rev-parse refs/tags/tree-tag2^{})
+	TREE:a/:$(git rev-parse base:a)
 	TREE:left/:$(git rev-parse base:left)
 	TREE:left/:$(git rev-parse base~2:left)
 	TREE:right/:$(git rev-parse topic:right)
 	TREE:right/:$(git rev-parse base~1:right)
 	TREE:right/:$(git rev-parse base~2:right)
-	trees:9
+	TREE:child/:$(git rev-parse refs/tags/tree-tag^{}:child)
+	trees:13
 	BLOB:a:$(git rev-parse base~2:a)
+	BLOB:file2:$(git rev-parse refs/tags/tree-tag2^{}:file2)
 	BLOB:left/b:$(git rev-parse base~2:left/b)
 	BLOB:left/b:$(git rev-parse base:left/b)
 	BLOB:right/c:$(git rev-parse base~2:right/c)
 	BLOB:right/c:$(git rev-parse topic:right/c)
 	BLOB:right/d:$(git rev-parse base~1:right/d)
-	blobs:6
+	BLOB:/tagged-blobs:$(git rev-parse refs/tags/blob-tag^{})
+	BLOB:/tagged-blobs:$(git rev-parse refs/tags/blob-tag2^{})
+	BLOB:child/file:$(git rev-parse refs/tags/tree-tag^{}:child/file)
+	blobs:10
+	TAG:/tags:$(git rev-parse refs/tags/first)
+	TAG:/tags:$(git rev-parse refs/tags/second.1)
+	TAG:/tags:$(git rev-parse refs/tags/second.2)
+	TAG:/tags:$(git rev-parse refs/tags/third)
+	TAG:/tags:$(git rev-parse refs/tags/fourth)
+	TAG:/tags:$(git rev-parse refs/tags/tree-tag)
+	TAG:/tags:$(git rev-parse refs/tags/blob-tag)
+	tags:7
+	EOF
+
+	test_cmp_sorted expect out
+'
+
+test_expect_success 'indexed objects' '
+	test_when_finished git reset --hard &&
+
+	# stage change into index, adding a blob but
+	# also invalidating the cache-tree for the root
+	# and the "left" directory.
+	echo bogus >left/c &&
+	git add left &&
+
+	test-tool path-walk -- --indexed-objects >out &&
+
+	cat >expect <<-EOF &&
+	commits:0
+	TREE:right/:$(git rev-parse topic:right)
+	trees:1
+	BLOB:a:$(git rev-parse HEAD:a)
+	BLOB:left/b:$(git rev-parse HEAD:left/b)
+	BLOB:left/c:$(git rev-parse :left/c)
+	BLOB:right/c:$(git rev-parse HEAD:right/c)
+	BLOB:right/d:$(git rev-parse HEAD:right/d)
+	blobs:5
+	tags:0
+	EOF
+
+	test_cmp_sorted expect out
+'
+
+test_expect_success 'branches and indexed objects mix well' '
+	test_when_finished git reset --hard &&
+
+	# stage change into index, adding a blob but
+	# also invalidating the cache-tree for the root
+	# and the "right" directory.
+	echo fake >right/d &&
+	git add right &&
+
+	test-tool path-walk -- --indexed-objects --branches >out &&
+
+	cat >expect <<-EOF &&
+	COMMIT::$(git rev-parse topic)
+	COMMIT::$(git rev-parse base)
+	COMMIT::$(git rev-parse base~1)
+	COMMIT::$(git rev-parse base~2)
+	commits:4
+	TREE::$(git rev-parse topic^{tree})
+	TREE::$(git rev-parse base^{tree})
+	TREE::$(git rev-parse base~1^{tree})
+	TREE::$(git rev-parse base~2^{tree})
+	TREE:a/:$(git rev-parse base:a)
+	TREE:left/:$(git rev-parse base:left)
+	TREE:left/:$(git rev-parse base~2:left)
+	TREE:right/:$(git rev-parse topic:right)
+	TREE:right/:$(git rev-parse base~1:right)
+	TREE:right/:$(git rev-parse base~2:right)
+	trees:10
+	BLOB:a:$(git rev-parse base~2:a)
+	BLOB:left/b:$(git rev-parse base:left/b)
+	BLOB:left/b:$(git rev-parse base~2:left/b)
+	BLOB:right/c:$(git rev-parse base~2:right/c)
+	BLOB:right/c:$(git rev-parse topic:right/c)
+	BLOB:right/d:$(git rev-parse base~1:right/d)
+	BLOB:right/d:$(git rev-parse :right/d)
+	blobs:7
+	tags:0
 	EOF
 
 	test_cmp_sorted expect out
@@ -80,6 +196,7 @@ test_expect_success 'topic only' '
 	BLOB:right/c:$(git rev-parse topic:right/c)
 	BLOB:right/d:$(git rev-parse base~1:right/d)
 	blobs:5
+	tags:0
 	EOF
 
 	test_cmp_sorted expect out
@@ -100,6 +217,7 @@ test_expect_success 'topic, not base' '
 	BLOB:right/c:$(git rev-parse topic:right/c)
 	BLOB:right/d:$(git rev-parse topic:right/d)
 	blobs:4
+	tags:0
 	EOF
 
 	test_cmp_sorted expect out
@@ -117,6 +235,7 @@ test_expect_success 'topic, not base, only blobs' '
 	BLOB:right/c:$(git rev-parse topic:right/c)
 	BLOB:right/d:$(git rev-parse topic:right/d)
 	blobs:4
+	tags:0
 	EOF
 
 	test_cmp_sorted expect out
@@ -133,6 +252,7 @@ test_expect_success 'topic, not base, only commits' '
 	commits:1
 	trees:0
 	blobs:0
+	tags:0
 	EOF
 
 	test_cmp_sorted expect out
@@ -149,6 +269,7 @@ test_expect_success 'topic, not base, only trees' '
 	TREE:right/:$(git rev-parse topic:right)
 	trees:3
 	blobs:0
+	tags:0
 	EOF
 
 	test_cmp_sorted expect out
@@ -173,9 +294,32 @@ test_expect_success 'topic, not base, boundary' '
 	BLOB:right/c:$(git rev-parse topic:right/c)
 	BLOB:right/d:$(git rev-parse base~1:right/d)
 	blobs:5
+	tags:0
 	EOF
 
 	test_cmp_sorted expect out
 '
 
+test_expect_success 'trees are reported exactly once' '
+	test_when_finished "rm -rf unique-trees" &&
+	test_create_repo unique-trees &&
+	(
+		cd unique-trees &&
+		mkdir initial &&
+		test_commit initial/file &&
+
+		git switch -c move-to-top &&
+		git mv initial/file.t ./ &&
+		test_tick &&
+		git commit -m moved &&
+
+		git update-ref refs/heads/other HEAD
+	) &&
+
+	test-tool -C unique-trees path-walk -- --all >out &&
+	tree=$(git -C unique-trees rev-parse HEAD:) &&
+	grep "$tree" out >out-filtered &&
+	test_line_count = 1 out-filtered
+'
+
 test_done

From c99f26cb1c347ff76fdb06ebcae9a4eb34372a64 Mon Sep 17 00:00:00 2001
From: Derrick Stolee <stolee@gmail.com>
Date: Thu, 31 Oct 2024 06:27:03 +0000
Subject: [PATCH 67/69] path-walk: mark trees and blobs as UNINTERESTING

When the input rev_info has UNINTERESTING starting points, we want to be
sure that the UNINTERESTING flag is passed appropriately through the
objects. To match how this is done in places such as 'git pack-objects', we
use the mark_edges_uninteresting() method.

This method has an option for using the "sparse" walk, which is similar in
spirit to the path-walk API's walk. To be sure to keep it independent, add a
new 'prune_all_uninteresting' option to the path_walk_info struct.

To check how the UNINTERESTING flag is spread through our objects, extend the
'test-tool path-walk' command to output whether or not an object has that
flag. This changes our tests significantly, including the removal of some
objects that were previously visited due to the incomplete implementation.

Signed-off-by: Derrick Stolee <stolee@gmail.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 Documentation/technical/api-path-walk.txt |  8 +++
 path-walk.c                               | 73 +++++++++++++++++++++
 path-walk.h                               |  8 +++
 t/helper/test-path-walk.c                 | 10 ++-
 t/t6601-path-walk.sh                      | 79 +++++++++++++++++------
 5 files changed, 157 insertions(+), 21 deletions(-)

diff --git a/Documentation/technical/api-path-walk.txt b/Documentation/technical/api-path-walk.txt
index 6022c381b7c3f0..7075d0d5ab50fd 100644
--- a/Documentation/technical/api-path-walk.txt
+++ b/Documentation/technical/api-path-walk.txt
@@ -48,6 +48,14 @@ commits.
 While it is possible to walk only commits in this way, consumers would be
 better off using the revision walk API instead.
 
+`prune_all_uninteresting`::
+	By default, all reachable paths are emitted by the path-walk API.
+	This option allows consumers to declare that they are not
+	interested in paths where all included objects are marked with the
+	`UNINTERESTING` flag. This requires using the `boundary` option in
+	the revision walk so that the walk emits commits marked with the
+	`UNINTERESTING` flag.
+
 Examples
 --------
 
diff --git a/path-walk.c b/path-walk.c
index eca0e5f3d5b28c..6f658c28307077 100644
--- a/path-walk.c
+++ b/path-walk.c
@@ -8,6 +8,7 @@
 #include "dir.h"
 #include "hashmap.h"
 #include "hex.h"
+#include "list-objects.h"
 #include "object.h"
 #include "oid-array.h"
 #include "revision.h"
@@ -24,6 +25,7 @@ struct type_and_oid_list
 {
 	enum object_type type;
 	struct oid_array oids;
+	int maybe_interesting;
 };
 
 #define TYPE_AND_OID_LIST_INIT { \
@@ -140,6 +142,9 @@ static int add_children(struct path_walk_context *ctx,
 		if (o->flags & SEEN)
 			continue;
 		o->flags |= SEEN;
+
+		if (!(o->flags & UNINTERESTING))
+			list->maybe_interesting = 1;
 		oid_array_append(&list->oids, &entry.oid);
 	}
 
@@ -164,6 +169,43 @@ static int walk_path(struct path_walk_context *ctx,
 	if (!list)
 		BUG("provided path '%s' that had no associated list", path);
 
+	if (ctx->info->prune_all_uninteresting) {
+		/*
+		 * This is true if all objects were UNINTERESTING
+		 * when added to the list.
+		 */
+		if (!list->maybe_interesting)
+			return 0;
+
+		/*
+		 * But it's still possible that the objects were set
+		 * as UNINTERESTING after being added. Do a quick check.
+		 */
+		list->maybe_interesting = 0;
+		for (size_t i = 0;
+		     !list->maybe_interesting && i < list->oids.nr;
+		     i++) {
+			if (list->type == OBJ_TREE) {
+				struct tree *t = lookup_tree(ctx->repo,
+							     &list->oids.oid[i]);
+				if (t && !(t->object.flags & UNINTERESTING))
+					list->maybe_interesting = 1;
+			} else if (list->type == OBJ_BLOB) {
+				struct blob *b = lookup_blob(ctx->repo,
+							     &list->oids.oid[i]);
+				if (b && !(b->object.flags & UNINTERESTING))
+					list->maybe_interesting = 1;
+			} else {
+				/* Tags are always interesting if visited. */
+				list->maybe_interesting = 1;
+			}
+		}
+
+		/* We have confirmed that all objects are UNINTERESTING. */
+		if (!list->maybe_interesting)
+			return 0;
+	}
+
 	/* Evaluate function pointer on this data, if requested. */
 	if ((list->type == OBJ_TREE && ctx->info->trees) ||
 	    (list->type == OBJ_BLOB && ctx->info->blobs) ||
@@ -198,6 +240,26 @@ static void clear_strmap(struct strmap *map)
 	strmap_init(map);
 }
 
+static struct repository *edge_repo;
+static struct type_and_oid_list *edge_tree_list;
+
+static void show_edge(struct commit *commit)
+{
+	struct tree *t = repo_get_commit_tree(edge_repo, commit);
+
+	if (!t)
+		return;
+
+	if (commit->object.flags & UNINTERESTING)
+		t->object.flags |= UNINTERESTING;
+
+	if (t->object.flags & SEEN)
+		return;
+	t->object.flags |= SEEN;
+
+	oid_array_append(&edge_tree_list->oids, &t->object.oid);
+}
+
 static void setup_pending_objects(struct path_walk_info *info,
 				  struct path_walk_context *ctx)
 {
@@ -306,6 +368,7 @@ static void setup_pending_objects(struct path_walk_info *info,
 		if (tagged_blobs->oids.nr) {
 			const char *tagged_blob_path = "/tagged-blobs";
 			tagged_blobs->type = OBJ_BLOB;
+			tagged_blobs->maybe_interesting = 1;
 			push_to_stack(ctx, tagged_blob_path);
 			strmap_put(&ctx->paths_to_lists, tagged_blob_path, tagged_blobs);
 		} else {
@@ -317,6 +380,7 @@ static void setup_pending_objects(struct path_walk_info *info,
 		if (tags->oids.nr) {
 			const char *tag_path = "/tags";
 			tags->type = OBJ_TAG;
+			tags->maybe_interesting = 1;
 			push_to_stack(ctx, tag_path);
 			strmap_put(&ctx->paths_to_lists, tag_path, tags);
 		} else {
@@ -359,6 +423,7 @@ int walk_objects_by_path(struct path_walk_info *info)
 	/* Insert a single list for the root tree into the paths. */
 	CALLOC_ARRAY(root_tree_list, 1);
 	root_tree_list->type = OBJ_TREE;
+	root_tree_list->maybe_interesting = 1;
 	strmap_put(&ctx.paths_to_lists, root_path, root_tree_list);
 	push_to_stack(&ctx, root_path);
 
@@ -372,6 +437,14 @@ int walk_objects_by_path(struct path_walk_info *info)
 	if (prepare_revision_walk(info->revs))
 		die(_("failed to setup revision walk"));
 
+	/* Walk trees to mark them as UNINTERESTING. */
+	edge_repo = info->revs->repo;
+	edge_tree_list = root_tree_list;
+	mark_edges_uninteresting(info->revs, show_edge,
+				 info->prune_all_uninteresting);
+	edge_repo = NULL;
+	edge_tree_list = NULL;
+
 	info->revs->blob_objects = info->revs->tree_objects = 0;
 
 	trace2_region_enter("path-walk", "pending-walk", info->revs->repo);
diff --git a/path-walk.h b/path-walk.h
index ca839f873e4dea..de0db007dc9a5f 100644
--- a/path-walk.h
+++ b/path-walk.h
@@ -39,6 +39,14 @@ struct path_walk_info {
 	int trees;
 	int blobs;
 	int tags;
+
+	/**
+	 * When 'prune_all_uninteresting' is set and a path has all objects
+	 * marked as UNINTERESTING, then the path-walk will not visit those
+	 * objects. It will not call path_fn on those objects and will not
+	 * walk the children of such trees.
+	 */
+	int prune_all_uninteresting;
 };
 
 #define PATH_WALK_INFO_INIT {   \
diff --git a/t/helper/test-path-walk.c b/t/helper/test-path-walk.c
index c6c60d68749dae..06b103d87607dd 100644
--- a/t/helper/test-path-walk.c
+++ b/t/helper/test-path-walk.c
@@ -55,8 +55,12 @@ static int emit_block(const char *path, struct oid_array *oids,
 		BUG("we do not understand this type");
 	}
 
-	for (size_t i = 0; i < oids->nr; i++)
-		printf("%s:%s:%s\n", typestr, path, oid_to_hex(&oids->oid[i]));
+	for (size_t i = 0; i < oids->nr; i++) {
+		struct object *o = lookup_unknown_object(the_repository,
+							 &oids->oid[i]);
+		printf("%s:%s:%s%s\n", typestr, path, oid_to_hex(&oids->oid[i]),
+		       o->flags & UNINTERESTING ? ":UNINTERESTING" : "");
+	}
 
 	return 0;
 }
@@ -76,6 +80,8 @@ int cmd__path_walk(int argc, const char **argv)
 			 N_("toggle inclusion of tag objects")),
 		OPT_BOOL(0, "trees", &info.trees,
 			 N_("toggle inclusion of tree objects")),
+		OPT_BOOL(0, "prune", &info.prune_all_uninteresting,
+			 N_("toggle pruning of uninteresting paths")),
 		OPT_END(),
 	};
 
diff --git a/t/t6601-path-walk.sh b/t/t6601-path-walk.sh
index 5ed6c79fbd1f18..a561c21d484550 100755
--- a/t/t6601-path-walk.sh
+++ b/t/t6601-path-walk.sh
@@ -209,13 +209,13 @@ test_expect_success 'topic, not base' '
 	COMMIT::$(git rev-parse topic)
 	commits:1
 	TREE::$(git rev-parse topic^{tree})
-	TREE:left/:$(git rev-parse topic:left)
+	TREE:left/:$(git rev-parse base~1:left):UNINTERESTING
 	TREE:right/:$(git rev-parse topic:right)
 	trees:3
-	BLOB:a:$(git rev-parse topic:a)
-	BLOB:left/b:$(git rev-parse topic:left/b)
+	BLOB:a:$(git rev-parse base~1:a):UNINTERESTING
+	BLOB:left/b:$(git rev-parse base~1:left/b):UNINTERESTING
 	BLOB:right/c:$(git rev-parse topic:right/c)
-	BLOB:right/d:$(git rev-parse topic:right/d)
+	BLOB:right/d:$(git rev-parse base~1:right/d):UNINTERESTING
 	blobs:4
 	tags:0
 	EOF
@@ -223,6 +223,29 @@ test_expect_success 'topic, not base' '
 	test_cmp_sorted expect out
 '
 
+test_expect_success 'fourth, blob-tag2, not base' '
+	test-tool path-walk -- fourth blob-tag2 --not base >out &&
+
+	cat >expect <<-EOF &&
+	COMMIT::$(git rev-parse topic)
+	commits:1
+	TREE::$(git rev-parse topic^{tree})
+	TREE:left/:$(git rev-parse base~1:left):UNINTERESTING
+	TREE:right/:$(git rev-parse topic:right)
+	trees:3
+	BLOB:a:$(git rev-parse base~1:a):UNINTERESTING
+	BLOB:left/b:$(git rev-parse base~1:left/b):UNINTERESTING
+	BLOB:right/c:$(git rev-parse topic:right/c)
+	BLOB:right/d:$(git rev-parse base~1:right/d):UNINTERESTING
+	BLOB:/tagged-blobs:$(git rev-parse refs/tags/blob-tag2^{})
+	blobs:5
+	TAG:/tags:$(git rev-parse fourth)
+	tags:1
+	EOF
+
+	test_cmp_sorted expect out
+'
+
 test_expect_success 'topic, not base, only blobs' '
 	test-tool path-walk --no-trees --no-commits \
 		-- topic --not base >out &&
@@ -230,10 +253,10 @@ test_expect_success 'topic, not base, only blobs' '
 	cat >expect <<-EOF &&
 	commits:0
 	trees:0
-	BLOB:a:$(git rev-parse topic:a)
-	BLOB:left/b:$(git rev-parse topic:left/b)
+	BLOB:a:$(git rev-parse base~1:a):UNINTERESTING
+	BLOB:left/b:$(git rev-parse base~1:left/b):UNINTERESTING
 	BLOB:right/c:$(git rev-parse topic:right/c)
-	BLOB:right/d:$(git rev-parse topic:right/d)
+	BLOB:right/d:$(git rev-parse base~1:right/d):UNINTERESTING
 	blobs:4
 	tags:0
 	EOF
@@ -265,7 +288,7 @@ test_expect_success 'topic, not base, only trees' '
 	cat >expect <<-EOF &&
 	commits:0
 	TREE::$(git rev-parse topic^{tree})
-	TREE:left/:$(git rev-parse topic:left)
+	TREE:left/:$(git rev-parse base~1:left):UNINTERESTING
 	TREE:right/:$(git rev-parse topic:right)
 	trees:3
 	blobs:0
@@ -280,19 +303,19 @@ test_expect_success 'topic, not base, boundary' '
 
 	cat >expect <<-EOF &&
 	COMMIT::$(git rev-parse topic)
-	COMMIT::$(git rev-parse base~1)
+	COMMIT::$(git rev-parse base~1):UNINTERESTING
 	commits:2
 	TREE::$(git rev-parse topic^{tree})
-	TREE::$(git rev-parse base~1^{tree})
-	TREE:left/:$(git rev-parse base~1:left)
+	TREE::$(git rev-parse base~1^{tree}):UNINTERESTING
+	TREE:left/:$(git rev-parse base~1:left):UNINTERESTING
 	TREE:right/:$(git rev-parse topic:right)
-	TREE:right/:$(git rev-parse base~1:right)
+	TREE:right/:$(git rev-parse base~1:right):UNINTERESTING
 	trees:5
-	BLOB:a:$(git rev-parse base~1:a)
-	BLOB:left/b:$(git rev-parse base~1:left/b)
-	BLOB:right/c:$(git rev-parse base~1:right/c)
+	BLOB:a:$(git rev-parse base~1:a):UNINTERESTING
+	BLOB:left/b:$(git rev-parse base~1:left/b):UNINTERESTING
+	BLOB:right/c:$(git rev-parse base~1:right/c):UNINTERESTING
 	BLOB:right/c:$(git rev-parse topic:right/c)
-	BLOB:right/d:$(git rev-parse base~1:right/d)
+	BLOB:right/d:$(git rev-parse base~1:right/d):UNINTERESTING
 	blobs:5
 	tags:0
 	EOF
@@ -300,6 +323,27 @@ test_expect_success 'topic, not base, boundary' '
 	test_cmp_sorted expect out
 '
 
+test_expect_success 'topic, not base, boundary with pruning' '
+	test-tool path-walk --prune -- --boundary topic --not base >out &&
+
+	cat >expect <<-EOF &&
+	COMMIT::$(git rev-parse topic)
+	COMMIT::$(git rev-parse base~1):UNINTERESTING
+	commits:2
+	TREE::$(git rev-parse topic^{tree})
+	TREE::$(git rev-parse base~1^{tree}):UNINTERESTING
+	TREE:right/:$(git rev-parse topic:right)
+	TREE:right/:$(git rev-parse base~1:right):UNINTERESTING
+	trees:4
+	BLOB:right/c:$(git rev-parse base~1:right/c):UNINTERESTING
+	BLOB:right/c:$(git rev-parse topic:right/c)
+	blobs:2
+	tags:0
+	EOF
+
+	test_cmp_sorted expect out
+'
+
 test_expect_success 'trees are reported exactly once' '
 	test_when_finished "rm -rf unique-trees" &&
 	test_create_repo unique-trees &&
@@ -307,15 +351,12 @@ test_expect_success 'trees are reported exactly once' '
 		cd unique-trees &&
 		mkdir initial &&
 		test_commit initial/file &&
-
 		git switch -c move-to-top &&
 		git mv initial/file.t ./ &&
 		test_tick &&
 		git commit -m moved &&
-
 		git update-ref refs/heads/other HEAD
 	) &&
-
 	test-tool -C unique-trees path-walk -- --all >out &&
 	tree=$(git -C unique-trees rev-parse HEAD:) &&
 	grep "$tree" out >out-filtered &&

From 5b107fa1a3daa1a1ad73476a788bac1a2050e4b8 Mon Sep 17 00:00:00 2001
From: Taylor Blau <me@ttaylorr.com>
Date: Fri, 1 Nov 2024 15:39:38 -0400
Subject: [PATCH 68/69] ###


From 8f346e1ba802d029f1229ed9fcdd82b6c13a7cff Mon Sep 17 00:00:00 2001
From: Taylor Blau <me@ttaylorr.com>
Date: Fri, 1 Nov 2024 15:40:20 -0400
Subject: [PATCH 69/69] ### match next