Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 43 additions & 3 deletions cache-tree.c
Original file line number Diff line number Diff line change
Expand Up @@ -192,22 +192,62 @@ static int verify_cache(struct index_state *istate, int flags)
for (i = 0; i + 1 < istate->cache_nr; i++) {
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Junio C Hamano wrote on the Git mailing list (how to reply to this email):

"Elijah Newren via GitGitGadget" <gitgitgadget@gmail.com> writes:

> I could not find any caller in current git that both allows the index to
> get into this state and then tries to write it out without doing other
> checks beyond the verify_cache() call in cache_tree_update(), but
> verify_cache() is documented as a safety net for preventing corrupt
> trees and should actually provide that guarantee.

Oh, absolutely.  This kind of tightening is very much appreciated.

> diff --git a/cache-tree.c b/cache-tree.c
> index 7881b42aa2..f11844fe72 100644
> --- a/cache-tree.c
> +++ b/cache-tree.c
> @@ -192,22 +192,62 @@ static int verify_cache(struct index_state *istate, int flags)
>  	for (i = 0; i + 1 < istate->cache_nr; i++) {
>  		/* path/file always comes after path because of the way
>  		 * the cache is sorted.  Also path can appear only once,
> -		 * which means conflicting one would immediately follow.
> +		 * so path/file is likely the immediately following path
> +		 * but might be separated if there is e.g. a
> +		 * path-internal/... file.
>  		 */
>  		const struct cache_entry *this_ce = istate->cache[i];
>  		const struct cache_entry *next_ce = istate->cache[i + 1];
>  		const char *this_name = this_ce->name;
>  		const char *next_name = next_ce->name;
>  		int this_len = ce_namelen(this_ce);
> +		const char *conflict_name = NULL;
> +
>  		if (this_len < ce_namelen(next_ce) &&
> -		    next_name[this_len] == '/' &&
> +		    next_name[this_len] <= '/' &&
>  		    strncmp(this_name, next_name, this_len) == 0) {
> +			if (next_name[this_len] == '/') {
> +				conflict_name = next_name;
> +			} else if (next_name[this_len] < '/') {
> +				/*
> +				 * The immediately next entry shares our
> +				 * prefix but sorts before "path/" (e.g.,
> +				 * "path-internal" between "path" and
> +				 * "path/file", since '-' (0x2D) < '/'
> +				 * (0x2F)).  Binary search to find where
> +				 * "path/" would be and check for a D/F
> +				 * conflict there.
> +				 */
> +				struct cache_entry *other;
> +				struct strbuf probe = STRBUF_INIT;
> +				int pos;
> +
> +				strbuf_add(&probe, this_name, this_len);
> +				strbuf_addch(&probe, '/');
> +				pos = index_name_pos_sparse(istate,
> +							    probe.buf,
> +							    probe.len);
> +				strbuf_release(&probe);
> +
> +				if (pos < 0)
> +					pos = -pos - 1;
> +				if (pos >= (int)istate->cache_nr)
> +					continue;
> +				other = istate->cache[pos];
> +				if (ce_namelen(other) > this_len &&
> +				    other->name[this_len] == '/' &&
> +				    !strncmp(this_name, other->name, this_len))
> +					conflict_name = other->name;
> +			}
> +		}

The narrow and tall comment block is a sign that this loop is
getting too deeply nested.  I wonder if it makes it easier to follow
if we extract this new logic into a small helper function on its
own?

What the code checks and how it does so both make sense to me, though.

Thanks.

/* path/file always comes after path because of the way
* the cache is sorted. Also path can appear only once,
* which means conflicting one would immediately follow.
* so path/file is likely the immediately following path
* but might be separated if there is e.g. a
* path-internal/... file.
*/
const struct cache_entry *this_ce = istate->cache[i];
const struct cache_entry *next_ce = istate->cache[i + 1];
const char *this_name = this_ce->name;
const char *next_name = next_ce->name;
int this_len = ce_namelen(this_ce);
const char *conflict_name = NULL;

if (this_len < ce_namelen(next_ce) &&
next_name[this_len] == '/' &&
next_name[this_len] <= '/' &&
strncmp(this_name, next_name, this_len) == 0) {
if (next_name[this_len] == '/') {
conflict_name = next_name;
} else if (next_name[this_len] < '/') {
/*
* The immediately next entry shares our
* prefix but sorts before "path/" (e.g.,
* "path-internal" between "path" and
* "path/file", since '-' (0x2D) < '/'
* (0x2F)). Binary search to find where
* "path/" would be and check for a D/F
* conflict there.
*/
struct cache_entry *other;
struct strbuf probe = STRBUF_INIT;
int pos;

strbuf_add(&probe, this_name, this_len);
strbuf_addch(&probe, '/');
pos = index_name_pos_sparse(istate,
probe.buf,
probe.len);
strbuf_release(&probe);

if (pos < 0)
pos = -pos - 1;
if (pos >= (int)istate->cache_nr)
continue;
other = istate->cache[pos];
if (ce_namelen(other) > this_len &&
other->name[this_len] == '/' &&
!strncmp(this_name, other->name, this_len))
conflict_name = other->name;
}
}

if (conflict_name) {
if (10 < ++funny) {
fprintf(stderr, "...\n");
break;
}
fprintf(stderr, "You have both %s and %s\n",
this_name, next_name);
this_name, conflict_name);
}
}
if (funny)
Expand Down
78 changes: 44 additions & 34 deletions merge-ort.c
Original file line number Diff line number Diff line change
Expand Up @@ -728,6 +728,8 @@ static void clear_or_reinit_internal_opts(struct merge_options_internal *opti,
strintmap_clear_func(&renames->deferred[i].possible_trivial_merges);
strset_clear_func(&renames->deferred[i].target_dirs);
renames->deferred[i].trivial_merges_okay = 1; /* 1 == maybe */
free(renames->pairs[i].queue);
diff_queue_init(&renames->pairs[i]);
}
renames->cached_pairs_valid_side = 0;
renames->dir_rename_mask = 0;
Expand Down Expand Up @@ -1008,32 +1010,34 @@ static int traverse_trees_wrapper(struct index_state *istate,
info->traverse_path = renames->callback_data_traverse_path;
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Junio C Hamano wrote on the Git mailing list (how to reply to this email):

"Elijah Newren via GitGitGadget" <gitgitgadget@gmail.com> writes:

> From: Elijah Newren <newren@gmail.com>
>
> traverse_trees_wrapper() saves entries from a first pass through
> traverse_trees() and then replays them through the real callback
> (collect_merge_info_callback).  However, the replay loop silently
> discards the callback return value.  This means any error reported by
> the callback during replay -- including a future check for malformed
> trees -- would be ignored, allowing the merge to proceed with corrupt
> state.
>
> Capture the return value, stop the loop on negative (error) returns,
> and propagate the error to the caller.  Note that the callback returns
> a positive mask value on success, so we normalize non-negative returns
> to 0 for the caller.

All makes perfect sense.

How would the externally visible behaviour change at this step?

Upon an error from the callback, we used to keep going and processed
other callback data in the renames structure.  We now leave the rest
unprocessed.

The caller of this helper would never have seen a failure, but now
they will.  Both callers, collect_merge_info_callback() and
handle_deferred_entries(), are reacting to a negative "error" return
well (perhaps because they sometimes call traverse_trees() in the
same control flow, which does return an error already), so
presumably there is no downside caused by aborting the innermost
process upon the first error return.



> Signed-off-by: Elijah Newren <newren@gmail.com>
> ---
>  merge-ort.c | 14 ++++++++------
>  1 file changed, 8 insertions(+), 6 deletions(-)
>
> diff --git a/merge-ort.c b/merge-ort.c
> index 00923ce3cd..4b8e32209d 100644
> --- a/merge-ort.c
> +++ b/merge-ort.c
> @@ -1008,18 +1008,20 @@ static int traverse_trees_wrapper(struct index_state *istate,
>  	info->traverse_path = renames->callback_data_traverse_path;
>  	info->fn = old_fn;
>  	for (i = old_offset; i < renames->callback_data_nr; ++i) {
> -		info->fn(n,
> -			 renames->callback_data[i].mask,
> -			 renames->callback_data[i].dirmask,
> -			 renames->callback_data[i].names,
> -			 info);
> +		ret = info->fn(n,
> +			       renames->callback_data[i].mask,
> +			       renames->callback_data[i].dirmask,
> +			       renames->callback_data[i].names,
> +			       info);
> +		if (ret < 0)
> +			break;
>  	}
>  
>  	renames->callback_data_nr = old_offset;
>  	free(renames->callback_data_traverse_path);
>  	renames->callback_data_traverse_path = old_callback_data_traverse_path;
>  	info->traverse_path = NULL;
> -	return 0;
> +	return ret < 0 ? ret : 0;
>  }
>  
>  static void setup_path_info(struct merge_options *opt,

info->fn = old_fn;
for (i = old_offset; i < renames->callback_data_nr; ++i) {
info->fn(n,
renames->callback_data[i].mask,
renames->callback_data[i].dirmask,
renames->callback_data[i].names,
info);
ret = info->fn(n,
renames->callback_data[i].mask,
renames->callback_data[i].dirmask,
renames->callback_data[i].names,
info);
if (ret < 0)
break;
}

renames->callback_data_nr = old_offset;
free(renames->callback_data_traverse_path);
renames->callback_data_traverse_path = old_callback_data_traverse_path;
info->traverse_path = NULL;
return 0;
return ret < 0 ? ret : 0;
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Junio C Hamano wrote on the Git mailing list (how to reply to this email):

"Elijah Newren via GitGitGadget" <gitgitgadget@gmail.com> writes:

> From: Elijah Newren <newren@gmail.com>
>
> Trees with duplicate entries are malformed; fsck reports "contains
> duplicate file entries" for them.  merge-ort has from the beginning
> assumed that we would never hit such trees.  It was written with the
> assumption that traverse_trees() calls collect_merge_info_callback() at
> most once per path.  The "sanity checks" in that callback (added in
> d2bc1994f363 (merge-ort: implement a very basic collect_merge_info(),
> 2020-12-13)) verify properties of each individual call but not that
> invariant.  The strmap_put() in setup_path_info() silently overwrites
> the entry from any prior call for the same path, because it assumed
> there would be no other path.  Unfortunately, supplemental data
> structures for various optimizations could still be tweaked before the
> extra paths were overwritten, and those data structures not matching
> expected state could trip various assertions.
>
> Change the return type of setup_path_info() from void to int to allow us
> to detect this case, and abort the merge with a clear error message when
> it occurs.

OK.

> @@ -1081,9 +1081,11 @@ static void setup_path_info(struct merge_options *opt,
>  			 */
>  			mi->is_null = 1;
>  	}
> -	strmap_put(&opt->priv->paths, fullpath, mi);
> +	if (strmap_put(&opt->priv->paths, fullpath, mi))
> +		return error(_("tree has duplicate entries for '%s'"), fullpath);

OK.  I was wondering what _other_ kind of malformed trees would the
updated code by this change is prepared to handle (most notably,
tree entries must be sorted, and one way to detect duplicate is to
remember one single path that we saw earlier, which would work as
long as the entries are sorted).  This "ah, we saw that path already"
approach is much more robust in that it does not have to depend on a
sorted tree.

Makes sense.

}

static void setup_path_info(struct merge_options *opt,
struct string_list_item *result,
const char *current_dir_name,
int current_dir_name_len,
char *fullpath, /* we'll take over ownership */
struct name_entry *names,
struct name_entry *merged_version,
unsigned is_null, /* boolean */
unsigned df_conflict, /* boolean */
unsigned filemask,
unsigned dirmask,
int resolved /* boolean */)
static int setup_path_info(struct merge_options *opt,
struct string_list_item *result,
const char *current_dir_name,
int current_dir_name_len,
char *fullpath, /* we'll take over ownership */
struct name_entry *names,
struct name_entry *merged_version,
unsigned is_null, /* boolean */
unsigned df_conflict, /* boolean */
unsigned filemask,
unsigned dirmask,
int resolved /* boolean */)
{
/* result->util is void*, so mi is a convenience typed variable */
struct merged_info *mi;
Expand Down Expand Up @@ -1077,9 +1081,11 @@ static void setup_path_info(struct merge_options *opt,
*/
mi->is_null = 1;
}
strmap_put(&opt->priv->paths, fullpath, mi);
if (strmap_put(&opt->priv->paths, fullpath, mi))
return error(_("tree has duplicate entries for '%s'"), fullpath);
result->string = fullpath;
result->util = mi;
return 0;
}

static void add_pair(struct merge_options *opt,
Expand Down Expand Up @@ -1346,9 +1352,10 @@ static int collect_merge_info_callback(int n,
*/
if (side1_matches_mbase && side2_matches_mbase) {
/* mbase, side1, & side2 all match; use mbase as resolution */
setup_path_info(opt, &pi, dirname, info->pathlen, fullpath,
names, names+0, mbase_null, 0 /* df_conflict */,
filemask, dirmask, 1 /* resolved */);
if (setup_path_info(opt, &pi, dirname, info->pathlen, fullpath,
names, names+0, mbase_null, 0 /* df_conflict */,
filemask, dirmask, 1 /* resolved */))
return -1; /* Quit traversing */
return mask;
}

Expand All @@ -1360,9 +1367,10 @@ static int collect_merge_info_callback(int n,
*/
if (sides_match && filemask == 0x07) {
/* use side1 (== side2) version as resolution */
setup_path_info(opt, &pi, dirname, info->pathlen, fullpath,
names, names+1, side1_null, 0,
filemask, dirmask, 1);
if (setup_path_info(opt, &pi, dirname, info->pathlen, fullpath,
names, names+1, side1_null, 0,
filemask, dirmask, 1))
return -1; /* Quit traversing */
return mask;
}

Expand All @@ -1374,18 +1382,20 @@ static int collect_merge_info_callback(int n,
*/
if (side1_matches_mbase && filemask == 0x07) {
/* use side2 version as resolution */
setup_path_info(opt, &pi, dirname, info->pathlen, fullpath,
names, names+2, side2_null, 0,
filemask, dirmask, 1);
if (setup_path_info(opt, &pi, dirname, info->pathlen, fullpath,
names, names+2, side2_null, 0,
filemask, dirmask, 1))
return -1; /* Quit traversing */
return mask;
}

/* Similar to above but swapping sides 1 and 2 */
if (side2_matches_mbase && filemask == 0x07) {
/* use side1 version as resolution */
setup_path_info(opt, &pi, dirname, info->pathlen, fullpath,
names, names+1, side1_null, 0,
filemask, dirmask, 1);
if (setup_path_info(opt, &pi, dirname, info->pathlen, fullpath,
names, names+1, side1_null, 0,
filemask, dirmask, 1))
return -1; /* Quit traversing */
return mask;
}

Expand All @@ -1409,8 +1419,9 @@ static int collect_merge_info_callback(int n,
* unconflict some more cases, but that comes later so all we can
* do now is record the different non-null file hashes.)
*/
setup_path_info(opt, &pi, dirname, info->pathlen, fullpath,
names, NULL, 0, df_conflict, filemask, dirmask, 0);
if (setup_path_info(opt, &pi, dirname, info->pathlen, fullpath,
names, NULL, 0, df_conflict, filemask, dirmask, 0))
return -1; /* Quit traversing */

ci = pi.util;
VERIFY_CI(ci);
Expand Down Expand Up @@ -1738,7 +1749,6 @@ static int collect_merge_info(struct merge_options *opt,
setup_traverse_info(&info, opt->priv->toplevel_dir);
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Junio C Hamano wrote on the Git mailing list (how to reply to this email):

"Elijah Newren via GitGitGadget" <gitgitgadget@gmail.com> writes:

> From: Elijah Newren <newren@gmail.com>
>
> collect_merge_info() has set info.show_all_errors = 1 since
> d2bc1994f363 (merge-ort: implement a very basic collect_merge_info(),
> 2020-12-13).  This setting was copied from unpack-trees.c where it
> controls batching of error messages for porcelain display, but
> merge-ort has no such error-batching logic and never needed it.
>
> With show_all_errors set, traverse_trees() captures a negative callback
> return but continues processing remaining entries rather than stopping
> immediately.  Removing the setting restores the default behavior where
> a negative return from collect_merge_info_callback() breaks out of the
> traversal loop right away, allowing a future commit to exit early when
> a corrupt tree is detected.

Nice spotting.  As the error handling eventually is to die without
making any further damange, returning early without seeing "more
errors" is a good change.

>
> Signed-off-by: Elijah Newren <newren@gmail.com>
> ---
>  merge-ort.c | 1 -
>  1 file changed, 1 deletion(-)
>
> diff --git a/merge-ort.c b/merge-ort.c
> index 4b8e32209d..74e9636020 100644
> --- a/merge-ort.c
> +++ b/merge-ort.c
> @@ -1740,7 +1740,6 @@ static int collect_merge_info(struct merge_options *opt,
>  	setup_traverse_info(&info, opt->priv->toplevel_dir);
>  	info.fn = collect_merge_info_callback;
>  	info.data = opt;
> -	info.show_all_errors = 1;
>  
>  	if (repo_parse_tree(opt->repo, merge_base) < 0 ||
>  	    repo_parse_tree(opt->repo, side1) < 0 ||

info.fn = collect_merge_info_callback;
info.data = opt;
info.show_all_errors = 1;

if (repo_parse_tree(opt->repo, merge_base) < 0 ||
repo_parse_tree(opt->repo, side1) < 0 ||
Expand Down
1 change: 1 addition & 0 deletions t/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ integration_tests = [
't0090-cache-tree.sh',
't0091-bugreport.sh',
't0092-diagnose.sh',
't0093-verify-cache-df-gap.sh',
't0095-bloom.sh',
't0100-previous.sh',
't0101-at-syntax.sh',
Expand Down
38 changes: 38 additions & 0 deletions t/t0093-direct-index-write.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#!/usr/bin/perl
#
# Build a v2 index file from entries listed on stdin.
# Each line: "octalmode hex-oid name"
# Output: binary index written to stdout.
#
# This bypasses all D/F safety checks in add_index_entry(), simulating
# what happens when code uses ADD_CACHE_JUST_APPEND to bulk-load entries.
use strict;
use warnings;
use Digest::SHA qw(sha1 sha256);

my $hash_algo = $ENV{'GIT_DEFAULT_HASH'} || 'sha1';
my $hash_func = $hash_algo eq 'sha256' ? \&sha256 : \&sha1;

my @entries;
while (my $line = <STDIN>) {
chomp $line;
my ($mode, $oid_hex, $name) = split(/ /, $line, 3);
push @entries, [$mode, $oid_hex, $name];
}

my $body = "DIRC" . pack("NN", 2, scalar @entries);

for my $ent (@entries) {
my ($mode, $oid_hex, $name) = @{$ent};
# 10 x 32-bit stat fields (zeroed), with mode in position 7
my $stat = pack("N10", 0, 0, 0, 0, 0, 0, oct($mode), 0, 0, 0);
my $oid = pack("H*", $oid_hex);
my $flags = pack("n", length($name) & 0xFFF);
my $entry = $stat . $oid . $flags . $name . "\0";
# Pad to 8-byte boundary
while (length($entry) % 8) { $entry .= "\0"; }
$body .= $entry;
}

binmode STDOUT;
print $body . $hash_func->($body);
59 changes: 59 additions & 0 deletions t/t0093-verify-cache-df-gap.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
#!/bin/sh

test_description='verify_cache() must catch non-adjacent D/F conflicts

Ensure that verify_cache() can complain about bad entries like:

docs <-- submodule
docs-internal/... <-- sorts here because "-" < "/"
docs/... <-- D/F conflict with "docs" above, not adjacent

In order to test verify_cache, we directly construct a corrupt index
(bypassing the D/F safety checks in add_index_entry) and verify that
write-tree rejects it.
'

. ./test-lib.sh

if ! test_have_prereq PERL
then
skip_all='skipping verify_cache D/F tests; Perl not available'
test_done
fi

# Build a v2 index from entries on stdin, bypassing D/F checks.
# Each line: "octalmode hex-oid name" (entries must be pre-sorted).
build_corrupt_index () {
perl "$TEST_DIRECTORY/t0093-direct-index-write.pl" >"$1"
}

test_expect_success 'setup objects' '
test_commit base &&
BLOB=$(git rev-parse HEAD:base.t) &&
SUB_COMMIT=$(git rev-parse HEAD)
'

test_expect_success 'adjacent D/F conflict is caught by verify_cache' '
cat >index-entries <<-EOF &&
0160000 $SUB_COMMIT docs
0100644 $BLOB docs/requirements.txt
EOF
build_corrupt_index .git/index <index-entries &&

test_must_fail git write-tree 2>err &&
test_grep "You have both docs and docs/requirements.txt" err
'

test_expect_success 'non-adjacent D/F conflict is caught by verify_cache' '
cat >index-entries <<-EOF &&
0160000 $SUB_COMMIT docs
0100644 $BLOB docs-internal/README.md
0100644 $BLOB docs/requirements.txt
EOF
build_corrupt_index .git/index <index-entries &&

test_must_fail git write-tree 2>err &&
test_grep "You have both docs and docs/requirements.txt" err
'

test_done
54 changes: 54 additions & 0 deletions t/t6422-merge-rename-corner-cases.sh
Original file line number Diff line number Diff line change
Expand Up @@ -1525,4 +1525,58 @@ test_expect_success 'submodule/directory preliminary conflict' '
)
'

# Testcase: submodule/directory conflict with duplicate tree entries
# One side has a path as a gitlink (submodule). The other side replaces
# the gitlink with a directory. A third-party tool creates a tree on the
# submodule side that has *both* a gitlink and a tree entry for the same
# path (adding a file inside the submodule path ignoring that there's a
# gitlink there). collect_merge_info_callback() should detect the
# duplicate and abort rather than silently corrupting its bookkeeping.

test_expect_success 'duplicate tree entries trigger an error' '
test_when_finished "rm -rf duplicate-entry" &&
git init duplicate-entry &&
(
cd duplicate-entry &&

# Base commit: "docs" is a gitlink (submodule)
empty_tree=$(git mktree </dev/null) &&
fake_commit=$(git commit-tree $empty_tree </dev/null) &&
git update-index --add --cacheinfo 160000,$fake_commit,docs &&
echo base >file.txt &&
git add file.txt &&
git commit -m base &&

# side1: remove the gitlink, replace with a directory
git checkout -b side1 &&
git rm --cached docs &&
mkdir -p docs &&
echo hello >docs/requirements.txt &&
git add docs/requirements.txt &&
git commit -m "side1: submodule to directory" &&

# side2: keep the gitlink but craft a tree that also
# contains a tree entry for "docs" (simulating a tool
# that adds files inside a submodule path without
# removing the gitlink first).
git checkout main &&
git checkout -b side2 &&
blob_oid=$(echo world | git hash-object -w --stdin) &&
docs_tree=$(printf "100644 blob %s\trequirements.txt\n" \
"$blob_oid" | git mktree) &&
cur_tree=$(git rev-parse HEAD^{tree}) &&
git cat-file -p $cur_tree >tree-listing &&
printf "040000 tree %s\tdocs\n" "$docs_tree" >>tree-listing &&
new_tree=$(git mktree <tree-listing) &&
side2_commit=$(git commit-tree $new_tree -p HEAD \
-m "side2: add file alongside submodule") &&
git update-ref refs/heads/side2 $side2_commit &&

# Merging must detect the duplicate and abort
git checkout side1 &&
test_must_fail git merge side2 2>err &&
test_grep "duplicate entries" err
)
'

test_done
Loading