2023-08-16 16:54:33 -04:00
|
|
|
// SPDX-License-Identifier: GPL-2.0
|
|
|
|
|
|
|
|
#include "bcachefs.h"
|
2025-05-01 14:47:39 -04:00
|
|
|
#include "bbpos.h"
|
bcachefs: bch2_propagate_key_to_snapshot_leaves()
If fsck finds a key that needs work done, the primary example being an
unlinked inode that needs to be deleted, and the key is in an internal
snapshot node, we have a bit of a conundrum.
The conundrum is that internal snapshot nodes are shared, and we in
general do updates in internal snapshot nodes because there may be
overwrites in some snapshots and not others, and this may affect other
keys referenced by this key (i.e. extents).
For example, we might be seeing an unlinked inode in an internal
snapshot node, but then in one child snapshot the inode might have been
reattached and might not be unlinked. Deleting the inode in the internal
snapshot node would be wrong, because then we'll delete all the extents
that the child snapshot references.
But if an unlinked inode does not have any overwrites in child
snapshots, we're fine: the inode is overwritten in all child snapshots,
so we can do the deletion at the point of commonality in the snapshot
tree, i.e. the node where we found it.
This patch adds a new helper, bch2_propagate_key_to_snapshot_leaves(),
to handle the case where we need to update a key that does have
overwrites in child snapshots: we copy the key to leaf snapshot nodes,
and then rewind fsck and process the needed updates there.
With this, fsck can now always correctly handle unlinked inodes found in
internal snapshot nodes.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2023-08-18 21:14:33 -04:00
|
|
|
#include "bkey_buf.h"
|
2024-12-08 22:30:19 -05:00
|
|
|
#include "btree_cache.h"
|
2023-08-16 16:54:33 -04:00
|
|
|
#include "btree_key_cache.h"
|
|
|
|
#include "btree_update.h"
|
bcachefs: Cleanup redundant snapshot nodes
After deleting snapshots, we may be left with a snapshot tree where
some nodes only have one child, and we have a linear chain.
Interior snapshot nodes are never used directly (i.e. they never have
subvolumes that point to them), they are only referred to by child
snapshot nodes - hence, they are redundant.
The existing code talks about redundant snapshot nodes as forming an
equivalence class; i.e. nodes for which snapshot_t->equiv is equal. In a
given equivalence class, we only ever need a single key at a given
position - i.e. multiple versions with different snapshot fields are
redundant.
The existing snapshot cleanup code deletes these redundant keys, but not
redundant nodes. It turns out this is buggy, because we assume that
after snapshot deletion finishes we should only have a single key per
equivalence class, but the btree update path doesn't preserve this -
overwriting keys in old snapshots doesn't check for the equivalence
class being equal, and thus we can end up with duplicate keys in the
same equivalence class and fsck complaining about snapshot deletion not
having run correctly.
The equivalence class notion has been leaking out of the core snapshots
code and into too much other code, i.e. fsck, so this patch takes a
different approach: snapshot deletion now moves keys to the node in an
equivalence class being kept (the leafiest node) and then deletes the
redundant nodes in the equivalence class.
Some work has to be done to correctly delete interior snapshot nodes;
snapshot node depth and skiplist fields for descendent nodes have to be
fixed.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2023-08-17 22:10:02 -04:00
|
|
|
#include "buckets.h"
|
2025-04-18 14:56:09 -04:00
|
|
|
#include "enumerated_ref.h"
|
2023-08-16 16:54:33 -04:00
|
|
|
#include "errcode.h"
|
|
|
|
#include "error.h"
|
|
|
|
#include "fs.h"
|
2024-03-27 22:50:19 -04:00
|
|
|
#include "recovery_passes.h"
|
2023-08-16 16:54:33 -04:00
|
|
|
#include "snapshot.h"
|
|
|
|
|
|
|
|
#include <linux/random.h>
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Snapshot trees:
|
|
|
|
*
|
|
|
|
* Keys in BTREE_ID_snapshot_trees identify a whole tree of snapshot nodes; they
|
|
|
|
* exist to provide a stable identifier for the whole lifetime of a snapshot
|
|
|
|
* tree.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* Print a snapshot_tree key (master subvol + root snapshot id) to @out. */
void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c,
				struct bkey_s_c k)
{
	struct bkey_s_c_snapshot_tree t = bkey_s_c_to_snapshot_tree(k);

	prt_printf(out, "subvol %u root snapshot %u",
		   le32_to_cpu(t.v->master_subvol),
		   le32_to_cpu(t.v->root_snapshot));
}
|
|
|
|
|
2024-08-12 21:31:25 -04:00
|
|
|
/*
 * Validate a snapshot_tree key: the only invariant checked here is that the
 * key's position lies in [POS(0, 1), POS(0, U32_MAX)].
 */
int bch2_snapshot_tree_validate(struct bch_fs *c, struct bkey_s_c k,
				struct bkey_validate_context from)
{
	int ret = 0;

	bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) ||
			 bkey_lt(k.k->p, POS(0, 1)),
			 c, snapshot_tree_pos_bad,
			 "bad pos");
fsck_err:
	return ret;
}
|
|
|
|
|
|
|
|
/*
 * Look up the snapshot_tree key at @id, copying its value into @s.
 * A generic ENOENT is converted to the more specific ENOENT_snapshot_tree.
 */
int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id,
			      struct bch_snapshot_tree *s)
{
	int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshot_trees, POS(0, id),
					  BTREE_ITER_with_updates, snapshot_tree, s);

	if (bch2_err_matches(ret, ENOENT))
		ret = bch_err_throw(trans->c, ENOENT_snapshot_tree);
	return ret;
}
|
|
|
|
|
|
|
|
/*
 * Allocate a new snapshot_tree key in the first empty slot at or below
 * U32_MAX; returns the update-reserved key, or an ERR_PTR on failure
 * (ENOSPC_btree_slot is translated to ENOSPC_snapshot_tree).
 */
struct bkey_i_snapshot_tree *
__bch2_snapshot_tree_create(struct btree_trans *trans)
{
	struct btree_iter iter;
	int ret = bch2_bkey_get_empty_slot(trans, &iter,
			BTREE_ID_snapshot_trees, POS(0, U32_MAX));
	struct bkey_i_snapshot_tree *s_t;

	if (ret == -BCH_ERR_ENOSPC_btree_slot)
		ret = bch_err_throw(trans->c, ENOSPC_snapshot_tree);
	if (ret)
		return ERR_PTR(ret);

	s_t = bch2_bkey_alloc(trans, &iter, 0, snapshot_tree);
	ret = PTR_ERR_OR_ZERO(s_t);
	bch2_trans_iter_exit(trans, &iter);
	return ret ? ERR_PTR(ret) : s_t;
}
|
|
|
|
|
|
|
|
/*
 * Create a new snapshot_tree key pointing at @root_id/@subvol_id;
 * the id of the new key is returned via @tree_id.
 */
static int bch2_snapshot_tree_create(struct btree_trans *trans,
				u32 root_id, u32 subvol_id, u32 *tree_id)
{
	struct bkey_i_snapshot_tree *n_tree =
		__bch2_snapshot_tree_create(trans);

	if (IS_ERR(n_tree))
		return PTR_ERR(n_tree);

	n_tree->v.master_subvol	= cpu_to_le32(subvol_id);
	n_tree->v.root_snapshot	= cpu_to_le32(root_id);
	*tree_id = n_tree->k.p.offset;
	return 0;
}
|
|
|
|
|
|
|
|
/* Snapshot nodes: */
|
|
|
|
|
2024-03-17 20:39:42 -04:00
|
|
|
/*
 * Slow-path ancestor check: walk parent pointers one at a time.
 * Works before the skiplist/bitmap fields have been validated
 * (snapshot ids decrease from child to parent, hence id < ancestor).
 */
static bool __bch2_snapshot_is_ancestor_early(struct snapshot_table *t, u32 id, u32 ancestor)
{
	while (id && id < ancestor) {
		const struct snapshot_t *s = __snapshot_t(t, id);
		/* missing table entry terminates the walk */
		id = s ? s->parent : 0;
	}
	return id == ancestor;
}
|
|
|
|
|
|
|
|
/* RCU-protected wrapper around the slow parent-walk ancestor check. */
static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor)
{
	guard(rcu)();
	return __bch2_snapshot_is_ancestor_early(rcu_dereference(c->snapshots), id, ancestor);
}
|
|
|
|
|
2023-08-16 16:54:33 -04:00
|
|
|
/*
 * Advance @id one hop towards @ancestor, taking the longest skiplist
 * hop that does not overshoot; falls back to the parent pointer.
 * Returns 0 if @id has no table entry.
 */
static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor)
{
	const struct snapshot_t *s = __snapshot_t(t, id);

	if (!s)
		return 0;

	/* skip[] is sorted ascending, so try the farthest hop first: */
	for (unsigned i = ARRAY_SIZE(s->skip); i--;)
		if (s->skip[i] <= ancestor)
			return s->skip[i];

	return s->parent;
}
|
|
|
|
|
2024-04-04 15:50:26 -04:00
|
|
|
/*
 * Final-hop ancestor check: once @ancestor is within IS_ANCESTOR_BITMAP
 * of @id, consult the per-node is_ancestor bitmap directly.
 */
static bool test_ancestor_bitmap(struct snapshot_table *t, u32 id, u32 ancestor)
{
	const struct snapshot_t *s = __snapshot_t(t, id);

	return s && test_bit(ancestor - id - 1, s->is_ancestor);
}
|
|
|
|
|
2023-08-16 16:54:33 -04:00
|
|
|
/*
 * Fast ancestor check: hop towards @ancestor via the skiplist, then
 * finish with the is_ancestor bitmap.  Before check_snapshots has run
 * the skiplist/bitmap can't be trusted, so fall back to the slow
 * parent-pointer walk.  Runs under RCU; no locks taken.
 */
bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
{
#ifdef CONFIG_BCACHEFS_DEBUG
	u32 orig_id = id;
#endif

	guard(rcu)();
	struct snapshot_table *t = rcu_dereference(c->snapshots);

	if (unlikely(c->recovery.pass_done < BCH_RECOVERY_PASS_check_snapshots))
		return __bch2_snapshot_is_ancestor_early(t, id, ancestor);

	/* hop until @ancestor is within bitmap range of @id: */
	if (likely(ancestor >= IS_ANCESTOR_BITMAP))
		while (id && id < ancestor - IS_ANCESTOR_BITMAP)
			id = get_ancestor_below(t, id, ancestor);

	bool ret = id && id < ancestor
		? test_ancestor_bitmap(t, id, ancestor)
		: id == ancestor;

	/* debug builds cross-check the fast path against the slow walk: */
	EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, orig_id, ancestor));
	return ret;
}
|
|
|
|
|
|
|
|
/*
 * Grow the RCU-protected snapshot table so that it covers @id, copying
 * the old contents and publishing the new table with rcu_assign_pointer().
 * Caller must hold c->snapshot_table_lock (see lockdep_is_held below).
 * Returns NULL on allocation failure or if the table would exceed INT_MAX.
 *
 * Note: ids are allocated downward from U32_MAX, so the table is indexed
 * by U32_MAX - id.
 */
static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id)
{
	size_t idx = U32_MAX - id;
	struct snapshot_table *new, *old;

	size_t new_bytes = kmalloc_size_roundup(struct_size(new, s, idx + 1));
	size_t new_size = (new_bytes - sizeof(*new)) / sizeof(new->s[0]);

	if (unlikely(new_bytes > INT_MAX))
		return NULL;

	new = kvzalloc(new_bytes, GFP_KERNEL);
	if (!new)
		return NULL;

	new->nr = new_size;

	old = rcu_dereference_protected(c->snapshots, true);
	if (old)
		memcpy(new->s, old->s, sizeof(old->s[0]) * old->nr);

	rcu_assign_pointer(c->snapshots, new);
	/* old table freed after readers are done with it: */
	kvfree_rcu(old, rcu);

	return &rcu_dereference_protected(c->snapshots,
				lockdep_is_held(&c->snapshot_table_lock))->s[idx];
}
|
|
|
|
|
|
|
|
/*
 * Return a mutable pointer to the in-memory snapshot_t for @id, growing
 * the table if needed.  Caller must hold c->snapshot_table_lock.
 */
static inline struct snapshot_t *snapshot_t_mut(struct bch_fs *c, u32 id)
{
	size_t idx = U32_MAX - id;
	struct snapshot_table *table =
		rcu_dereference_protected(c->snapshots,
				lockdep_is_held(&c->snapshot_table_lock));

	lockdep_assert_held(&c->snapshot_table_lock);

	if (likely(table && idx < table->nr))
		return &table->s[idx];

	/* slow path: reallocate the table */
	return __snapshot_t_mut(c, id);
}
|
|
|
|
|
|
|
|
/*
 * Print a snapshot key: state flags, parent/children/subvol/tree ids,
 * and - if the key is new enough to carry them - depth and skiplist.
 */
void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c,
			   struct bkey_s_c k)
{
	struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);

	if (BCH_SNAPSHOT_SUBVOL(s.v))
		prt_str(out, "subvol ");
	if (BCH_SNAPSHOT_WILL_DELETE(s.v))
		prt_str(out, "will_delete ");
	if (BCH_SNAPSHOT_DELETED(s.v))
		prt_str(out, "deleted ");

	prt_printf(out, "parent %10u children %10u %10u subvol %u tree %u",
	       le32_to_cpu(s.v->parent),
	       le32_to_cpu(s.v->children[0]),
	       le32_to_cpu(s.v->children[1]),
	       le32_to_cpu(s.v->subvol),
	       le32_to_cpu(s.v->tree));

	/* depth/skiplist fields only exist on newer, larger values: */
	if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, depth))
		prt_printf(out, " depth %u skiplist %u %u %u",
			   le32_to_cpu(s.v->depth),
			   le32_to_cpu(s.v->skip[0]),
			   le32_to_cpu(s.v->skip[1]),
			   le32_to_cpu(s.v->skip[2]));
}
|
|
|
|
|
2024-08-12 21:31:25 -04:00
|
|
|
/*
 * Validate a snapshot key: position in range, parent id greater than our
 * own (ids are allocated downward), children normalized (descending) and
 * distinct, and - when present - skiplist entries normalized (ascending)
 * with each entry at or above the parent.
 */
int bch2_snapshot_validate(struct bch_fs *c, struct bkey_s_c k,
			   struct bkey_validate_context from)
{
	struct bkey_s_c_snapshot s;
	u32 i, id;
	int ret = 0;

	bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) ||
			 bkey_lt(k.k->p, POS(0, 1)),
			 c, snapshot_pos_bad,
			 "bad pos");

	s = bkey_s_c_to_snapshot(k);

	/* parents have higher ids than their children: */
	id = le32_to_cpu(s.v->parent);
	bkey_fsck_err_on(id && id <= k.k->p.offset,
			 c, snapshot_parent_bad,
			 "bad parent node (%u <= %llu)",
			 id, k.k->p.offset);

	bkey_fsck_err_on(le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1]),
			 c, snapshot_children_not_normalized,
			 "children not normalized");

	bkey_fsck_err_on(s.v->children[0] && s.v->children[0] == s.v->children[1],
			 c, snapshot_child_duplicate,
			 "duplicate child nodes");

	for (i = 0; i < 2; i++) {
		id = le32_to_cpu(s.v->children[i]);

		/* children must have lower ids than us: */
		bkey_fsck_err_on(id >= k.k->p.offset,
				 c, snapshot_child_bad,
				 "bad child node (%u >= %llu)",
				 id, k.k->p.offset);
	}

	/* skiplist fields only exist on newer, larger values: */
	if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, skip)) {
		bkey_fsck_err_on(le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) ||
				 le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2]),
				 c, snapshot_skiplist_not_normalized,
				 "skiplist not normalized");

		for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) {
			id = le32_to_cpu(s.v->skip[i]);

			bkey_fsck_err_on(id && id < le32_to_cpu(s.v->parent),
					 c, snapshot_skiplist_bad,
					 "bad skiplist node %u", id);
		}
	}
fsck_err:
	return ret;
}
|
|
|
|
|
2025-04-02 14:40:06 -04:00
|
|
|
/*
 * Ensure the in-memory snapshot table has an entry for @id, growing it
 * under snapshot_table_lock if needed; ENOMEM_mark_snapshot on failure.
 */
static int bch2_snapshot_table_make_room(struct bch_fs *c, u32 id)
{
	mutex_lock(&c->snapshot_table_lock);
	int ret = snapshot_t_mut(c, id)
		? 0
		: bch_err_throw(c, ENOMEM_mark_snapshot);
	mutex_unlock(&c->snapshot_table_lock);
	return ret;
}
|
|
|
|
|
2023-12-27 23:19:09 -05:00
|
|
|
/*
 * Trigger for snapshot keys: mirror the new key's value into the
 * in-memory snapshot table under snapshot_table_lock, rebuilding the
 * node's is_ancestor bitmap and kicking off dead-snapshot deletion if
 * the key is flagged for it.  A non-snapshot key (deletion/overwrite)
 * zeroes the table entry.
 */
static int __bch2_mark_snapshot(struct btree_trans *trans,
		       enum btree_id btree, unsigned level,
		       struct bkey_s_c old, struct bkey_s_c new,
		       enum btree_iter_update_trigger_flags flags)
{
	struct bch_fs *c = trans->c;
	struct snapshot_t *t;
	u32 id = new.k->p.offset;
	int ret = 0;

	mutex_lock(&c->snapshot_table_lock);

	t = snapshot_t_mut(c, id);
	if (!t) {
		ret = bch_err_throw(c, ENOMEM_mark_snapshot);
		goto err;
	}

	if (new.k->type == KEY_TYPE_snapshot) {
		struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new);

		t->state = !BCH_SNAPSHOT_DELETED(s.v)
			? SNAPSHOT_ID_live
			: SNAPSHOT_ID_deleted;
		t->parent	= le32_to_cpu(s.v->parent);
		t->children[0]	= le32_to_cpu(s.v->children[0]);
		t->children[1]	= le32_to_cpu(s.v->children[1]);
		t->subvol	= BCH_SNAPSHOT_SUBVOL(s.v) ? le32_to_cpu(s.v->subvol) : 0;
		t->tree		= le32_to_cpu(s.v->tree);

		/* depth/skiplist fields only exist on newer, larger values: */
		if (bkey_val_bytes(s.k) > offsetof(struct bch_snapshot, depth)) {
			t->depth	= le32_to_cpu(s.v->depth);
			t->skip[0]	= le32_to_cpu(s.v->skip[0]);
			t->skip[1]	= le32_to_cpu(s.v->skip[1]);
			t->skip[2]	= le32_to_cpu(s.v->skip[2]);
		} else {
			t->depth	= 0;
			t->skip[0]	= 0;
			t->skip[1]	= 0;
			t->skip[2]	= 0;
		}

		/* set a bit for each ancestor within bitmap range: */
		u32 parent = id;

		while ((parent = bch2_snapshot_parent_early(c, parent)) &&
		       parent - id - 1 < IS_ANCESTOR_BITMAP)
			__set_bit(parent - id - 1, t->is_ancestor);

		if (BCH_SNAPSHOT_WILL_DELETE(s.v)) {
			set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags);
			/* only kick off async deletion once recovery is past that pass: */
			if (c->recovery.pass_done > BCH_RECOVERY_PASS_delete_dead_snapshots)
				bch2_delete_dead_snapshots_async(c);
		}
	} else {
		memset(t, 0, sizeof(*t));
	}
err:
	mutex_unlock(&c->snapshot_table_lock);
	return ret;
}
|
|
|
|
|
2023-12-27 23:19:09 -05:00
|
|
|
/* Public trigger entry point; forwards to __bch2_mark_snapshot. */
int bch2_mark_snapshot(struct btree_trans *trans,
		       enum btree_id btree, unsigned level,
		       struct bkey_s_c old, struct bkey_s new,
		       enum btree_iter_update_trigger_flags flags)
{
	return __bch2_mark_snapshot(trans, btree, level, old, new.s_c, flags);
}
|
|
|
|
|
2023-08-16 16:54:33 -04:00
|
|
|
/* Look up the snapshot key at @id, copying its value into @s. */
int bch2_snapshot_lookup(struct btree_trans *trans, u32 id,
			 struct bch_snapshot *s)
{
	return bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, POS(0, id),
				       BTREE_ITER_with_updates, snapshot, s);
}
|
|
|
|
|
|
|
|
/* fsck: */
|
|
|
|
|
|
|
|
/* Return child @child (0 or 1) of @id from the in-memory snapshot table. */
static u32 bch2_snapshot_child(struct bch_fs *c, u32 id, unsigned child)
{
	return snapshot_t(c, id)->children[child];
}
|
|
|
|
|
|
|
|
/* Left child (children[0]) of @id, or 0 if none. */
static u32 bch2_snapshot_left_child(struct bch_fs *c, u32 id)
{
	return bch2_snapshot_child(c, id, 0);
}
|
|
|
|
|
|
|
|
/* Right child (children[1]) of @id, or 0 if none. */
static u32 bch2_snapshot_right_child(struct bch_fs *c, u32 id)
{
	return bch2_snapshot_child(c, id, 1);
}
|
|
|
|
|
|
|
|
/*
 * Pre-order traversal step within a snapshot tree: descend to the left
 * child if there is one, otherwise climb until we can take an untaken
 * right child.  Returns 0 when the traversal is exhausted.
 */
static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id)
{
	u32 n, parent;

	n = bch2_snapshot_left_child(c, id);
	if (n)
		return n;

	while ((parent = bch2_snapshot_parent(c, id))) {
		n = bch2_snapshot_right_child(c, parent);
		if (n && n != id)
			return n;
		id = parent;
	}

	return 0;
}
|
|
|
|
|
2025-05-19 09:48:50 -04:00
|
|
|
/*
 * Walk the snapshot tree rooted at @snapshot_root and return the lowest
 * (i.e. oldest) subvolume id found, skipping snapshot ids listed in
 * @skip.  If honouring @skip yields nothing, retry without it.
 * Returns 0 if no subvolume was found at all.  Runs under RCU.
 */
u32 bch2_snapshot_oldest_subvol(struct bch_fs *c, u32 snapshot_root,
				snapshot_id_list *skip)
{
	guard(rcu)();
	u32 id, subvol = 0, s;
retry:
	id = snapshot_root;
	while (id && bch2_snapshot_exists(c, id)) {
		if (!(skip && snapshot_list_has_id(skip, id))) {
			s = snapshot_t(c, id)->subvol;

			/* lowest subvol id == oldest subvolume: */
			if (s && (!subvol || s < subvol))
				subvol = s;
		}
		id = bch2_snapshot_tree_next(c, id);
		if (id == snapshot_root)
			break;
	}

	if (!subvol && skip) {
		skip = NULL;
		goto retry;
	}

	return subvol;
}
|
|
|
|
|
|
|
|
/*
 * Find a master (non-snapshot) subvolume inside the tree rooted at
 * @snapshot_root; if none exists, promote the oldest subvolume in the
 * tree by clearing its SNAP flag.  The chosen id is returned via
 * @subvol_id.
 */
static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans,
					    u32 snapshot_root, u32 *subvol_id)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	bool found = false;
	int ret;

	for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN,
				     0, k, ret) {
		if (k.k->type != KEY_TYPE_subvolume)
			continue;

		struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
		if (!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.v->snapshot), snapshot_root))
			continue;
		if (!BCH_SUBVOLUME_SNAP(s.v)) {
			*subvol_id = s.k->p.offset;
			found = true;
			break;
		}
	}
	bch2_trans_iter_exit(trans, &iter);

	if (!ret && !found) {
		struct bkey_i_subvolume *u;

		*subvol_id = bch2_snapshot_oldest_subvol(c, snapshot_root, NULL);

		/* promote it to a master subvolume: */
		u = bch2_bkey_get_mut_typed(trans, &iter,
					    BTREE_ID_subvolumes, POS(0, *subvol_id),
					    0, subvolume);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			return ret;

		SET_BCH_SUBVOLUME_SNAP(&u->v, false);
	}

	return ret;
}
|
|
|
|
|
|
|
|
/*
 * fsck one snapshot_tree key: verify the root snapshot it points at
 * exists, is a tree root, and points back at this tree - else delete the
 * key.  Then verify the master subvol exists, belongs to this tree and
 * is not a snapshot - else repair it via
 * bch2_snapshot_tree_master_subvol().
 */
static int check_snapshot_tree(struct btree_trans *trans,
			       struct btree_iter *iter,
			       struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct bkey_s_c_snapshot_tree st;
	struct bch_snapshot s;
	struct bch_subvolume subvol;
	struct printbuf buf = PRINTBUF;
	struct btree_iter snapshot_iter = {};
	u32 root_id;
	int ret;

	if (k.k->type != KEY_TYPE_snapshot_tree)
		return 0;

	st = bkey_s_c_to_snapshot_tree(k);
	root_id = le32_to_cpu(st.v->root_snapshot);

	struct bkey_s_c_snapshot snapshot_k =
		bch2_bkey_get_iter_typed(trans, &snapshot_iter, BTREE_ID_snapshots,
					 POS(0, root_id), 0, snapshot);
	ret = bkey_err(snapshot_k);
	if (ret && !bch2_err_matches(ret, ENOENT))
		goto err;

	if (!ret)
		bkey_val_copy(&s, snapshot_k);

	/* missing root, non-root snapshot, or tree backpointer mismatch: */
	if (fsck_err_on(ret ||
			root_id != bch2_snapshot_root(c, root_id) ||
			st.k->p.offset != le32_to_cpu(s.tree),
			trans, snapshot_tree_to_missing_snapshot,
			"snapshot tree points to missing/incorrect snapshot:\n%s",
			(bch2_bkey_val_to_text(&buf, c, st.s_c),
			 prt_newline(&buf),
			 ret
			 ? prt_printf(&buf, "(%s)", bch2_err_str(ret))
			 : bch2_bkey_val_to_text(&buf, c, snapshot_k.s_c),
			 buf.buf))) {
		ret = bch2_btree_delete_at(trans, iter, 0);
		goto err;
	}

	if (!st.v->master_subvol)
		goto out;

	ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol), false, &subvol);
	if (ret && !bch2_err_matches(ret, ENOENT))
		goto err;

	if (fsck_err_on(ret,
			trans, snapshot_tree_to_missing_subvol,
			"snapshot tree points to missing subvolume:\n%s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
	    fsck_err_on(!bch2_snapshot_is_ancestor(c,
						   le32_to_cpu(subvol.snapshot),
						   root_id),
			trans, snapshot_tree_to_wrong_subvol,
			"snapshot tree points to subvolume that does not point to snapshot in this tree:\n%s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
	    fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol),
			trans, snapshot_tree_to_snapshot_subvol,
			"snapshot tree points to snapshot subvolume:\n%s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
		struct bkey_i_snapshot_tree *u;
		u32 subvol_id;

		ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id);
		bch_err_fn(c, ret);

		if (bch2_err_matches(ret, ENOENT)) { /* nothing to be done here */
			ret = 0;
			goto err;
		}

		if (ret)
			goto err;

		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot_tree);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			goto err;

		u->v.master_subvol = cpu_to_le32(subvol_id);
		st = snapshot_tree_i_to_s_c(u);
	}
out:
err:
fsck_err:
	bch2_trans_iter_exit(trans, &snapshot_iter);
	printbuf_exit(&buf);
	return ret;
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* For each snapshot_tree, make sure it points to the root of a snapshot tree
|
|
|
|
* and that snapshot entry points back to it, or delete it.
|
|
|
|
*
|
|
|
|
* And, make sure it points to a subvolume within that snapshot tree, or correct
|
|
|
|
* it to point to the oldest subvolume within that snapshot tree.
|
|
|
|
*/
|
|
|
|
/*
 * For each snapshot_tree, make sure it points to the root of a snapshot tree
 * and that snapshot entry points back to it, or delete it.
 *
 * And, make sure it points to a subvolume within that snapshot tree, or correct
 * it to point to the oldest subvolume within that snapshot tree.
 */
int bch2_check_snapshot_trees(struct bch_fs *c)
{
	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter,
			BTREE_ID_snapshot_trees, POS_MIN,
			BTREE_ITER_prefetch, k,
			NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
		check_snapshot_tree(trans, &iter, k)));
	bch_err_fn(c, ret);
	return ret;
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Look up snapshot tree for @tree_id and find root,
|
|
|
|
* make sure @snap_id is a descendent:
|
|
|
|
*/
|
|
|
|
/*
 * Look up snapshot tree for @tree_id and find root,
 * make sure @snap_id is a descendent.  A missing tree counts as
 * "not good" (returns 0); other lookup errors are propagated.
 */
static int snapshot_tree_ptr_good(struct btree_trans *trans,
				  u32 snap_id, u32 tree_id)
{
	struct bch_snapshot_tree s_t;
	int ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);

	if (bch2_err_matches(ret, ENOENT))
		return 0;
	if (ret)
		return ret;

	return bch2_snapshot_is_ancestor_early(trans->c, snap_id, le32_to_cpu(s_t.root_snapshot));
}
|
|
|
|
|
|
|
|
/*
 * Pick a skiplist entry for a new node below @id: a uniformly random
 * ancestor of @id (by choosing a random depth above it), or @id itself
 * if it has no parent.  Runs under RCU.
 */
u32 bch2_snapshot_skiplist_get(struct bch_fs *c, u32 id)
{
	if (!id)
		return 0;

	guard(rcu)();
	const struct snapshot_t *s = snapshot_t(c, id);
	return s->parent
		? bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth))
		: id;
}
|
|
|
|
|
2023-08-28 15:17:31 -04:00
|
|
|
/*
 * Validate the skiplist of snapshot @id: a root node must have all-zero skip
 * entries; a non-root node's skip entries must all be ancestors of @id.
 * Returns true/false (int); used as a >= 0 "good" result by check_snapshot().
 */
static int snapshot_skiplist_good(struct btree_trans *trans, u32 id, struct bch_snapshot s)
{
	unsigned i;

	for (i = 0; i < 3; i++)
		if (!s.parent) {
			if (s.skip[i])
				return false;
		} else {
			if (!bch2_snapshot_is_ancestor_early(trans->c, id, le32_to_cpu(s.skip[i])))
				return false;
		}

	return true;
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* snapshot_tree pointer was incorrect: look up root snapshot node, make sure
|
|
|
|
* its snapshot_tree pointer is correct (allocate new one if necessary), then
|
|
|
|
* update this node's pointer to root node's pointer:
|
|
|
|
*/
|
|
|
|
static int snapshot_tree_ptr_repair(struct btree_trans *trans,
|
|
|
|
struct btree_iter *iter,
|
|
|
|
struct bkey_s_c k,
|
|
|
|
struct bch_snapshot *s)
|
|
|
|
{
|
|
|
|
struct bch_fs *c = trans->c;
|
|
|
|
struct btree_iter root_iter;
|
|
|
|
struct bch_snapshot_tree s_t;
|
|
|
|
struct bkey_s_c_snapshot root;
|
|
|
|
struct bkey_i_snapshot *u;
|
|
|
|
u32 root_id = bch2_snapshot_root(c, k.k->p.offset), tree_id;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
root = bch2_bkey_get_iter_typed(trans, &root_iter,
|
|
|
|
BTREE_ID_snapshots, POS(0, root_id),
|
2024-04-07 18:05:34 -04:00
|
|
|
BTREE_ITER_with_updates, snapshot);
|
2023-08-16 16:54:33 -04:00
|
|
|
ret = bkey_err(root);
|
|
|
|
if (ret)
|
|
|
|
goto err;
|
|
|
|
|
|
|
|
tree_id = le32_to_cpu(root.v->tree);
|
|
|
|
|
|
|
|
ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);
|
|
|
|
if (ret && !bch2_err_matches(ret, ENOENT))
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
if (ret || le32_to_cpu(s_t.root_snapshot) != root_id) {
|
|
|
|
u = bch2_bkey_make_mut_typed(trans, &root_iter, &root.s_c, 0, snapshot);
|
|
|
|
ret = PTR_ERR_OR_ZERO(u) ?:
|
|
|
|
bch2_snapshot_tree_create(trans, root_id,
|
2025-05-19 09:48:50 -04:00
|
|
|
bch2_snapshot_oldest_subvol(c, root_id, NULL),
|
2023-08-16 16:54:33 -04:00
|
|
|
&tree_id);
|
|
|
|
if (ret)
|
|
|
|
goto err;
|
|
|
|
|
|
|
|
u->v.tree = cpu_to_le32(tree_id);
|
|
|
|
if (k.k->p.offset == root_id)
|
|
|
|
*s = u->v;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (k.k->p.offset != root_id) {
|
|
|
|
u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
|
|
|
|
ret = PTR_ERR_OR_ZERO(u);
|
|
|
|
if (ret)
|
|
|
|
goto err;
|
|
|
|
|
|
|
|
u->v.tree = cpu_to_le32(tree_id);
|
|
|
|
*s = u->v;
|
|
|
|
}
|
|
|
|
err:
|
|
|
|
bch2_trans_iter_exit(trans, &root_iter);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Verify a single snapshot key: parent/child back-pointers, subvolume
 * pointer, snapshot_tree pointer, depth, and skiplist — repairing the
 * repairable fields in place via bch2_bkey_make_mut_typed().
 *
 * Iterated newest-to-oldest by bch2_check_snapshots() so that a parent's
 * depth is already correct when its children are checked.
 */
static int check_snapshot(struct btree_trans *trans,
			  struct btree_iter *iter,
			  struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct bch_snapshot s;
	struct bch_subvolume subvol;
	struct bch_snapshot v;
	struct bkey_i_snapshot *u;
	u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset);
	u32 real_depth;
	struct printbuf buf = PRINTBUF;
	u32 i, id;
	int ret = 0;

	if (k.k->type != KEY_TYPE_snapshot)
		return 0;

	/*
	 * Copy the value out defensively: the on-disk key may be an older,
	 * shorter version of struct bch_snapshot, so zero-fill first.
	 */
	memset(&s, 0, sizeof(s));
	memcpy(&s, k.v, min(sizeof(s), bkey_val_bytes(k.k)));

	if (BCH_SNAPSHOT_DELETED(&s))
		return 0;

	/* Parent must exist and have a child pointer back to us: */
	id = le32_to_cpu(s.parent);
	if (id) {
		ret = bch2_snapshot_lookup(trans, id, &v);
		if (bch2_err_matches(ret, ENOENT))
			bch_err(c, "snapshot with nonexistent parent:\n %s",
				(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
		if (ret)
			goto err;

		if (le32_to_cpu(v.children[0]) != k.k->p.offset &&
		    le32_to_cpu(v.children[1]) != k.k->p.offset) {
			bch_err(c, "snapshot parent %u missing pointer to child %llu",
				id, k.k->p.offset);
			ret = -EINVAL;
			goto err;
		}
	}

	/* Each child must exist and point back at us as its parent: */
	for (i = 0; i < 2 && s.children[i]; i++) {
		id = le32_to_cpu(s.children[i]);

		ret = bch2_snapshot_lookup(trans, id, &v);
		if (bch2_err_matches(ret, ENOENT))
			bch_err(c, "snapshot node %llu has nonexistent child %u",
				k.k->p.offset, id);
		if (ret)
			goto err;

		if (le32_to_cpu(v.parent) != k.k->p.offset) {
			bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)",
				id, le32_to_cpu(v.parent), k.k->p.offset);
			ret = -EINVAL;
			goto err;
		}
	}

	/*
	 * A node should point at a subvolume iff it is a leaf owned by one
	 * and not already queued for deletion:
	 */
	bool should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) &&
		!BCH_SNAPSHOT_WILL_DELETE(&s);

	if (should_have_subvol) {
		id = le32_to_cpu(s.subvol);
		ret = bch2_subvolume_get(trans, id, false, &subvol);
		if (bch2_err_matches(ret, ENOENT))
			bch_err(c, "snapshot points to nonexistent subvolume:\n %s",
				(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
		if (ret)
			goto err;

		if (BCH_SNAPSHOT_SUBVOL(&s) != (le32_to_cpu(subvol.snapshot) == k.k->p.offset)) {
			bch_err(c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL",
				k.k->p.offset);
			ret = -EINVAL;
			goto err;
		}
	} else {
		if (fsck_err_on(s.subvol,
				trans, snapshot_should_not_have_subvol,
				"snapshot should not point to subvol:\n%s",
				(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
			u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
			ret = PTR_ERR_OR_ZERO(u);
			if (ret)
				goto err;

			u->v.subvol = 0;
			s = u->v;
		}
	}

	/* snapshot_tree pointer: >0 good, 0 bad, <0 error */
	ret = snapshot_tree_ptr_good(trans, k.k->p.offset, le32_to_cpu(s.tree));
	if (ret < 0)
		goto err;

	if (fsck_err_on(!ret,
			trans, snapshot_to_bad_snapshot_tree,
			"snapshot points to missing/incorrect tree:\n%s",
			(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
		ret = snapshot_tree_ptr_repair(trans, iter, k, &s);
		if (ret)
			goto err;
	}
	ret = 0;

	/* Depth must be parent's depth + 1 (parents checked first): */
	real_depth = bch2_snapshot_depth(c, parent_id);

	if (fsck_err_on(le32_to_cpu(s.depth) != real_depth,
			trans, snapshot_bad_depth,
			"snapshot with incorrect depth field, should be %u:\n%s",
			real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			goto err;

		u->v.depth = cpu_to_le32(real_depth);
		s = u->v;
	}

	/* Skiplist entries must all be ancestors (or zero for a root): */
	ret = snapshot_skiplist_good(trans, k.k->p.offset, s);
	if (ret < 0)
		goto err;

	if (fsck_err_on(!ret,
			trans, snapshot_bad_skiplist,
			"snapshot with bad skiplist field:\n%s",
			(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			goto err;

		/* Regenerate skiplist with random ancestors, kept sorted: */
		for (i = 0; i < ARRAY_SIZE(u->v.skip); i++)
			u->v.skip[i] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent_id));

		bubble_sort(u->v.skip, ARRAY_SIZE(u->v.skip), cmp_le32);
		s = u->v;
	}
	ret = 0;
err:
fsck_err:
	printbuf_exit(&buf);
	return ret;
}
|
|
|
|
|
|
|
|
/*
 * fsck pass: validate every key in the snapshots btree via check_snapshot().
 */
int bch2_check_snapshots(struct bch_fs *c)
{
	/*
	 * We iterate backwards as checking/fixing the depth field requires that
	 * the parent's depth already be correct:
	 */
	int ret = bch2_trans_run(c,
		for_each_btree_key_reverse_commit(trans, iter,
				BTREE_ID_snapshots, POS_MAX,
				BTREE_ITER_prefetch, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			check_snapshot(trans, &iter, k)));
	bch_err_fn(c, ret);
	return ret;
}
|
|
|
|
|
2024-03-27 22:50:19 -04:00
|
|
|
/*
 * Recreate a missing snapshot node @id: find (or create) the snapshot_tree
 * it belongs to, find any subvolume pointing at it, and insert a fresh
 * snapshot key.
 *
 * Fix: the error from the subvolume-scan loop was previously ignored, so an
 * iteration failure could insert a snapshot key missing its subvol pointer;
 * it is now checked before building the key.
 */
static int check_snapshot_exists(struct btree_trans *trans, u32 id)
{
	struct bch_fs *c = trans->c;

	/* Do we need to reconstruct the snapshot_tree entry as well? */
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = 0;
	u32 tree_id = 0;

	/* Find an existing snapshot_tree whose root is @id: */
	for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshot_trees, POS_MIN,
				     0, k, ret) {
		if (k.k->type == KEY_TYPE_snapshot_tree &&
		    le32_to_cpu(bkey_s_c_to_snapshot_tree(k).v->root_snapshot) == id) {
			tree_id = k.k->p.offset;
			break;
		}
	}
	bch2_trans_iter_exit(trans, &iter);

	if (ret)
		return ret;

	if (!tree_id) {
		ret = bch2_snapshot_tree_create(trans, id, 0, &tree_id);
		if (ret)
			return ret;
	}

	struct bkey_i_snapshot *snapshot = bch2_trans_kmalloc(trans, sizeof(*snapshot));
	ret = PTR_ERR_OR_ZERO(snapshot);
	if (ret)
		return ret;

	bkey_snapshot_init(&snapshot->k_i);
	snapshot->k.p		= POS(0, id);
	snapshot->v.tree	= cpu_to_le32(tree_id);
	snapshot->v.btime.lo	= cpu_to_le64(bch2_current_time(c));

	/* Does a subvolume point at this snapshot? If so, link it back: */
	for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN,
				     0, k, ret) {
		if (k.k->type == KEY_TYPE_subvolume &&
		    le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot) == id) {
			snapshot->v.subvol = cpu_to_le32(k.k->p.offset);
			SET_BCH_SNAPSHOT_SUBVOL(&snapshot->v, true);
			break;
		}
	}
	bch2_trans_iter_exit(trans, &iter);

	if (ret)
		return ret;

	return  bch2_snapshot_table_make_room(c, id) ?:
		bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0);
}
|
|
|
|
|
|
|
|
/* Figure out which snapshot nodes belong in the same tree: */
struct snapshot_tree_reconstruct {
	enum btree_id		btree;		/* btree currently being scanned */
	struct bpos		cur_pos;	/* position of the current run of keys */
	snapshot_id_list	cur_ids;	/* snapshot IDs seen at cur_pos */
	DARRAY(snapshot_id_list) trees;		/* disjoint sets of related snapshot IDs */
};
|
|
|
|
|
|
|
|
/* Free all darrays owned by @r, including each per-tree ID list: */
static void snapshot_tree_reconstruct_exit(struct snapshot_tree_reconstruct *r)
{
	darray_for_each(r->trees, i)
		darray_exit(i);
	darray_exit(&r->trees);
	darray_exit(&r->cur_ids);
}
|
|
|
|
|
|
|
|
static inline bool same_snapshot(struct snapshot_tree_reconstruct *r, struct bpos pos)
|
|
|
|
{
|
|
|
|
return r->btree == BTREE_ID_inodes
|
|
|
|
? r->cur_pos.offset == pos.offset
|
|
|
|
: r->cur_pos.inode == pos.inode;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* True if any snapshot ID appears in both @l and @r: */
static inline bool snapshot_id_lists_have_common(snapshot_id_list *l, snapshot_id_list *r)
{
	return darray_find_p(*l, i, snapshot_list_has_id(r, *i)) != NULL;
}
|
|
|
|
|
|
|
|
/* Print the IDs in @s as a space-separated list: */
static void snapshot_id_list_to_text(struct printbuf *out, snapshot_id_list *s)
{
	darray_for_each(*s, i) {
		/* separator before every entry except the first */
		if (i != s->data)
			prt_char(out, ' ');
		prt_printf(out, "%u", *i);
	}
}
|
|
|
|
|
|
|
|
static int snapshot_tree_reconstruct_next(struct bch_fs *c, struct snapshot_tree_reconstruct *r)
|
|
|
|
{
|
|
|
|
if (r->cur_ids.nr) {
|
|
|
|
darray_for_each(r->trees, i)
|
|
|
|
if (snapshot_id_lists_have_common(i, &r->cur_ids)) {
|
|
|
|
int ret = snapshot_list_merge(c, i, &r->cur_ids);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
darray_push(&r->trees, r->cur_ids);
|
|
|
|
darray_init(&r->cur_ids);
|
|
|
|
}
|
|
|
|
out:
|
|
|
|
r->cur_ids.nr = 0;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int get_snapshot_trees(struct bch_fs *c, struct snapshot_tree_reconstruct *r, struct bpos pos)
|
|
|
|
{
|
|
|
|
if (!same_snapshot(r, pos))
|
|
|
|
snapshot_tree_reconstruct_next(c, r);
|
|
|
|
r->cur_pos = pos;
|
|
|
|
return snapshot_list_add_nodup(c, &r->cur_ids, pos.snapshot);
|
|
|
|
}
|
|
|
|
|
|
|
|
int bch2_reconstruct_snapshots(struct bch_fs *c)
|
|
|
|
{
|
|
|
|
struct btree_trans *trans = bch2_trans_get(c);
|
|
|
|
struct printbuf buf = PRINTBUF;
|
|
|
|
struct snapshot_tree_reconstruct r = {};
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) {
|
|
|
|
if (btree_type_has_snapshots(btree)) {
|
|
|
|
r.btree = btree;
|
|
|
|
|
|
|
|
ret = for_each_btree_key(trans, iter, btree, POS_MIN,
|
2024-04-07 18:05:34 -04:00
|
|
|
BTREE_ITER_all_snapshots|BTREE_ITER_prefetch, k, ({
|
2024-03-27 22:50:19 -04:00
|
|
|
get_snapshot_trees(c, &r, k.k->p);
|
|
|
|
}));
|
|
|
|
if (ret)
|
|
|
|
goto err;
|
|
|
|
|
|
|
|
snapshot_tree_reconstruct_next(c, &r);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
darray_for_each(r.trees, t) {
|
|
|
|
printbuf_reset(&buf);
|
|
|
|
snapshot_id_list_to_text(&buf, t);
|
|
|
|
|
|
|
|
darray_for_each(*t, id) {
|
2025-05-02 12:37:36 -04:00
|
|
|
if (fsck_err_on(bch2_snapshot_id_state(c, *id) == SNAPSHOT_ID_empty,
|
2024-02-08 21:10:32 -05:00
|
|
|
trans, snapshot_node_missing,
|
2024-03-28 00:39:11 -04:00
|
|
|
"snapshot node %u from tree %s missing, recreate?", *id, buf.buf)) {
|
2024-03-27 22:50:19 -04:00
|
|
|
if (t->nr > 1) {
|
|
|
|
bch_err(c, "cannot reconstruct snapshot trees with multiple nodes");
|
2025-05-28 11:57:50 -04:00
|
|
|
ret = bch_err_throw(c, fsck_repair_unimplemented);
|
2024-03-27 22:50:19 -04:00
|
|
|
goto err;
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
|
|
|
|
check_snapshot_exists(trans, *id));
|
|
|
|
if (ret)
|
|
|
|
goto err;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
fsck_err:
|
|
|
|
err:
|
|
|
|
bch2_trans_put(trans);
|
|
|
|
snapshot_tree_reconstruct_exit(&r);
|
|
|
|
printbuf_exit(&buf);
|
|
|
|
bch_err_fn(c, ret);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2025-05-02 12:37:36 -04:00
|
|
|
/*
 * Check that the snapshot ID of @k refers to a live snapshot node; delete
 * the key (returning 1) if its snapshot was deleted, or — when the snapshot
 * is simply missing — require the snapshot/subvol fsck passes before
 * offering deletion as a repair.
 */
int __bch2_check_key_has_snapshot(struct btree_trans *trans,
				  struct btree_iter *iter,
				  struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	int ret = 0;
	enum snapshot_id_state state = bch2_snapshot_id_state(c, k.k->p.snapshot);

	/* Snapshot was definitively deleted, this error is marked autofix */
	if (fsck_err_on(state == SNAPSHOT_ID_deleted,
			trans, bkey_in_deleted_snapshot,
			"key in deleted snapshot %s, delete?",
			(bch2_btree_id_to_text(&buf, iter->btree_id),
			 prt_char(&buf, ' '),
			 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
		ret = bch2_btree_delete_at(trans, iter,
					   BTREE_UPDATE_internal_snapshot_node) ?: 1;

	if (state == SNAPSHOT_ID_empty) {
		/*
		 * Snapshot missing: we should have caught this with btree_lost_data and
		 * kicked off reconstruct_snapshots, so if we end up here we have no
		 * idea what happened.
		 *
		 * Do not delete unless we know that subvolumes and snapshots
		 * are consistent:
		 *
		 * XXX:
		 *
		 * We could be smarter here, and instead of using the generic
		 * recovery pass ratelimiting, track if there have been any
		 * changes to the snapshots or inodes btrees since those passes
		 * last ran.
		 */
		ret = bch2_require_recovery_pass(c, &buf, BCH_RECOVERY_PASS_check_snapshots) ?: ret;
		ret = bch2_require_recovery_pass(c, &buf, BCH_RECOVERY_PASS_check_subvols) ?: ret;

		if (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_snapshots))
			ret = bch2_require_recovery_pass(c, &buf, BCH_RECOVERY_PASS_reconstruct_snapshots) ?: ret;

		/* Only offer the fix when no recovery pass is still pending: */
		unsigned repair_flags = FSCK_CAN_IGNORE | (!ret ? FSCK_CAN_FIX : 0);

		if (__fsck_err(trans, repair_flags, bkey_in_missing_snapshot,
			       "key in missing snapshot %s, delete?",
			       (bch2_btree_id_to_text(&buf, iter->btree_id),
				prt_char(&buf, ' '),
				bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
			ret = bch2_btree_delete_at(trans, iter,
						   BTREE_UPDATE_internal_snapshot_node) ?: 1;
		}
	}
fsck_err:
	printbuf_exit(&buf);
	return ret;
}
|
|
|
|
|
2025-05-28 15:08:19 -04:00
|
|
|
/*
 * Collect into @s the snapshot IDs of keys at (@btree, @pos) that are
 * ancestors of pos.snapshot — i.e. versions this key overwrites — skipping
 * IDs already covered by an entry in @s.  On error, @s is freed.
 */
int __bch2_get_snapshot_overwrites(struct btree_trans *trans,
				   enum btree_id btree, struct bpos pos,
				   snapshot_id_list *s)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = 0;

	/* Walk backwards from just before @pos over all snapshot versions: */
	for_each_btree_key_reverse_norestart(trans, iter, btree, bpos_predecessor(pos),
					     BTREE_ITER_all_snapshots, k, ret) {
		if (!bkey_eq(k.k->p, pos))
			break;

		if (!bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot) ||
		    snapshot_list_has_ancestor(c, s, k.k->p.snapshot))
			continue;

		ret = snapshot_list_add(c, s, k.k->p.snapshot);
		if (ret)
			break;
	}
	bch2_trans_iter_exit(trans, &iter);
	if (ret)
		darray_exit(s);

	return ret;
}
|
|
|
|
|
2023-08-16 16:54:33 -04:00
|
|
|
/*
 * Mark a snapshot as deleted, for future cleanup:
 */
int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id)
{
	struct btree_iter iter;
	struct bkey_i_snapshot *s =
		bch2_bkey_get_mut_typed(trans, &iter,
				    BTREE_ID_snapshots, POS(0, id),
				    0, snapshot);
	int ret = PTR_ERR_OR_ZERO(s);
	if (unlikely(ret)) {
		bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT),
					trans->c, "missing snapshot %u", id);
		return ret;
	}

	/* already deleted? */
	if (BCH_SNAPSHOT_WILL_DELETE(&s->v))
		goto err;

	/* Flag for deletion and drop the subvolume pointer: */
	SET_BCH_SNAPSHOT_WILL_DELETE(&s->v, true);
	SET_BCH_SNAPSHOT_SUBVOL(&s->v, false);
	s->v.subvol = 0;
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
|
|
|
|
|
bcachefs: Cleanup redundant snapshot nodes
After deleteing snapshots, we may be left with a snapshot tree where
some nodes only have one child, and we have a linear chain.
Interior snapshot nodes are never used directly (i.e. they never have
subvolumes that point to them), they are only referered to by child
snapshot nodes - hence, they are redundant.
The existing code talks about redundant snapshot nodes as forming and
equivalence class; i.e. nodes for which snapshot_t->equiv is equal. In a
given equivalence class, we only ever need a single key at a given
position - i.e. multiple versions with different snapshot fields are
redundant.
The existing snapshot cleanup code deletes these redundant keys, but not
redundant nodes. It turns out this is buggy, because we assume that
after snapshot deletion finishes we should only have a single key per
equivalence class, but the btree update path doesn't preserve this -
overwriting keys in old snapshots doesn't check for the equivalence
class being equal, and thus we can end up with duplicate keys in the
same equivalence class and fsck complaining about snapshot deletion not
having run correctly.
The equivalence class notion has been leaking out of the core snapshots
code and into too much other code, i.e. fsck, so this patch takes a
different approach: snapshot deletion now moves keys to the node in an
equivalence class being kept (the leafiest node) and then deletes the
redundant nodes in the equivalance class.
Some work has to be done to correctly delete interior snapshot nodes;
snapshot node depth and skiplist fields for descendent nodes have to be
fixed.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2023-08-17 22:10:02 -04:00
|
|
|
/* Keep children[] sorted descending so children[0] is the larger ID: */
static inline void normalize_snapshot_child_pointers(struct bch_snapshot *s)
{
	if (le32_to_cpu(s->children[0]) < le32_to_cpu(s->children[1]))
		swap(s->children[0], s->children[1]);
}
|
|
|
|
|
2023-09-12 18:41:22 -04:00
|
|
|
static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
|
2023-08-16 16:54:33 -04:00
|
|
|
{
|
|
|
|
struct bch_fs *c = trans->c;
|
2025-03-21 15:18:51 -04:00
|
|
|
struct btree_iter iter, p_iter = {};
|
|
|
|
struct btree_iter c_iter = {};
|
|
|
|
struct btree_iter tree_iter = {};
|
bcachefs: Cleanup redundant snapshot nodes
After deleteing snapshots, we may be left with a snapshot tree where
some nodes only have one child, and we have a linear chain.
Interior snapshot nodes are never used directly (i.e. they never have
subvolumes that point to them), they are only referered to by child
snapshot nodes - hence, they are redundant.
The existing code talks about redundant snapshot nodes as forming and
equivalence class; i.e. nodes for which snapshot_t->equiv is equal. In a
given equivalence class, we only ever need a single key at a given
position - i.e. multiple versions with different snapshot fields are
redundant.
The existing snapshot cleanup code deletes these redundant keys, but not
redundant nodes. It turns out this is buggy, because we assume that
after snapshot deletion finishes we should only have a single key per
equivalence class, but the btree update path doesn't preserve this -
overwriting keys in old snapshots doesn't check for the equivalence
class being equal, and thus we can end up with duplicate keys in the
same equivalence class and fsck complaining about snapshot deletion not
having run correctly.
The equivalence class notion has been leaking out of the core snapshots
code and into too much other code, i.e. fsck, so this patch takes a
different approach: snapshot deletion now moves keys to the node in an
equivalence class being kept (the leafiest node) and then deletes the
redundant nodes in the equivalance class.
Some work has to be done to correctly delete interior snapshot nodes;
snapshot node depth and skiplist fields for descendent nodes have to be
fixed.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2023-08-17 22:10:02 -04:00
|
|
|
u32 parent_id, child_id;
|
2023-08-16 16:54:33 -04:00
|
|
|
unsigned i;
|
|
|
|
int ret = 0;
|
|
|
|
|
2025-05-02 12:37:36 -04:00
|
|
|
struct bkey_i_snapshot *s =
|
|
|
|
bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id),
|
|
|
|
BTREE_ITER_intent, snapshot);
|
|
|
|
ret = PTR_ERR_OR_ZERO(s);
|
2023-08-16 16:54:33 -04:00
|
|
|
bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
|
|
|
|
"missing snapshot %u", id);
|
|
|
|
|
|
|
|
if (ret)
|
|
|
|
goto err;
|
|
|
|
|
2025-05-02 12:37:36 -04:00
|
|
|
BUG_ON(BCH_SNAPSHOT_DELETED(&s->v));
|
|
|
|
BUG_ON(s->v.children[1]);
|
bcachefs: Cleanup redundant snapshot nodes
After deleteing snapshots, we may be left with a snapshot tree where
some nodes only have one child, and we have a linear chain.
Interior snapshot nodes are never used directly (i.e. they never have
subvolumes that point to them), they are only referered to by child
snapshot nodes - hence, they are redundant.
The existing code talks about redundant snapshot nodes as forming and
equivalence class; i.e. nodes for which snapshot_t->equiv is equal. In a
given equivalence class, we only ever need a single key at a given
position - i.e. multiple versions with different snapshot fields are
redundant.
The existing snapshot cleanup code deletes these redundant keys, but not
redundant nodes. It turns out this is buggy, because we assume that
after snapshot deletion finishes we should only have a single key per
equivalence class, but the btree update path doesn't preserve this -
overwriting keys in old snapshots doesn't check for the equivalence
class being equal, and thus we can end up with duplicate keys in the
same equivalence class and fsck complaining about snapshot deletion not
having run correctly.
The equivalence class notion has been leaking out of the core snapshots
code and into too much other code, i.e. fsck, so this patch takes a
different approach: snapshot deletion now moves keys to the node in an
equivalence class being kept (the leafiest node) and then deletes the
redundant nodes in the equivalance class.
Some work has to be done to correctly delete interior snapshot nodes;
snapshot node depth and skiplist fields for descendent nodes have to be
fixed.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2023-08-17 22:10:02 -04:00
|
|
|
|
2025-05-02 12:37:36 -04:00
|
|
|
parent_id = le32_to_cpu(s->v.parent);
|
|
|
|
child_id = le32_to_cpu(s->v.children[0]);
|
2023-08-16 16:54:33 -04:00
|
|
|
|
|
|
|
if (parent_id) {
|
|
|
|
struct bkey_i_snapshot *parent;
|
|
|
|
|
|
|
|
parent = bch2_bkey_get_mut_typed(trans, &p_iter,
|
|
|
|
BTREE_ID_snapshots, POS(0, parent_id),
|
|
|
|
0, snapshot);
|
|
|
|
ret = PTR_ERR_OR_ZERO(parent);
|
bcachefs: Cleanup redundant snapshot nodes
After deleteing snapshots, we may be left with a snapshot tree where
some nodes only have one child, and we have a linear chain.
Interior snapshot nodes are never used directly (i.e. they never have
subvolumes that point to them), they are only referered to by child
snapshot nodes - hence, they are redundant.
The existing code talks about redundant snapshot nodes as forming and
equivalence class; i.e. nodes for which snapshot_t->equiv is equal. In a
given equivalence class, we only ever need a single key at a given
position - i.e. multiple versions with different snapshot fields are
redundant.
The existing snapshot cleanup code deletes these redundant keys, but not
redundant nodes. It turns out this is buggy, because we assume that
after snapshot deletion finishes we should only have a single key per
equivalence class, but the btree update path doesn't preserve this -
overwriting keys in old snapshots doesn't check for the equivalence
class being equal, and thus we can end up with duplicate keys in the
same equivalence class and fsck complaining about snapshot deletion not
having run correctly.
The equivalence class notion has been leaking out of the core snapshots
code and into too much other code, i.e. fsck, so this patch takes a
different approach: snapshot deletion now moves keys to the node in an
equivalence class being kept (the leafiest node) and then deletes the
redundant nodes in the equivalance class.
Some work has to be done to correctly delete interior snapshot nodes;
snapshot node depth and skiplist fields for descendent nodes have to be
fixed.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2023-08-17 22:10:02 -04:00
|
|
|
bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
|
|
|
|
"missing snapshot %u", parent_id);
|
|
|
|
if (unlikely(ret))
|
2023-08-16 16:54:33 -04:00
|
|
|
goto err;
|
|
|
|
|
bcachefs: Cleanup redundant snapshot nodes
After deleteing snapshots, we may be left with a snapshot tree where
some nodes only have one child, and we have a linear chain.
Interior snapshot nodes are never used directly (i.e. they never have
subvolumes that point to them), they are only referered to by child
snapshot nodes - hence, they are redundant.
The existing code talks about redundant snapshot nodes as forming and
equivalence class; i.e. nodes for which snapshot_t->equiv is equal. In a
given equivalence class, we only ever need a single key at a given
position - i.e. multiple versions with different snapshot fields are
redundant.
The existing snapshot cleanup code deletes these redundant keys, but not
redundant nodes. It turns out this is buggy, because we assume that
after snapshot deletion finishes we should only have a single key per
equivalence class, but the btree update path doesn't preserve this -
overwriting keys in old snapshots doesn't check for the equivalence
class being equal, and thus we can end up with duplicate keys in the
same equivalence class and fsck complaining about snapshot deletion not
having run correctly.
The equivalence class notion has been leaking out of the core snapshots
code and into too much other code, i.e. fsck, so this patch takes a
different approach: snapshot deletion now moves keys to the node in an
equivalence class being kept (the leafiest node) and then deletes the
redundant nodes in the equivalance class.
Some work has to be done to correctly delete interior snapshot nodes;
snapshot node depth and skiplist fields for descendent nodes have to be
fixed.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2023-08-17 22:10:02 -04:00
|
|
|
/* find entry in parent->children for node being deleted */
|
2023-08-16 16:54:33 -04:00
|
|
|
for (i = 0; i < 2; i++)
|
|
|
|
if (le32_to_cpu(parent->v.children[i]) == id)
|
|
|
|
break;
|
|
|
|
|
bcachefs: Cleanup redundant snapshot nodes
After deleteing snapshots, we may be left with a snapshot tree where
some nodes only have one child, and we have a linear chain.
Interior snapshot nodes are never used directly (i.e. they never have
subvolumes that point to them), they are only referered to by child
snapshot nodes - hence, they are redundant.
The existing code talks about redundant snapshot nodes as forming and
equivalence class; i.e. nodes for which snapshot_t->equiv is equal. In a
given equivalence class, we only ever need a single key at a given
position - i.e. multiple versions with different snapshot fields are
redundant.
The existing snapshot cleanup code deletes these redundant keys, but not
redundant nodes. It turns out this is buggy, because we assume that
after snapshot deletion finishes we should only have a single key per
equivalence class, but the btree update path doesn't preserve this -
overwriting keys in old snapshots doesn't check for the equivalence
class being equal, and thus we can end up with duplicate keys in the
same equivalence class and fsck complaining about snapshot deletion not
having run correctly.
The equivalence class notion has been leaking out of the core snapshots
code and into too much other code, i.e. fsck, so this patch takes a
different approach: snapshot deletion now moves keys to the node in an
equivalence class being kept (the leafiest node) and then deletes the
redundant nodes in the equivalance class.
Some work has to be done to correctly delete interior snapshot nodes;
snapshot node depth and skiplist fields for descendent nodes have to be
fixed.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2023-08-17 22:10:02 -04:00
|
|
|
if (bch2_fs_inconsistent_on(i == 2, c,
|
|
|
|
"snapshot %u missing child pointer to %u",
|
|
|
|
parent_id, id))
|
|
|
|
goto err;
|
|
|
|
|
2023-11-11 16:31:20 -05:00
|
|
|
parent->v.children[i] = cpu_to_le32(child_id);
|
2023-08-16 16:54:33 -04:00
|
|
|
|
bcachefs: Cleanup redundant snapshot nodes
After deleteing snapshots, we may be left with a snapshot tree where
some nodes only have one child, and we have a linear chain.
Interior snapshot nodes are never used directly (i.e. they never have
subvolumes that point to them), they are only referered to by child
snapshot nodes - hence, they are redundant.
The existing code talks about redundant snapshot nodes as forming and
equivalence class; i.e. nodes for which snapshot_t->equiv is equal. In a
given equivalence class, we only ever need a single key at a given
position - i.e. multiple versions with different snapshot fields are
redundant.
The existing snapshot cleanup code deletes these redundant keys, but not
redundant nodes. It turns out this is buggy, because we assume that
after snapshot deletion finishes we should only have a single key per
equivalence class, but the btree update path doesn't preserve this -
overwriting keys in old snapshots doesn't check for the equivalence
class being equal, and thus we can end up with duplicate keys in the
same equivalence class and fsck complaining about snapshot deletion not
having run correctly.
The equivalence class notion has been leaking out of the core snapshots
code and into too much other code, i.e. fsck, so this patch takes a
different approach: snapshot deletion now moves keys to the node in an
equivalence class being kept (the leafiest node) and then deletes the
redundant nodes in the equivalance class.
Some work has to be done to correctly delete interior snapshot nodes;
snapshot node depth and skiplist fields for descendent nodes have to be
fixed.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2023-08-17 22:10:02 -04:00
|
|
|
normalize_snapshot_child_pointers(&parent->v);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (child_id) {
|
|
|
|
struct bkey_i_snapshot *child;
|
|
|
|
|
|
|
|
child = bch2_bkey_get_mut_typed(trans, &c_iter,
|
|
|
|
BTREE_ID_snapshots, POS(0, child_id),
|
|
|
|
0, snapshot);
|
|
|
|
ret = PTR_ERR_OR_ZERO(child);
|
|
|
|
bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
|
|
|
|
"missing snapshot %u", child_id);
|
|
|
|
if (unlikely(ret))
|
|
|
|
goto err;
|
|
|
|
|
|
|
|
child->v.parent = cpu_to_le32(parent_id);
|
|
|
|
|
|
|
|
if (!child->v.parent) {
|
|
|
|
child->v.skip[0] = 0;
|
|
|
|
child->v.skip[1] = 0;
|
|
|
|
child->v.skip[2] = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!parent_id) {
|
2023-08-16 16:54:33 -04:00
|
|
|
/*
|
|
|
|
* We're deleting the root of a snapshot tree: update the
|
|
|
|
* snapshot_tree entry to point to the new root, or delete it if
|
|
|
|
* this is the last snapshot ID in this tree:
|
|
|
|
*/
|
|
|
|
struct bkey_i_snapshot_tree *s_t;
|
|
|
|
|
2025-05-02 12:37:36 -04:00
|
|
|
BUG_ON(s->v.children[1]);
|
2023-08-16 16:54:33 -04:00
|
|
|
|
|
|
|
s_t = bch2_bkey_get_mut_typed(trans, &tree_iter,
|
2025-05-02 12:37:36 -04:00
|
|
|
BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s->v.tree)),
|
2023-08-16 16:54:33 -04:00
|
|
|
0, snapshot_tree);
|
|
|
|
ret = PTR_ERR_OR_ZERO(s_t);
|
|
|
|
if (ret)
|
|
|
|
goto err;
|
|
|
|
|
2025-05-02 12:37:36 -04:00
|
|
|
if (s->v.children[0]) {
|
|
|
|
s_t->v.root_snapshot = s->v.children[0];
|
2023-08-16 16:54:33 -04:00
|
|
|
} else {
|
|
|
|
s_t->k.type = KEY_TYPE_deleted;
|
|
|
|
set_bkey_val_u64s(&s_t->k, 0);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2025-05-02 12:37:36 -04:00
|
|
|
if (!bch2_request_incompat_feature(c, bcachefs_metadata_version_snapshot_deletion_v2)) {
|
|
|
|
SET_BCH_SNAPSHOT_DELETED(&s->v, true);
|
|
|
|
s->v.parent = 0;
|
|
|
|
s->v.children[0] = 0;
|
|
|
|
s->v.children[1] = 0;
|
|
|
|
s->v.subvol = 0;
|
|
|
|
s->v.tree = 0;
|
|
|
|
s->v.depth = 0;
|
|
|
|
s->v.skip[0] = 0;
|
|
|
|
s->v.skip[1] = 0;
|
|
|
|
s->v.skip[2] = 0;
|
|
|
|
} else {
|
|
|
|
s->k.type = KEY_TYPE_deleted;
|
|
|
|
set_bkey_val_u64s(&s->k, 0);
|
|
|
|
}
|
2023-08-16 16:54:33 -04:00
|
|
|
err:
|
|
|
|
bch2_trans_iter_exit(trans, &tree_iter);
|
|
|
|
bch2_trans_iter_exit(trans, &p_iter);
|
bcachefs: Cleanup redundant snapshot nodes
After deleteing snapshots, we may be left with a snapshot tree where
some nodes only have one child, and we have a linear chain.
Interior snapshot nodes are never used directly (i.e. they never have
subvolumes that point to them), they are only referered to by child
snapshot nodes - hence, they are redundant.
The existing code talks about redundant snapshot nodes as forming and
equivalence class; i.e. nodes for which snapshot_t->equiv is equal. In a
given equivalence class, we only ever need a single key at a given
position - i.e. multiple versions with different snapshot fields are
redundant.
The existing snapshot cleanup code deletes these redundant keys, but not
redundant nodes. It turns out this is buggy, because we assume that
after snapshot deletion finishes we should only have a single key per
equivalence class, but the btree update path doesn't preserve this -
overwriting keys in old snapshots doesn't check for the equivalence
class being equal, and thus we can end up with duplicate keys in the
same equivalence class and fsck complaining about snapshot deletion not
having run correctly.
The equivalence class notion has been leaking out of the core snapshots
code and into too much other code, i.e. fsck, so this patch takes a
different approach: snapshot deletion now moves keys to the node in an
equivalence class being kept (the leafiest node) and then deletes the
redundant nodes in the equivalance class.
Some work has to be done to correctly delete interior snapshot nodes;
snapshot node depth and skiplist fields for descendent nodes have to be
fixed.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2023-08-17 22:10:02 -04:00
|
|
|
bch2_trans_iter_exit(trans, &c_iter);
|
2023-08-16 16:54:33 -04:00
|
|
|
bch2_trans_iter_exit(trans, &iter);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Allocate @nr_snapids new snapshot IDs as children of @parent (or as roots of
 * a new tree when @parent == 0), writing them to @new_snapids and pointing
 * each at the corresponding subvolume in @snapshot_subvols.
 *
 * Returns 0 on success, or a negative error code (including
 * ENOSPC_snapshot_create when no free snapshot IDs remain).
 */
static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree,
			  u32 *new_snapids,
			  u32 *snapshot_subvols,
			  unsigned nr_snapids)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	u32 depth = bch2_snapshot_depth(c, parent);
	int ret;

	/* New snapshot IDs are allocated by scanning backwards for free slots: */
	bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots,
			     POS_MIN, BTREE_ITER_intent);
	struct bkey_s_c k = bch2_btree_iter_peek(trans, &iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	for (unsigned i = 0; i < nr_snapids; i++) {
		k = bch2_btree_iter_prev_slot(trans, &iter);
		ret = bkey_err(k);
		if (ret)
			goto err;

		if (!k.k || !k.k->p.offset) {
			/* Walked off the front of the btree - out of IDs: */
			ret = bch_err_throw(c, ENOSPC_snapshot_create);
			goto err;
		}

		struct bkey_i_snapshot *n = bch2_bkey_alloc(trans, &iter, 0, snapshot);
		ret = PTR_ERR_OR_ZERO(n);
		if (ret)
			goto err;

		n->v.flags	= 0;
		n->v.parent	= cpu_to_le32(parent);
		n->v.subvol	= cpu_to_le32(snapshot_subvols[i]);
		n->v.tree	= cpu_to_le32(tree);
		n->v.depth	= cpu_to_le32(depth);
		n->v.btime.lo	= cpu_to_le64(bch2_current_time(c));
		n->v.btime.hi	= 0;

		for (unsigned j = 0; j < ARRAY_SIZE(n->v.skip); j++)
			n->v.skip[j] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent));

		/* Skiplist entries must be kept sorted: */
		bubble_sort(n->v.skip, ARRAY_SIZE(n->v.skip), cmp_le32);
		SET_BCH_SNAPSHOT_SUBVOL(&n->v, true);

		ret = __bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
					   bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0);
		if (ret)
			goto err;

		new_snapids[i] = iter.pos.offset;
	}
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Create new snapshot IDs as children of an existing snapshot ID:
|
|
|
|
*/
|
|
|
|
static int bch2_snapshot_node_create_children(struct btree_trans *trans, u32 parent,
			      u32 *new_snapids,
			      u32 *snapshot_subvols,
			      unsigned nr_snapids)
{
	struct btree_iter iter;
	struct bkey_i_snapshot *n_parent =
		bch2_bkey_get_mut_typed(trans, &iter,
					BTREE_ID_snapshots, POS(0, parent),
					0, snapshot);
	int ret = PTR_ERR_OR_ZERO(n_parent);
	if (unlikely(ret)) {
		if (bch2_err_matches(ret, ENOENT))
			bch_err(trans->c, "snapshot %u not found", parent);
		return ret;
	}

	/* A snapshot node only ever gets children once: */
	if (n_parent->v.children[0] || n_parent->v.children[1]) {
		bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children");
		ret = -EINVAL;
		goto err;
	}

	ret = create_snapids(trans, parent, le32_to_cpu(n_parent->v.tree),
			     new_snapids, snapshot_subvols, nr_snapids);
	if (ret)
		goto err;

	/*
	 * @parent is now an interior node: point it at its new children and
	 * clear its subvolume - interior nodes are never referenced by
	 * subvolumes directly:
	 */
	n_parent->v.children[0] = cpu_to_le32(new_snapids[0]);
	n_parent->v.children[1] = cpu_to_le32(new_snapids[1]);
	n_parent->v.subvol = 0;
	SET_BCH_SNAPSHOT_SUBVOL(&n_parent->v, false);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Create a snapshot node that is the root of a new tree:
|
|
|
|
*/
|
|
|
|
static int bch2_snapshot_node_create_tree(struct btree_trans *trans,
|
|
|
|
u32 *new_snapids,
|
|
|
|
u32 *snapshot_subvols,
|
|
|
|
unsigned nr_snapids)
|
|
|
|
{
|
|
|
|
struct bkey_i_snapshot_tree *n_tree;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
n_tree = __bch2_snapshot_tree_create(trans);
|
|
|
|
ret = PTR_ERR_OR_ZERO(n_tree) ?:
|
|
|
|
create_snapids(trans, 0, n_tree->k.p.offset,
|
|
|
|
new_snapids, snapshot_subvols, nr_snapids);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
n_tree->v.master_subvol = cpu_to_le32(snapshot_subvols[0]);
|
|
|
|
n_tree->v.root_snapshot = cpu_to_le32(new_snapids[0]);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
			      u32 *new_snapids,
			      u32 *snapshot_subvols,
			      unsigned nr_snapids)
{
	/*
	 * Either one new node (the root of a brand new tree), or exactly two
	 * children of an existing node:
	 */
	BUG_ON((parent == 0) != (nr_snapids == 1));
	BUG_ON((parent != 0) != (nr_snapids == 2));

	if (parent)
		return bch2_snapshot_node_create_children(trans, parent,
				new_snapids, snapshot_subvols, nr_snapids);

	return bch2_snapshot_node_create_tree(trans,
			new_snapids, snapshot_subvols, nr_snapids);
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we have an unlinked inode in an internal snapshot node, and the inode
|
|
|
|
* really has been deleted in all child snapshots, how does this get cleaned up?
|
|
|
|
*
|
|
|
|
* first there is the problem of how keys that have been overwritten in all
|
|
|
|
* child snapshots get deleted (unimplemented?), but inodes may perhaps be
|
|
|
|
* special?
|
|
|
|
*
|
|
|
|
* also: unlinked inode in internal snapshot appears to not be getting deleted
|
|
|
|
* correctly if inode doesn't exist in leaf snapshots
|
bcachefs: Cleanup redundant snapshot nodes
After deleteing snapshots, we may be left with a snapshot tree where
some nodes only have one child, and we have a linear chain.
Interior snapshot nodes are never used directly (i.e. they never have
subvolumes that point to them), they are only referered to by child
snapshot nodes - hence, they are redundant.
The existing code talks about redundant snapshot nodes as forming and
equivalence class; i.e. nodes for which snapshot_t->equiv is equal. In a
given equivalence class, we only ever need a single key at a given
position - i.e. multiple versions with different snapshot fields are
redundant.
The existing snapshot cleanup code deletes these redundant keys, but not
redundant nodes. It turns out this is buggy, because we assume that
after snapshot deletion finishes we should only have a single key per
equivalence class, but the btree update path doesn't preserve this -
overwriting keys in old snapshots doesn't check for the equivalence
class being equal, and thus we can end up with duplicate keys in the
same equivalence class and fsck complaining about snapshot deletion not
having run correctly.
The equivalence class notion has been leaking out of the core snapshots
code and into too much other code, i.e. fsck, so this patch takes a
different approach: snapshot deletion now moves keys to the node in an
equivalence class being kept (the leafiest node) and then deletes the
redundant nodes in the equivalance class.
Some work has to be done to correctly delete interior snapshot nodes;
snapshot node depth and skiplist fields for descendent nodes have to be
fixed.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2023-08-17 22:10:02 -04:00
|
|
|
*
|
|
|
|
* solution:
|
|
|
|
*
|
|
|
|
* for a key in an interior snapshot node that needs work to be done that
|
|
|
|
* requires it to be mutated: iterate over all descendent leaf nodes and copy
|
|
|
|
* that key to snapshot leaf nodes, where we can mutate it
|
2023-08-16 16:54:33 -04:00
|
|
|
*/
|
|
|
|
|
2024-12-12 03:03:58 -05:00
|
|
|
/*
 * If @id is an interior node queued for deletion, return the live child its
 * keys should be moved to; otherwise return 0.
 */
static inline u32 interior_delete_has_id(interior_delete_list *l, u32 id)
{
	darray_for_each(*l, i)
		if (i->id == id)
			return i->live_child;

	return 0;
}
|
2024-05-26 12:38:30 -04:00
|
|
|
|
2024-12-12 03:03:58 -05:00
|
|
|
static unsigned __live_child(struct snapshot_table *t, u32 id,
|
|
|
|
snapshot_id_list *delete_leaves,
|
|
|
|
interior_delete_list *delete_interior)
|
|
|
|
{
|
|
|
|
struct snapshot_t *s = __snapshot_t(t, id);
|
|
|
|
if (!s)
|
2024-05-26 22:22:30 -04:00
|
|
|
return 0;
|
2023-08-16 16:54:33 -04:00
|
|
|
|
2024-12-12 03:03:58 -05:00
|
|
|
for (unsigned i = 0; i < ARRAY_SIZE(s->children); i++)
|
|
|
|
if (s->children[i] &&
|
|
|
|
!snapshot_list_has_id(delete_leaves, s->children[i]) &&
|
|
|
|
!interior_delete_has_id(delete_interior, s->children[i]))
|
|
|
|
return s->children[i];
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < ARRAY_SIZE(s->children); i++) {
|
|
|
|
u32 live_child = s->children[i]
|
|
|
|
? __live_child(t, s->children[i], delete_leaves, delete_interior)
|
|
|
|
: 0;
|
|
|
|
if (live_child)
|
|
|
|
return live_child;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2025-05-01 14:47:39 -04:00
|
|
|
/* Wrapper around __live_child() that handles rcu and the deletion state: */
static unsigned live_child(struct bch_fs *c, u32 id)
{
	struct snapshot_delete *d = &c->snapshot_delete;

	/* The snapshot table is rcu protected: */
	guard(rcu)();
	return __live_child(rcu_dereference(c->snapshots), id,
			    &d->delete_leaves, &d->delete_interior);
}
|
|
|
|
|
2025-05-02 13:23:22 -04:00
|
|
|
static bool snapshot_id_dying(struct snapshot_delete *d, unsigned id)
|
|
|
|
{
|
|
|
|
return snapshot_list_has_id(&d->delete_leaves, id) ||
|
|
|
|
interior_delete_has_id(&d->delete_interior, id) != 0;
|
|
|
|
}
|
|
|
|
|
2024-12-12 03:03:58 -05:00
|
|
|
static int delete_dead_snapshots_process_key(struct btree_trans *trans,
|
|
|
|
struct btree_iter *iter,
|
2025-05-01 14:47:39 -04:00
|
|
|
struct bkey_s_c k)
|
2024-12-12 03:03:58 -05:00
|
|
|
{
|
2025-05-01 14:47:39 -04:00
|
|
|
struct snapshot_delete *d = &trans->c->snapshot_delete;
|
|
|
|
|
|
|
|
if (snapshot_list_has_id(&d->delete_leaves, k.k->p.snapshot))
|
2023-08-16 16:54:33 -04:00
|
|
|
return bch2_btree_delete_at(trans, iter,
|
2024-04-07 18:05:34 -04:00
|
|
|
BTREE_UPDATE_internal_snapshot_node);
|
2023-08-16 16:54:33 -04:00
|
|
|
|
2025-05-01 14:47:39 -04:00
|
|
|
u32 live_child = interior_delete_has_id(&d->delete_interior, k.k->p.snapshot);
|
2024-12-12 03:03:58 -05:00
|
|
|
if (live_child) {
|
bcachefs: Cleanup redundant snapshot nodes
After deleteing snapshots, we may be left with a snapshot tree where
some nodes only have one child, and we have a linear chain.
Interior snapshot nodes are never used directly (i.e. they never have
subvolumes that point to them), they are only referered to by child
snapshot nodes - hence, they are redundant.
The existing code talks about redundant snapshot nodes as forming and
equivalence class; i.e. nodes for which snapshot_t->equiv is equal. In a
given equivalence class, we only ever need a single key at a given
position - i.e. multiple versions with different snapshot fields are
redundant.
The existing snapshot cleanup code deletes these redundant keys, but not
redundant nodes. It turns out this is buggy, because we assume that
after snapshot deletion finishes we should only have a single key per
equivalence class, but the btree update path doesn't preserve this -
overwriting keys in old snapshots doesn't check for the equivalence
class being equal, and thus we can end up with duplicate keys in the
same equivalence class and fsck complaining about snapshot deletion not
having run correctly.
The equivalence class notion has been leaking out of the core snapshots
code and into too much other code, i.e. fsck, so this patch takes a
different approach: snapshot deletion now moves keys to the node in an
equivalence class being kept (the leafiest node) and then deletes the
redundant nodes in the equivalance class.
Some work has to be done to correctly delete interior snapshot nodes;
snapshot node depth and skiplist fields for descendent nodes have to be
fixed.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2023-08-17 22:10:02 -04:00
|
|
|
struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
|
2024-05-26 22:22:30 -04:00
|
|
|
int ret = PTR_ERR_OR_ZERO(new);
|
bcachefs: Cleanup redundant snapshot nodes
After deleteing snapshots, we may be left with a snapshot tree where
some nodes only have one child, and we have a linear chain.
Interior snapshot nodes are never used directly (i.e. they never have
subvolumes that point to them), they are only referered to by child
snapshot nodes - hence, they are redundant.
The existing code talks about redundant snapshot nodes as forming and
equivalence class; i.e. nodes for which snapshot_t->equiv is equal. In a
given equivalence class, we only ever need a single key at a given
position - i.e. multiple versions with different snapshot fields are
redundant.
The existing snapshot cleanup code deletes these redundant keys, but not
redundant nodes. It turns out this is buggy, because we assume that
after snapshot deletion finishes we should only have a single key per
equivalence class, but the btree update path doesn't preserve this -
overwriting keys in old snapshots doesn't check for the equivalence
class being equal, and thus we can end up with duplicate keys in the
same equivalence class and fsck complaining about snapshot deletion not
having run correctly.
The equivalence class notion has been leaking out of the core snapshots
code and into too much other code, i.e. fsck, so this patch takes a
different approach: snapshot deletion now moves keys to the node in an
equivalence class being kept (the leafiest node) and then deletes the
redundant nodes in the equivalance class.
Some work has to be done to correctly delete interior snapshot nodes;
snapshot node depth and skiplist fields for descendent nodes have to be
fixed.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2023-08-17 22:10:02 -04:00
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
2024-12-12 03:03:58 -05:00
|
|
|
new->k.p.snapshot = live_child;
|
bcachefs: Cleanup redundant snapshot nodes
After deleteing snapshots, we may be left with a snapshot tree where
some nodes only have one child, and we have a linear chain.
Interior snapshot nodes are never used directly (i.e. they never have
subvolumes that point to them), they are only referered to by child
snapshot nodes - hence, they are redundant.
The existing code talks about redundant snapshot nodes as forming and
equivalence class; i.e. nodes for which snapshot_t->equiv is equal. In a
given equivalence class, we only ever need a single key at a given
position - i.e. multiple versions with different snapshot fields are
redundant.
The existing snapshot cleanup code deletes these redundant keys, but not
redundant nodes. It turns out this is buggy, because we assume that
after snapshot deletion finishes we should only have a single key per
equivalence class, but the btree update path doesn't preserve this -
overwriting keys in old snapshots doesn't check for the equivalence
class being equal, and thus we can end up with duplicate keys in the
same equivalence class and fsck complaining about snapshot deletion not
having run correctly.
The equivalence class notion has been leaking out of the core snapshots
code and into too much other code, i.e. fsck, so this patch takes a
different approach: snapshot deletion now moves keys to the node in an
equivalence class being kept (the leafiest node) and then deletes the
redundant nodes in the equivalance class.
Some work has to be done to correctly delete interior snapshot nodes;
snapshot node depth and skiplist fields for descendent nodes have to be
fixed.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2023-08-17 22:10:02 -04:00
|
|
|
|
2024-12-12 02:41:37 -05:00
|
|
|
struct btree_iter dst_iter;
|
|
|
|
struct bkey_s_c dst_k = bch2_bkey_get_iter(trans, &dst_iter,
|
|
|
|
iter->btree_id, new->k.p,
|
|
|
|
BTREE_ITER_all_snapshots|
|
|
|
|
BTREE_ITER_intent);
|
|
|
|
ret = bkey_err(dst_k);
|
bcachefs: Cleanup redundant snapshot nodes
After deleteing snapshots, we may be left with a snapshot tree where
some nodes only have one child, and we have a linear chain.
Interior snapshot nodes are never used directly (i.e. they never have
subvolumes that point to them), they are only referered to by child
snapshot nodes - hence, they are redundant.
The existing code talks about redundant snapshot nodes as forming and
equivalence class; i.e. nodes for which snapshot_t->equiv is equal. In a
given equivalence class, we only ever need a single key at a given
position - i.e. multiple versions with different snapshot fields are
redundant.
The existing snapshot cleanup code deletes these redundant keys, but not
redundant nodes. It turns out this is buggy, because we assume that
after snapshot deletion finishes we should only have a single key per
equivalence class, but the btree update path doesn't preserve this -
overwriting keys in old snapshots doesn't check for the equivalence
class being equal, and thus we can end up with duplicate keys in the
same equivalence class and fsck complaining about snapshot deletion not
having run correctly.
The equivalence class notion has been leaking out of the core snapshots
code and into too much other code, i.e. fsck, so this patch takes a
different approach: snapshot deletion now moves keys to the node in an
equivalence class being kept (the leafiest node) and then deletes the
redundant nodes in the equivalance class.
Some work has to be done to correctly delete interior snapshot nodes;
snapshot node depth and skiplist fields for descendent nodes have to be
fixed.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2023-08-17 22:10:02 -04:00
|
|
|
if (ret)
|
|
|
|
return ret;
|
2024-12-12 02:41:37 -05:00
|
|
|
|
|
|
|
ret = (bkey_deleted(dst_k.k)
|
|
|
|
? bch2_trans_update(trans, &dst_iter, new,
|
|
|
|
BTREE_UPDATE_internal_snapshot_node)
|
|
|
|
: 0) ?:
|
|
|
|
bch2_btree_delete_at(trans, iter,
|
|
|
|
BTREE_UPDATE_internal_snapshot_node);
|
|
|
|
bch2_trans_iter_exit(trans, &dst_iter);
|
|
|
|
return ret;
|
bcachefs: Cleanup redundant snapshot nodes
After deleteing snapshots, we may be left with a snapshot tree where
some nodes only have one child, and we have a linear chain.
Interior snapshot nodes are never used directly (i.e. they never have
subvolumes that point to them), they are only referered to by child
snapshot nodes - hence, they are redundant.
The existing code talks about redundant snapshot nodes as forming and
equivalence class; i.e. nodes for which snapshot_t->equiv is equal. In a
given equivalence class, we only ever need a single key at a given
position - i.e. multiple versions with different snapshot fields are
redundant.
The existing snapshot cleanup code deletes these redundant keys, but not
redundant nodes. It turns out this is buggy, because we assume that
after snapshot deletion finishes we should only have a single key per
equivalence class, but the btree update path doesn't preserve this -
overwriting keys in old snapshots doesn't check for the equivalence
class being equal, and thus we can end up with duplicate keys in the
same equivalence class and fsck complaining about snapshot deletion not
having run correctly.
The equivalence class notion has been leaking out of the core snapshots
code and into too much other code, i.e. fsck, so this patch takes a
different approach: snapshot deletion now moves keys to the node in an
equivalence class being kept (the leafiest node) and then deletes the
redundant nodes in the equivalance class.
Some work has to be done to correctly delete interior snapshot nodes;
snapshot node depth and skiplist fields for descendent nodes have to be
fixed.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2023-08-17 22:10:02 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2025-05-02 14:43:45 -04:00
|
|
|
/*
 * When we move to a new inode number, check whether its snapshot belongs to a
 * tree we're deleting from; if not, advance @iter past the whole inode and
 * return true so the caller skips it. @prev_inum caches the last inode number
 * checked, so the tree lookup only happens once per inode.
 */
static bool skip_unrelated_snapshot_tree(struct btree_trans *trans, struct btree_iter *iter, u64 *prev_inum)
{
	struct bch_fs *c = trans->c;
	struct snapshot_delete *d = &c->snapshot_delete;

	/* The inodes btree is keyed by (0, inum); other btrees by (inum, offset): */
	u64 inum = iter->btree_id == BTREE_ID_inodes
		? iter->pos.offset
		: iter->pos.inode;

	if (*prev_inum == inum)
		return false;

	*prev_inum = inum;

	bool skip = !snapshot_list_has_id(&d->deleting_from_trees,
					  bch2_snapshot_tree(c, iter->pos.snapshot));
	if (unlikely(skip)) {
		/* Jump straight to the next inode number: */
		struct bpos pos = iter->pos;
		pos.snapshot = 0;
		if (iter->btree_id != BTREE_ID_inodes)
			pos.offset = U64_MAX;
		bch2_btree_iter_set_pos(trans, iter, bpos_nosnap_successor(pos));
	}

	return skip;
}
|
|
|
|
|
2025-05-02 13:23:22 -04:00
|
|
|
static int delete_dead_snapshot_keys_v1(struct btree_trans *trans)
|
|
|
|
{
|
|
|
|
struct bch_fs *c = trans->c;
|
|
|
|
struct snapshot_delete *d = &c->snapshot_delete;
|
|
|
|
|
|
|
|
for (d->pos.btree = 0; d->pos.btree < BTREE_ID_NR; d->pos.btree++) {
|
|
|
|
struct disk_reservation res = { 0 };
|
|
|
|
u64 prev_inum = 0;
|
|
|
|
|
|
|
|
d->pos.pos = POS_MIN;
|
|
|
|
|
|
|
|
if (!btree_type_has_snapshots(d->pos.btree))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
int ret = for_each_btree_key_commit(trans, iter,
|
|
|
|
d->pos.btree, POS_MIN,
|
|
|
|
BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
|
|
|
|
&res, NULL, BCH_TRANS_COMMIT_no_enospc, ({
|
|
|
|
d->pos.pos = iter.pos;
|
|
|
|
|
|
|
|
if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
delete_dead_snapshots_process_key(trans, &iter, k);
|
|
|
|
}));
|
|
|
|
|
|
|
|
bch2_disk_reservation_put(c, &res);
|
|
|
|
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int delete_dead_snapshot_keys_range(struct btree_trans *trans, enum btree_id btree,
|
|
|
|
struct bpos start, struct bpos end)
|
|
|
|
{
|
|
|
|
struct bch_fs *c = trans->c;
|
|
|
|
struct snapshot_delete *d = &c->snapshot_delete;
|
|
|
|
struct disk_reservation res = { 0 };
|
|
|
|
|
|
|
|
d->pos.btree = btree;
|
|
|
|
d->pos.pos = POS_MIN;
|
|
|
|
|
|
|
|
int ret = for_each_btree_key_max_commit(trans, iter,
|
|
|
|
btree, start, end,
|
|
|
|
BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
|
|
|
|
&res, NULL, BCH_TRANS_COMMIT_no_enospc, ({
|
|
|
|
d->pos.pos = iter.pos;
|
|
|
|
delete_dead_snapshots_process_key(trans, &iter, k);
|
|
|
|
}));
|
|
|
|
|
|
|
|
bch2_disk_reservation_put(c, &res);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int delete_dead_snapshot_keys_v2(struct btree_trans *trans)
|
|
|
|
{
|
|
|
|
struct bch_fs *c = trans->c;
|
|
|
|
struct snapshot_delete *d = &c->snapshot_delete;
|
|
|
|
struct disk_reservation res = { 0 };
|
|
|
|
u64 prev_inum = 0;
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
struct btree_iter iter;
|
|
|
|
bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, POS_MIN,
|
|
|
|
BTREE_ITER_prefetch|BTREE_ITER_all_snapshots);
|
|
|
|
|
|
|
|
while (1) {
|
|
|
|
struct bkey_s_c k;
|
|
|
|
ret = lockrestart_do(trans,
|
|
|
|
bkey_err(k = bch2_btree_iter_peek(trans, &iter)));
|
|
|
|
if (ret)
|
|
|
|
break;
|
|
|
|
|
|
|
|
if (!k.k)
|
|
|
|
break;
|
|
|
|
|
|
|
|
d->pos.btree = iter.btree_id;
|
|
|
|
d->pos.pos = iter.pos;
|
|
|
|
|
|
|
|
if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if (snapshot_id_dying(d, k.k->p.snapshot)) {
|
|
|
|
struct bpos start = POS(k.k->p.offset, 0);
|
|
|
|
struct bpos end = POS(k.k->p.offset, U64_MAX);
|
|
|
|
|
|
|
|
ret = delete_dead_snapshot_keys_range(trans, BTREE_ID_extents, start, end) ?:
|
|
|
|
delete_dead_snapshot_keys_range(trans, BTREE_ID_dirents, start, end) ?:
|
|
|
|
delete_dead_snapshot_keys_range(trans, BTREE_ID_xattrs, start, end);
|
|
|
|
if (ret)
|
|
|
|
break;
|
|
|
|
|
|
|
|
bch2_btree_iter_set_pos(trans, &iter, POS(0, k.k->p.offset + 1));
|
|
|
|
} else {
|
|
|
|
bch2_btree_iter_advance(trans, &iter);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
bch2_trans_iter_exit(trans, &iter);
|
|
|
|
|
|
|
|
if (ret)
|
|
|
|
goto err;
|
|
|
|
|
|
|
|
prev_inum = 0;
|
|
|
|
ret = for_each_btree_key_commit(trans, iter,
|
|
|
|
BTREE_ID_inodes, POS_MIN,
|
|
|
|
BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
|
|
|
|
&res, NULL, BCH_TRANS_COMMIT_no_enospc, ({
|
|
|
|
d->pos.btree = iter.btree_id;
|
|
|
|
d->pos.pos = iter.pos;
|
|
|
|
|
|
|
|
if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
delete_dead_snapshots_process_key(trans, &iter, k);
|
|
|
|
}));
|
|
|
|
err:
|
|
|
|
bch2_disk_reservation_put(c, &res);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2024-12-12 03:03:58 -05:00
|
|
|
/*
|
|
|
|
* For a given snapshot, if it doesn't have a subvolume that points to it, and
|
|
|
|
* it doesn't have child snapshot nodes - it's now redundant and we can mark it
|
|
|
|
* as deleted.
|
|
|
|
*/
|
2025-05-01 14:47:39 -04:00
|
|
|
static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s_c k)
{
	if (k.k->type != KEY_TYPE_snapshot)
		return 0;

	struct bch_fs *c = trans->c;
	struct snapshot_delete *d = &c->snapshot_delete;
	struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
	unsigned live_children = 0;
	int ret = 0;

	/* Still referenced by a subvolume: not deletable */
	if (BCH_SNAPSHOT_SUBVOL(s.v))
		return 0;

	/* Already marked deleted: nothing to do */
	if (BCH_SNAPSHOT_DELETED(s.v))
		return 0;

	/*
	 * progress_lock protects d's delete lists; count children that will
	 * survive this deletion pass (i.e. aren't already queued as leaves):
	 */
	mutex_lock(&d->progress_lock);
	for (unsigned i = 0; i < 2; i++) {
		u32 child = le32_to_cpu(s.v->children[i]);

		live_children += child &&
			!snapshot_list_has_id(&d->delete_leaves, child);
	}

	u32 tree = bch2_snapshot_tree(c, s.k->p.offset);

	if (live_children == 0) {
		/* No live children: delete this node as a leaf */
		ret = snapshot_list_add_nodup(c, &d->deleting_from_trees, tree) ?:
			snapshot_list_add(c, &d->delete_leaves, s.k->p.offset);
	} else if (live_children == 1) {
		/*
		 * Exactly one live child: this interior node is redundant;
		 * queue it for interior deletion, remembering which child
		 * its keys should be moved to:
		 */
		struct snapshot_interior_delete n = {
			.id		= s.k->p.offset,
			.live_child	= live_child(c, s.k->p.offset),
		};

		if (!n.live_child) {
			bch_err(c, "error finding live child of snapshot %u", n.id);
			ret = -EINVAL;
		} else {
			ret = snapshot_list_add_nodup(c, &d->deleting_from_trees, tree) ?:
				darray_push(&d->delete_interior, n);
		}
	}
	/* live_children == 2: node is still needed, leave it alone */
	mutex_unlock(&d->progress_lock);

	return ret;
}
|
|
|
|
|
bcachefs: Cleanup redundant snapshot nodes
After deleteing snapshots, we may be left with a snapshot tree where
some nodes only have one child, and we have a linear chain.
Interior snapshot nodes are never used directly (i.e. they never have
subvolumes that point to them), they are only referered to by child
snapshot nodes - hence, they are redundant.
The existing code talks about redundant snapshot nodes as forming and
equivalence class; i.e. nodes for which snapshot_t->equiv is equal. In a
given equivalence class, we only ever need a single key at a given
position - i.e. multiple versions with different snapshot fields are
redundant.
The existing snapshot cleanup code deletes these redundant keys, but not
redundant nodes. It turns out this is buggy, because we assume that
after snapshot deletion finishes we should only have a single key per
equivalence class, but the btree update path doesn't preserve this -
overwriting keys in old snapshots doesn't check for the equivalence
class being equal, and thus we can end up with duplicate keys in the
same equivalence class and fsck complaining about snapshot deletion not
having run correctly.
The equivalence class notion has been leaking out of the core snapshots
code and into too much other code, i.e. fsck, so this patch takes a
different approach: snapshot deletion now moves keys to the node in an
equivalence class being kept (the leafiest node) and then deletes the
redundant nodes in the equivalance class.
Some work has to be done to correctly delete interior snapshot nodes;
snapshot node depth and skiplist fields for descendent nodes have to be
fixed.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2023-08-17 22:10:02 -04:00
|
|
|
static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n,
|
2024-12-12 03:03:58 -05:00
|
|
|
interior_delete_list *skip)
|
bcachefs: Cleanup redundant snapshot nodes
After deleteing snapshots, we may be left with a snapshot tree where
some nodes only have one child, and we have a linear chain.
Interior snapshot nodes are never used directly (i.e. they never have
subvolumes that point to them), they are only referered to by child
snapshot nodes - hence, they are redundant.
The existing code talks about redundant snapshot nodes as forming and
equivalence class; i.e. nodes for which snapshot_t->equiv is equal. In a
given equivalence class, we only ever need a single key at a given
position - i.e. multiple versions with different snapshot fields are
redundant.
The existing snapshot cleanup code deletes these redundant keys, but not
redundant nodes. It turns out this is buggy, because we assume that
after snapshot deletion finishes we should only have a single key per
equivalence class, but the btree update path doesn't preserve this -
overwriting keys in old snapshots doesn't check for the equivalence
class being equal, and thus we can end up with duplicate keys in the
same equivalence class and fsck complaining about snapshot deletion not
having run correctly.
The equivalence class notion has been leaking out of the core snapshots
code and into too much other code, i.e. fsck, so this patch takes a
different approach: snapshot deletion now moves keys to the node in an
equivalence class being kept (the leafiest node) and then deletes the
redundant nodes in the equivalance class.
Some work has to be done to correctly delete interior snapshot nodes;
snapshot node depth and skiplist fields for descendent nodes have to be
fixed.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2023-08-17 22:10:02 -04:00
|
|
|
{
|
2025-05-24 16:33:39 -04:00
|
|
|
guard(rcu)();
|
2024-12-12 03:03:58 -05:00
|
|
|
while (interior_delete_has_id(skip, id))
|
2023-09-29 00:46:30 -04:00
|
|
|
id = __bch2_snapshot_parent(c, id);
|
|
|
|
|
bcachefs: Cleanup redundant snapshot nodes
After deleteing snapshots, we may be left with a snapshot tree where
some nodes only have one child, and we have a linear chain.
Interior snapshot nodes are never used directly (i.e. they never have
subvolumes that point to them), they are only referered to by child
snapshot nodes - hence, they are redundant.
The existing code talks about redundant snapshot nodes as forming and
equivalence class; i.e. nodes for which snapshot_t->equiv is equal. In a
given equivalence class, we only ever need a single key at a given
position - i.e. multiple versions with different snapshot fields are
redundant.
The existing snapshot cleanup code deletes these redundant keys, but not
redundant nodes. It turns out this is buggy, because we assume that
after snapshot deletion finishes we should only have a single key per
equivalence class, but the btree update path doesn't preserve this -
overwriting keys in old snapshots doesn't check for the equivalence
class being equal, and thus we can end up with duplicate keys in the
same equivalence class and fsck complaining about snapshot deletion not
having run correctly.
The equivalence class notion has been leaking out of the core snapshots
code and into too much other code, i.e. fsck, so this patch takes a
different approach: snapshot deletion now moves keys to the node in an
equivalence class being kept (the leafiest node) and then deletes the
redundant nodes in the equivalance class.
Some work has to be done to correctly delete interior snapshot nodes;
snapshot node depth and skiplist fields for descendent nodes have to be
fixed.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2023-08-17 22:10:02 -04:00
|
|
|
while (n--) {
|
|
|
|
do {
|
|
|
|
id = __bch2_snapshot_parent(c, id);
|
2024-12-12 03:03:58 -05:00
|
|
|
} while (interior_delete_has_id(skip, id));
|
bcachefs: Cleanup redundant snapshot nodes
After deleteing snapshots, we may be left with a snapshot tree where
some nodes only have one child, and we have a linear chain.
Interior snapshot nodes are never used directly (i.e. they never have
subvolumes that point to them), they are only referered to by child
snapshot nodes - hence, they are redundant.
The existing code talks about redundant snapshot nodes as forming and
equivalence class; i.e. nodes for which snapshot_t->equiv is equal. In a
given equivalence class, we only ever need a single key at a given
position - i.e. multiple versions with different snapshot fields are
redundant.
The existing snapshot cleanup code deletes these redundant keys, but not
redundant nodes. It turns out this is buggy, because we assume that
after snapshot deletion finishes we should only have a single key per
equivalence class, but the btree update path doesn't preserve this -
overwriting keys in old snapshots doesn't check for the equivalence
class being equal, and thus we can end up with duplicate keys in the
same equivalence class and fsck complaining about snapshot deletion not
having run correctly.
The equivalence class notion has been leaking out of the core snapshots
code and into too much other code, i.e. fsck, so this patch takes a
different approach: snapshot deletion now moves keys to the node in an
equivalence class being kept (the leafiest node) and then deletes the
redundant nodes in the equivalance class.
Some work has to be done to correctly delete interior snapshot nodes;
snapshot node depth and skiplist fields for descendent nodes have to be
fixed.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2023-08-17 22:10:02 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
return id;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
					      struct btree_iter *iter, struct bkey_s_c k,
					      interior_delete_list *deleted)
{
	struct bch_fs *c = trans->c;
	u32 nr_deleted_ancestors = 0;
	struct bkey_i_snapshot *s;
	int ret;

	if (!bch2_snapshot_exists(c, k.k->p.offset))
		return 0;

	if (k.k->type != KEY_TYPE_snapshot)
		return 0;

	/* Nodes being deleted themselves don't need fixing: */
	if (interior_delete_has_id(deleted, k.k->p.offset))
		return 0;

	s = bch2_bkey_make_mut_noupdate_typed(trans, k, snapshot);
	ret = PTR_ERR_OR_ZERO(s);
	if (ret)
		return ret;

	/* Count how many of this node's ancestors are going away: */
	darray_for_each(*deleted, i)
		nr_deleted_ancestors += bch2_snapshot_is_ancestor(c, s->k.p.offset, i->id);

	if (!nr_deleted_ancestors)
		return 0;

	/* Depth shrinks by the number of deleted ancestors: */
	le32_add_cpu(&s->v.depth, -nr_deleted_ancestors);

	if (!s->v.depth) {
		/* Now a root: no ancestors, so no skiplist entries */
		s->v.skip[0] = 0;
		s->v.skip[1] = 0;
		s->v.skip[2] = 0;
	} else {
		u32 depth = le32_to_cpu(s->v.depth);
		u32 parent = bch2_snapshot_parent(c, s->k.p.offset);

		/*
		 * Re-pick any skiplist entry that points at a deleted node;
		 * skiplist targets are chosen randomly among live ancestors:
		 */
		for (unsigned j = 0; j < ARRAY_SIZE(s->v.skip); j++) {
			u32 id = le32_to_cpu(s->v.skip[j]);

			if (interior_delete_has_id(deleted, id)) {
				id = bch2_snapshot_nth_parent_skip(c,
							parent,
							depth > 1
							? get_random_u32_below(depth - 1)
							: 0,
							deleted);
				s->v.skip[j] = cpu_to_le32(id);
			}
		}

		/* Skiplist entries are kept sorted: */
		bubble_sort(s->v.skip, ARRAY_SIZE(s->v.skip), cmp_le32);
	}

	return bch2_trans_update(trans, iter, &s->k_i, 0);
}
|
|
|
|
|
2025-05-01 14:47:39 -04:00
|
|
|
static void bch2_snapshot_delete_nodes_to_text(struct printbuf *out, struct snapshot_delete *d)
|
|
|
|
{
|
2025-05-02 14:43:45 -04:00
|
|
|
prt_printf(out, "deleting from trees");
|
|
|
|
darray_for_each(d->deleting_from_trees, i)
|
|
|
|
prt_printf(out, " %u", *i);
|
|
|
|
|
2025-05-01 14:47:39 -04:00
|
|
|
prt_printf(out, "deleting leaves");
|
|
|
|
darray_for_each(d->delete_leaves, i)
|
|
|
|
prt_printf(out, " %u", *i);
|
|
|
|
prt_newline(out);
|
|
|
|
|
|
|
|
prt_printf(out, "interior");
|
|
|
|
darray_for_each(d->delete_interior, i)
|
|
|
|
prt_printf(out, " %u->%u", i->id, i->live_child);
|
|
|
|
prt_newline(out);
|
|
|
|
}
|
|
|
|
|
2025-05-07 14:26:18 -04:00
|
|
|
int __bch2_delete_dead_snapshots(struct bch_fs *c)
{
	struct snapshot_delete *d = &c->snapshot_delete;
	int ret = 0;

	/* Another deletion pass already running: let it do the work */
	if (!mutex_trylock(&d->lock))
		return 0;

	if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags))
		goto out_unlock;

	struct btree_trans *trans = bch2_trans_get(c);

	/*
	 * For every snapshot node: If we have no live children and it's not
	 * pointed to by a subvolume, delete it:
	 */
	d->running = true;
	d->pos = BBPOS_MIN;

	ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k,
		check_should_delete_snapshot(trans, k));
	if (!bch2_err_matches(ret, EROFS))
		bch_err_msg(c, ret, "walking snapshots");
	if (ret)
		goto err;

	/* Nothing queued for deletion: done */
	if (!d->delete_leaves.nr && !d->delete_interior.nr)
		goto err;

	/* Log what we're about to delete to the journal, for debugging: */
	{
		struct printbuf buf = PRINTBUF;
		bch2_snapshot_delete_nodes_to_text(&buf, d);

		ret = commit_do(trans, NULL, NULL, 0, bch2_trans_log_msg(trans, &buf));
		printbuf_exit(&buf);
		if (ret)
			goto err;
	}

	/* Prefer the v2 key-deletion path if the incompat feature is allowed: */
	ret = !bch2_request_incompat_feature(c, bcachefs_metadata_version_snapshot_deletion_v2)
		? delete_dead_snapshot_keys_v2(trans)
		: delete_dead_snapshot_keys_v1(trans);
	if (!bch2_err_matches(ret, EROFS))
		bch_err_msg(c, ret, "deleting keys from dying snapshots");
	if (ret)
		goto err;

	/* Delete leaf nodes first; they have no keys or children left: */
	darray_for_each(d->delete_leaves, i) {
		ret = commit_do(trans, NULL, NULL, 0,
			bch2_snapshot_node_delete(trans, *i));
		if (!bch2_err_matches(ret, EROFS))
			bch_err_msg(c, ret, "deleting snapshot %u", *i);
		if (ret)
			goto err;
	}

	/*
	 * Fixing children of deleted snapshots can't be done completely
	 * atomically, if we crash between here and when we delete the interior
	 * nodes some depth fields will be off:
	 */
	ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN,
				  BTREE_ITER_intent, k,
				  NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
		bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &d->delete_interior));
	if (ret)
		goto err;

	darray_for_each(d->delete_interior, i) {
		ret = commit_do(trans, NULL, NULL, 0,
				bch2_snapshot_node_delete(trans, i->id));
		if (!bch2_err_matches(ret, EROFS))
			bch_err_msg(c, ret, "deleting snapshot %u", i->id);
		if (ret)
			goto err;
	}
err:
	/* Tear down progress state under progress_lock so status readers see
	 * a consistent view: */
	mutex_lock(&d->progress_lock);
	darray_exit(&d->deleting_from_trees);
	darray_exit(&d->delete_interior);
	darray_exit(&d->delete_leaves);
	d->running = false;
	mutex_unlock(&d->progress_lock);
	bch2_trans_put(trans);

	bch2_recovery_pass_set_no_ratelimit(c, BCH_RECOVERY_PASS_check_snapshots);
out_unlock:
	mutex_unlock(&d->lock);
	if (!bch2_err_matches(ret, EROFS))
		bch_err_fn(c, ret);
	return ret;
}
|
|
|
|
|
2025-05-07 14:26:18 -04:00
|
|
|
int bch2_delete_dead_snapshots(struct bch_fs *c)
|
|
|
|
{
|
|
|
|
if (!c->opts.auto_snapshot_deletion)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
return __bch2_delete_dead_snapshots(c);
|
|
|
|
}
|
|
|
|
|
2023-08-16 16:54:33 -04:00
|
|
|
/* Workqueue entry point: runs a deletion pass, then drops the write ref taken
 * by bch2_delete_dead_snapshots_async(). */
void bch2_delete_dead_snapshots_work(struct work_struct *work)
{
	struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete.work);

	set_worker_desc("bcachefs-delete-dead-snapshots/%s", c->name);

	bch2_delete_dead_snapshots(c);
	enumerated_ref_put(&c->writes, BCH_WRITE_REF_delete_dead_snapshots);
}
|
|
|
|
|
|
|
|
/* Kick off a dead-snapshot deletion pass on the system_long workqueue. */
void bch2_delete_dead_snapshots_async(struct bch_fs *c)
{
	if (!c->opts.auto_snapshot_deletion)
		return;

	/* Hold a write ref for the duration of the work item: */
	if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_delete_dead_snapshots))
		return;

	BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags));

	/* queue_work() returning false means already queued: drop our ref,
	 * the queued work item still holds one */
	if (!queue_work(system_long_wq, &c->snapshot_delete.work))
		enumerated_ref_put(&c->writes, BCH_WRITE_REF_delete_dead_snapshots);
}
|
|
|
|
|
2025-05-01 14:47:39 -04:00
|
|
|
/* Report the progress of an in-flight snapshot deletion pass (for sysfs/debug). */
void bch2_snapshot_delete_status_to_text(struct printbuf *out, struct bch_fs *c)
{
	struct snapshot_delete *d = &c->snapshot_delete;

	if (!d->running) {
		prt_str(out, "(not running)");
		return;
	}

	/* progress_lock keeps the delete lists and pos consistent with the
	 * running pass: */
	mutex_lock(&d->progress_lock);
	bch2_snapshot_delete_nodes_to_text(out, d);

	bch2_bbpos_to_text(out, d->pos);
	mutex_unlock(&d->progress_lock);
}
|
|
|
|
|
2023-08-18 21:13:44 -04:00
|
|
|
/*
 * Check whether the key at @pos in btree @id has been overwritten in a
 * snapshot related to @pos.snapshot - i.e. whether other snapshot versions
 * of the same key exist.
 *
 * Scans backwards from bpos_predecessor(@pos): with BTREE_ITER_all_snapshots,
 * versions of a key in different snapshots sort adjacently (snapshot is the
 * low-order field of bpos), so the scan stops as soon as the non-snapshot
 * part of the position no longer matches.
 *
 * NOTE(review): this only finds versions with snapshot IDs below
 * @pos.snapshot; relies on the snapshot ID allocation invariant making those
 * the relevant ones - confirm against the is_ancestor semantics in
 * snapshot.h.
 *
 * Returns: 1 if such an overwrite exists, 0 if not, or a negative error code
 * (ret is initialized by the iteration macro).
 */
int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
				       enum btree_id id,
				       struct bpos pos)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	for_each_btree_key_reverse_norestart(trans, iter, id, bpos_predecessor(pos),
					     BTREE_ITER_not_extents|
					     BTREE_ITER_all_snapshots,
					     k, ret) {
		/* Different inode:offset - no more versions of this key: */
		if (!bkey_eq(pos, k.k->p))
			break;

		if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) {
			ret = 1;
			break;
		}
	}
	bch2_trans_iter_exit(trans, &iter);

	return ret;
}
|
|
|
|
|
2024-12-12 03:03:58 -05:00
|
|
|
/*
 * An interior snapshot node with exactly one child is redundant: its keys
 * will be moved to that child and the node itself deleted.
 */
static bool interior_snapshot_needs_delete(struct bkey_s_c_snapshot snap)
{
	bool has_child0 = snap.v->children[0] != 0;
	bool has_child1 = snap.v->children[1] != 0;

	/* exactly one child set: */
	return has_child0 != has_child1;
}
|
2023-10-19 21:25:04 -04:00
|
|
|
|
2024-12-12 03:03:58 -05:00
|
|
|
static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct bkey_s_c k)
|
|
|
|
{
|
2023-10-19 21:25:04 -04:00
|
|
|
if (k.k->type != KEY_TYPE_snapshot)
|
|
|
|
return 0;
|
|
|
|
|
2024-12-12 03:03:58 -05:00
|
|
|
struct bkey_s_c_snapshot snap = bkey_s_c_to_snapshot(k);
|
2025-05-02 12:33:17 -04:00
|
|
|
if (BCH_SNAPSHOT_WILL_DELETE(snap.v) ||
|
2024-12-12 03:03:58 -05:00
|
|
|
interior_snapshot_needs_delete(snap))
|
|
|
|
set_bit(BCH_FS_need_delete_dead_snapshots, &trans->c->flags);
|
2023-10-19 21:25:04 -04:00
|
|
|
|
2024-12-12 03:03:58 -05:00
|
|
|
return 0;
|
2023-10-19 21:25:04 -04:00
|
|
|
}
|
|
|
|
|
2023-08-16 16:54:33 -04:00
|
|
|
/*
 * Read the snapshots btree at startup: mark each snapshot node in memory
 * (building the in-memory snapshot table) and note whether the dead snapshot
 * deletion pass needs to run.
 *
 * Must be called before the filesystem goes read-write - enforced by the
 * BUG_ON below.
 *
 * Returns: 0 on success, negative error code on failure.
 */
int bch2_snapshots_read(struct bch_fs *c)
{
	/*
	 * Initializing the is_ancestor bitmaps requires ancestors to already be
	 * initialized - so mark in reverse:
	 */
	int ret = bch2_trans_run(c,
		for_each_btree_key_reverse(trans, iter, BTREE_ID_snapshots,
					   POS_MAX, 0, k,
			__bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?:
			bch2_check_snapshot_needs_deletion(trans, k)));
	bch_err_fn(c, ret);

	/*
	 * It's important that we check if we need to reconstruct snapshots
	 * before going RW, so we mark that pass as required in the superblock -
	 * otherwise, we could end up deleting keys with missing snapshot nodes
	 * instead
	 */
	BUG_ON(!test_bit(BCH_FS_new_fs, &c->flags) &&
	       test_bit(BCH_FS_may_go_rw, &c->flags));

	return ret;
}
|
|
|
|
|
|
|
|
/*
 * Free the in-memory snapshot table at filesystem shutdown.
 *
 * The unconditional-true argument to rcu_dereference_protected() asserts
 * that no RCU readers can still be accessing c->snapshots at this point.
 */
void bch2_fs_snapshots_exit(struct bch_fs *c)
{
	kvfree(rcu_dereference_protected(c->snapshots, true));
}
|
2025-05-01 14:47:39 -04:00
|
|
|
|
|
|
|
void bch2_fs_snapshots_init_early(struct bch_fs *c)
|
|
|
|
{
|
|
|
|
INIT_WORK(&c->snapshot_delete.work, bch2_delete_dead_snapshots_work);
|
2025-05-07 14:26:18 -04:00
|
|
|
mutex_init(&c->snapshot_delete.lock);
|
2025-05-01 14:47:39 -04:00
|
|
|
mutex_init(&c->snapshot_delete.progress_lock);
|
|
|
|
mutex_init(&c->snapshots_unlinked_lock);
|
|
|
|
}
|