2017-03-16 22:18:50 -08:00
|
|
|
// SPDX-License-Identifier: GPL-2.0
|
|
|
|
|
|
|
|
#include "bcachefs.h"
|
2019-09-22 19:10:21 -04:00
|
|
|
#include "btree_key_cache.h"
|
2023-07-17 00:56:29 -04:00
|
|
|
#include "btree_write_buffer.h"
|
2017-03-16 22:18:50 -08:00
|
|
|
#include "bkey_methods.h"
|
|
|
|
#include "btree_update.h"
|
2021-12-22 22:39:50 -05:00
|
|
|
#include "buckets.h"
|
2023-10-22 18:29:54 -04:00
|
|
|
#include "compress.h"
|
2023-12-07 12:39:13 -05:00
|
|
|
#include "dirent.h"
|
2023-11-09 14:22:46 -05:00
|
|
|
#include "disk_accounting.h"
|
2017-03-16 22:18:50 -08:00
|
|
|
#include "error.h"
|
|
|
|
#include "extents.h"
|
2021-03-16 00:28:17 -04:00
|
|
|
#include "extent_update.h"
|
bcachefs: bcachefs_metadata_version_inode_has_child_snapshots
There's an inherent race in taking a snapshot while an unlinked file is
open, and then reattaching it in the child snapshot.
In the interior snapshot node the file will appear unlinked, as though
it should be deleted - it's not referenced by anything in that snapshot
- but we can't delete it, because the file data is referenced by the
child snapshot.
This was being handled incorrectly with
propagate_key_to_snapshot_leaves() - but that doesn't resolve the
fundamental inconsistency of "this file looks like it should be deleted
according to normal rules, but - ".
To fix this, we need to fix the rule for when an inode is deleted. The
previous rule, ignoring snapshots (there was no well-defined rule
for inodes with snapshots) was:
Unlinked, non open files are deleted, either at recovery time or
during online fsck
The new rule is:
Unlinked, non open files, that do not exist in child snapshots, are
deleted.
To make this work transactionally, we add a new inode flag,
BCH_INODE_has_child_snapshot; it overrides BCH_INODE_unlinked when
considering whether to delete an inode, or put it on the deleted list.
For transactional consistency, clearing it is handled by the inode trigger:
when deleting an inode we check if there are parent inodes which can now
have the BCH_INODE_has_child_snapshot flag cleared.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2024-09-29 22:11:37 -04:00
|
|
|
#include "fs.h"
|
2017-03-16 22:18:50 -08:00
|
|
|
#include "inode.h"
|
2019-10-04 15:58:43 -04:00
|
|
|
#include "str_hash.h"
|
2023-08-16 16:54:33 -04:00
|
|
|
#include "snapshot.h"
|
2021-03-16 00:42:25 -04:00
|
|
|
#include "subvolume.h"
|
2020-11-05 23:39:33 -05:00
|
|
|
#include "varint.h"
|
2017-03-16 22:18:50 -08:00
|
|
|
|
|
|
|
#include <linux/random.h>
|
|
|
|
|
2024-10-01 15:35:57 -04:00
|
|
|
#include <linux/unaligned.h>
|
2017-03-16 22:18:50 -08:00
|
|
|
|
2018-12-17 05:31:09 -05:00
|
|
|
#define x(name, ...) #name,
|
2023-11-02 11:42:48 -04:00
|
|
|
const char * const bch2_inode_opts[] = {
|
2018-12-17 05:31:09 -05:00
|
|
|
BCH_INODE_OPTS()
|
|
|
|
NULL,
|
|
|
|
};
|
2017-03-16 22:18:50 -08:00
|
|
|
|
2023-11-02 11:42:48 -04:00
|
|
|
static const char * const bch2_inode_flag_strs[] = {
|
|
|
|
BCH_INODE_FLAGS()
|
|
|
|
NULL
|
|
|
|
};
|
|
|
|
#undef x
|
|
|
|
|
bcachefs: bcachefs_metadata_version_inode_has_child_snapshots
There's an inherent race in taking a snapshot while an unlinked file is
open, and then reattaching it in the child snapshot.
In the interior snapshot node the file will appear unlinked, as though
it should be deleted - it's not referenced by anything in that snapshot
- but we can't delete it, because the file data is referenced by the
child snapshot.
This was being handled incorrectly with
propagate_key_to_snapshot_leaves() - but that doesn't resolve the
fundamental inconsistency of "this file looks like it should be deleted
according to normal rules, but - ".
To fix this, we need to fix the rule for when an inode is deleted. The
previous rule, ignoring snapshots (there was no well-defined rule
for inodes with snapshots) was:
Unlinked, non open files are deleted, either at recovery time or
during online fsck
The new rule is:
Unlinked, non open files, that do not exist in child snapshots, are
deleted.
To make this work transactionally, we add a new inode flag,
BCH_INODE_has_child_snapshot; it overrides BCH_INODE_unlinked when
considering whether to delete an inode, or put it on the deleted list.
For transactional consistency, clearing it is handled by the inode trigger:
when deleting an inode we check if there are parent inodes which can now
have the BCH_INODE_has_child_snapshot flag cleared.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2024-09-29 22:11:37 -04:00
|
|
|
static int delete_ancestor_snapshot_inodes(struct btree_trans *, struct bpos);
|
|
|
|
|
2017-03-16 22:18:50 -08:00
|
|
|
static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 };
|
|
|
|
|
|
|
|
static int inode_decode_field(const u8 *in, const u8 *end,
|
|
|
|
u64 out[2], unsigned *out_bits)
|
|
|
|
{
|
|
|
|
__be64 be[2] = { 0, 0 };
|
|
|
|
unsigned bytes, shift;
|
|
|
|
u8 *p;
|
|
|
|
|
|
|
|
if (in >= end)
|
|
|
|
return -1;
|
|
|
|
|
|
|
|
if (!*in)
|
|
|
|
return -1;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* position of highest set bit indicates number of bytes:
|
|
|
|
* shift = number of bits to remove in high byte:
|
|
|
|
*/
|
|
|
|
shift = 8 - __fls(*in); /* 1 <= shift <= 8 */
|
|
|
|
bytes = byte_table[shift - 1];
|
|
|
|
|
|
|
|
if (in + bytes > end)
|
|
|
|
return -1;
|
|
|
|
|
|
|
|
p = (u8 *) be + 16 - bytes;
|
|
|
|
memcpy(p, in, bytes);
|
|
|
|
*p ^= (1 << 8) >> shift;
|
|
|
|
|
|
|
|
out[0] = be64_to_cpu(be[0]);
|
|
|
|
out[1] = be64_to_cpu(be[1]);
|
|
|
|
*out_bits = out[0] ? 64 + fls64(out[0]) : fls64(out[1]);
|
|
|
|
|
|
|
|
return bytes;
|
|
|
|
}
|
|
|
|
|
2022-10-21 13:21:03 -04:00
|
|
|
static inline void bch2_inode_pack_inlined(struct bkey_inode_buf *packed,
|
2022-10-17 07:09:02 -04:00
|
|
|
const struct bch_inode_unpacked *inode)
|
2020-11-05 23:39:33 -05:00
|
|
|
{
|
2022-10-21 13:21:03 -04:00
|
|
|
struct bkey_i_inode_v3 *k = &packed->inode;
|
2020-11-05 23:39:33 -05:00
|
|
|
u8 *out = k->v.fields;
|
|
|
|
u8 *end = (void *) &packed[1];
|
|
|
|
u8 *last_nonzero_field = out;
|
|
|
|
unsigned nr_fields = 0, last_nonzero_fieldnr = 0;
|
|
|
|
unsigned bytes;
|
|
|
|
int ret;
|
|
|
|
|
2022-10-21 13:21:03 -04:00
|
|
|
bkey_inode_v3_init(&packed->inode.k_i);
|
2021-10-29 21:14:23 -04:00
|
|
|
packed->inode.k.p.offset = inode->bi_inum;
|
|
|
|
packed->inode.v.bi_journal_seq = cpu_to_le64(inode->bi_journal_seq);
|
|
|
|
packed->inode.v.bi_hash_seed = inode->bi_hash_seed;
|
|
|
|
packed->inode.v.bi_flags = cpu_to_le64(inode->bi_flags);
|
2022-10-21 13:21:03 -04:00
|
|
|
packed->inode.v.bi_sectors = cpu_to_le64(inode->bi_sectors);
|
|
|
|
packed->inode.v.bi_size = cpu_to_le64(inode->bi_size);
|
|
|
|
packed->inode.v.bi_version = cpu_to_le64(inode->bi_version);
|
|
|
|
SET_INODEv3_MODE(&packed->inode.v, inode->bi_mode);
|
|
|
|
SET_INODEv3_FIELDS_START(&packed->inode.v, INODEv3_FIELDS_START_CUR);
|
|
|
|
|
2021-10-29 21:14:23 -04:00
|
|
|
|
2020-11-05 23:39:33 -05:00
|
|
|
#define x(_name, _bits) \
|
|
|
|
nr_fields++; \
|
|
|
|
\
|
|
|
|
if (inode->_name) { \
|
2021-07-13 16:03:51 -04:00
|
|
|
ret = bch2_varint_encode_fast(out, inode->_name); \
|
2020-11-05 23:39:33 -05:00
|
|
|
out += ret; \
|
|
|
|
\
|
|
|
|
if (_bits > 64) \
|
|
|
|
*out++ = 0; \
|
|
|
|
\
|
|
|
|
last_nonzero_field = out; \
|
|
|
|
last_nonzero_fieldnr = nr_fields; \
|
|
|
|
} else { \
|
|
|
|
*out++ = 0; \
|
|
|
|
\
|
|
|
|
if (_bits > 64) \
|
|
|
|
*out++ = 0; \
|
|
|
|
}
|
|
|
|
|
2022-10-21 13:21:03 -04:00
|
|
|
BCH_INODE_FIELDS_v3()
|
2020-11-05 23:39:33 -05:00
|
|
|
#undef x
|
|
|
|
BUG_ON(out > end);
|
|
|
|
|
|
|
|
out = last_nonzero_field;
|
|
|
|
nr_fields = last_nonzero_fieldnr;
|
|
|
|
|
|
|
|
bytes = out - (u8 *) &packed->inode.v;
|
|
|
|
set_bkey_val_bytes(&packed->inode.k, bytes);
|
|
|
|
memset_u64s_tail(&packed->inode.v, 0, bytes);
|
|
|
|
|
2022-10-21 13:21:03 -04:00
|
|
|
SET_INODEv3_NR_FIELDS(&k->v, nr_fields);
|
2017-03-16 22:18:50 -08:00
|
|
|
|
|
|
|
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
|
|
|
|
struct bch_inode_unpacked unpacked;
|
|
|
|
|
2023-09-12 18:41:22 -04:00
|
|
|
ret = bch2_inode_unpack(bkey_i_to_s_c(&packed->inode.k_i), &unpacked);
|
2017-03-16 22:18:50 -08:00
|
|
|
BUG_ON(ret);
|
|
|
|
BUG_ON(unpacked.bi_inum != inode->bi_inum);
|
|
|
|
BUG_ON(unpacked.bi_hash_seed != inode->bi_hash_seed);
|
2022-10-21 13:21:03 -04:00
|
|
|
BUG_ON(unpacked.bi_sectors != inode->bi_sectors);
|
|
|
|
BUG_ON(unpacked.bi_size != inode->bi_size);
|
|
|
|
BUG_ON(unpacked.bi_version != inode->bi_version);
|
2017-03-16 22:18:50 -08:00
|
|
|
BUG_ON(unpacked.bi_mode != inode->bi_mode);
|
|
|
|
|
2020-11-05 23:39:33 -05:00
|
|
|
#define x(_name, _bits) if (unpacked._name != inode->_name) \
|
|
|
|
panic("unpacked %llu should be %llu", \
|
|
|
|
(u64) unpacked._name, (u64) inode->_name);
|
2022-10-21 13:21:03 -04:00
|
|
|
BCH_INODE_FIELDS_v3()
|
2018-12-13 06:01:30 -05:00
|
|
|
#undef x
|
2017-03-16 22:18:50 -08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-10-21 13:21:03 -04:00
|
|
|
/*
 * bch2_inode_pack - out-of-line entry point for bch2_inode_pack_inlined()
 * @packed:	buffer receiving the packed inode_v3 key
 * @inode:	unpacked inode to encode
 */
void bch2_inode_pack(struct bkey_inode_buf *packed,
		     const struct bch_inode_unpacked *inode)
{
	bch2_inode_pack_inlined(packed, inode);
}
|
|
|
|
|
2020-11-05 23:39:33 -05:00
|
|
|
static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode,
|
|
|
|
struct bch_inode_unpacked *unpacked)
|
2017-03-16 22:18:50 -08:00
|
|
|
{
|
|
|
|
const u8 *in = inode.v->fields;
|
2020-11-05 23:39:33 -05:00
|
|
|
const u8 *end = bkey_val_end(inode);
|
2017-03-16 22:18:50 -08:00
|
|
|
u64 field[2];
|
|
|
|
unsigned fieldnr = 0, field_bits;
|
|
|
|
int ret;
|
|
|
|
|
2024-10-17 22:55:59 -04:00
|
|
|
#define x(_name, _bits) \
|
|
|
|
if (fieldnr++ == INODEv1_NR_FIELDS(inode.v)) { \
|
2017-03-16 22:18:50 -08:00
|
|
|
unsigned offset = offsetof(struct bch_inode_unpacked, _name);\
|
|
|
|
memset((void *) unpacked + offset, 0, \
|
|
|
|
sizeof(*unpacked) - offset); \
|
|
|
|
return 0; \
|
|
|
|
} \
|
|
|
|
\
|
|
|
|
ret = inode_decode_field(in, end, field, &field_bits); \
|
|
|
|
if (ret < 0) \
|
|
|
|
return ret; \
|
|
|
|
\
|
|
|
|
if (field_bits > sizeof(unpacked->_name) * 8) \
|
|
|
|
return -1; \
|
|
|
|
\
|
|
|
|
unpacked->_name = field[1]; \
|
|
|
|
in += ret;
|
|
|
|
|
2022-10-21 13:21:03 -04:00
|
|
|
BCH_INODE_FIELDS_v2()
|
2018-12-13 06:01:30 -05:00
|
|
|
#undef x
|
2017-03-16 22:18:50 -08:00
|
|
|
|
|
|
|
/* XXX: signal if there were more fields than expected? */
|
2020-11-05 23:39:33 -05:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2021-10-29 21:14:23 -04:00
|
|
|
static int bch2_inode_unpack_v2(struct bch_inode_unpacked *unpacked,
|
|
|
|
const u8 *in, const u8 *end,
|
|
|
|
unsigned nr_fields)
|
2020-11-05 23:39:33 -05:00
|
|
|
{
|
|
|
|
unsigned fieldnr = 0;
|
|
|
|
int ret;
|
|
|
|
u64 v[2];
|
|
|
|
|
|
|
|
#define x(_name, _bits) \
|
2021-10-29 21:14:23 -04:00
|
|
|
if (fieldnr < nr_fields) { \
|
2021-07-13 16:03:51 -04:00
|
|
|
ret = bch2_varint_decode_fast(in, end, &v[0]); \
|
2020-11-05 23:39:33 -05:00
|
|
|
if (ret < 0) \
|
|
|
|
return ret; \
|
|
|
|
in += ret; \
|
|
|
|
\
|
|
|
|
if (_bits > 64) { \
|
2021-07-13 16:03:51 -04:00
|
|
|
ret = bch2_varint_decode_fast(in, end, &v[1]); \
|
2020-11-05 23:39:33 -05:00
|
|
|
if (ret < 0) \
|
|
|
|
return ret; \
|
|
|
|
in += ret; \
|
|
|
|
} else { \
|
|
|
|
v[1] = 0; \
|
|
|
|
} \
|
|
|
|
} else { \
|
|
|
|
v[0] = v[1] = 0; \
|
|
|
|
} \
|
|
|
|
\
|
|
|
|
unpacked->_name = v[0]; \
|
|
|
|
if (v[1] || v[0] != unpacked->_name) \
|
|
|
|
return -1; \
|
|
|
|
fieldnr++;
|
|
|
|
|
2022-10-21 13:21:03 -04:00
|
|
|
BCH_INODE_FIELDS_v2()
|
2020-11-05 23:39:33 -05:00
|
|
|
#undef x
|
|
|
|
|
|
|
|
/* XXX: signal if there were more fields than expected? */
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2022-10-21 13:21:03 -04:00
|
|
|
static int bch2_inode_unpack_v3(struct bkey_s_c k,
|
|
|
|
struct bch_inode_unpacked *unpacked)
|
|
|
|
{
|
|
|
|
struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k);
|
|
|
|
const u8 *in = inode.v->fields;
|
|
|
|
const u8 *end = bkey_val_end(inode);
|
|
|
|
unsigned nr_fields = INODEv3_NR_FIELDS(inode.v);
|
|
|
|
unsigned fieldnr = 0;
|
|
|
|
int ret;
|
|
|
|
u64 v[2];
|
|
|
|
|
|
|
|
unpacked->bi_inum = inode.k->p.offset;
|
|
|
|
unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq);
|
|
|
|
unpacked->bi_hash_seed = inode.v->bi_hash_seed;
|
|
|
|
unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags);
|
|
|
|
unpacked->bi_sectors = le64_to_cpu(inode.v->bi_sectors);
|
|
|
|
unpacked->bi_size = le64_to_cpu(inode.v->bi_size);
|
|
|
|
unpacked->bi_version = le64_to_cpu(inode.v->bi_version);
|
|
|
|
unpacked->bi_mode = INODEv3_MODE(inode.v);
|
|
|
|
|
|
|
|
#define x(_name, _bits) \
|
|
|
|
if (fieldnr < nr_fields) { \
|
|
|
|
ret = bch2_varint_decode_fast(in, end, &v[0]); \
|
|
|
|
if (ret < 0) \
|
|
|
|
return ret; \
|
|
|
|
in += ret; \
|
|
|
|
\
|
|
|
|
if (_bits > 64) { \
|
|
|
|
ret = bch2_varint_decode_fast(in, end, &v[1]); \
|
|
|
|
if (ret < 0) \
|
|
|
|
return ret; \
|
|
|
|
in += ret; \
|
|
|
|
} else { \
|
|
|
|
v[1] = 0; \
|
|
|
|
} \
|
|
|
|
} else { \
|
|
|
|
v[0] = v[1] = 0; \
|
|
|
|
} \
|
|
|
|
\
|
|
|
|
unpacked->_name = v[0]; \
|
|
|
|
if (v[1] || v[0] != unpacked->_name) \
|
|
|
|
return -1; \
|
|
|
|
fieldnr++;
|
|
|
|
|
|
|
|
BCH_INODE_FIELDS_v3()
|
|
|
|
#undef x
|
|
|
|
|
|
|
|
/* XXX: signal if there were more fields than expected? */
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k,
|
|
|
|
struct bch_inode_unpacked *unpacked)
|
2020-11-05 23:39:33 -05:00
|
|
|
{
|
2022-12-03 15:44:54 -05:00
|
|
|
memset(unpacked, 0, sizeof(*unpacked));
|
|
|
|
|
2024-10-20 18:00:13 -04:00
|
|
|
unpacked->bi_snapshot = k.k->p.snapshot;
|
|
|
|
|
2021-10-29 21:14:23 -04:00
|
|
|
switch (k.k->type) {
|
|
|
|
case KEY_TYPE_inode: {
|
|
|
|
struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
|
|
|
|
|
|
|
|
unpacked->bi_inum = inode.k->p.offset;
|
2021-11-13 17:57:52 -05:00
|
|
|
unpacked->bi_journal_seq= 0;
|
2021-10-29 21:14:23 -04:00
|
|
|
unpacked->bi_hash_seed = inode.v->bi_hash_seed;
|
|
|
|
unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags);
|
|
|
|
unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode);
|
|
|
|
|
2024-10-17 22:55:59 -04:00
|
|
|
if (INODEv1_NEW_VARINT(inode.v)) {
|
2021-10-29 21:14:23 -04:00
|
|
|
return bch2_inode_unpack_v2(unpacked, inode.v->fields,
|
|
|
|
bkey_val_end(inode),
|
2024-10-17 22:55:59 -04:00
|
|
|
INODEv1_NR_FIELDS(inode.v));
|
2021-10-29 21:14:23 -04:00
|
|
|
} else {
|
|
|
|
return bch2_inode_unpack_v1(inode, unpacked);
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
case KEY_TYPE_inode_v2: {
|
|
|
|
struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k);
|
|
|
|
|
|
|
|
unpacked->bi_inum = inode.k->p.offset;
|
|
|
|
unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq);
|
|
|
|
unpacked->bi_hash_seed = inode.v->bi_hash_seed;
|
|
|
|
unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags);
|
|
|
|
unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode);
|
|
|
|
|
|
|
|
return bch2_inode_unpack_v2(unpacked, inode.v->fields,
|
|
|
|
bkey_val_end(inode),
|
|
|
|
INODEv2_NR_FIELDS(inode.v));
|
|
|
|
}
|
|
|
|
default:
|
|
|
|
BUG();
|
2020-11-05 23:39:33 -05:00
|
|
|
}
|
2017-03-16 22:18:50 -08:00
|
|
|
}
|
|
|
|
|
2022-10-21 13:21:03 -04:00
|
|
|
int bch2_inode_unpack(struct bkey_s_c k,
|
|
|
|
struct bch_inode_unpacked *unpacked)
|
|
|
|
{
|
2024-09-23 22:06:04 -04:00
|
|
|
unpacked->bi_snapshot = k.k->p.snapshot;
|
|
|
|
|
|
|
|
return likely(k.k->type == KEY_TYPE_inode_v3)
|
|
|
|
? bch2_inode_unpack_v3(k, unpacked)
|
|
|
|
: bch2_inode_unpack_slowpath(k, unpacked);
|
2022-10-21 13:21:03 -04:00
|
|
|
}
|
|
|
|
|
2024-09-24 05:33:07 -04:00
|
|
|
int __bch2_inode_peek(struct btree_trans *trans,
|
|
|
|
struct btree_iter *iter,
|
|
|
|
struct bch_inode_unpacked *inode,
|
|
|
|
subvol_inum inum, unsigned flags,
|
|
|
|
bool warn)
|
2019-10-01 16:51:57 -04:00
|
|
|
{
|
2021-03-16 00:28:17 -04:00
|
|
|
u32 snapshot;
|
2024-09-24 05:33:07 -04:00
|
|
|
int ret = __bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot, warn);
|
2021-03-16 00:28:17 -04:00
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
2024-09-24 05:33:07 -04:00
|
|
|
struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, BTREE_ID_inodes,
|
|
|
|
SPOS(0, inum.inum, snapshot),
|
|
|
|
flags|BTREE_ITER_cached);
|
2019-10-01 16:51:57 -04:00
|
|
|
ret = bkey_err(k);
|
|
|
|
if (ret)
|
2023-04-29 19:33:09 -04:00
|
|
|
return ret;
|
2019-10-01 16:51:57 -04:00
|
|
|
|
2023-05-27 19:59:59 -04:00
|
|
|
ret = bkey_is_inode(k.k) ? 0 : -BCH_ERR_ENOENT_inode;
|
2019-10-01 16:51:57 -04:00
|
|
|
if (ret)
|
|
|
|
goto err;
|
|
|
|
|
2021-10-29 21:14:23 -04:00
|
|
|
ret = bch2_inode_unpack(k, inode);
|
2019-10-01 16:51:57 -04:00
|
|
|
if (ret)
|
|
|
|
goto err;
|
|
|
|
|
2021-08-30 15:18:31 -04:00
|
|
|
return 0;
|
2019-10-01 16:51:57 -04:00
|
|
|
err:
|
2024-09-24 05:33:07 -04:00
|
|
|
if (warn)
|
|
|
|
bch_err_msg(trans->c, ret, "looking up inum %llu:%llu:", inum.subvol, inum.inum);
|
2021-08-30 15:18:31 -04:00
|
|
|
bch2_trans_iter_exit(trans, iter);
|
2023-09-10 22:05:50 -04:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2023-11-02 15:28:15 -04:00
|
|
|
int bch2_inode_write_flags(struct btree_trans *trans,
|
2019-10-01 16:51:57 -04:00
|
|
|
struct btree_iter *iter,
|
2023-11-02 15:28:15 -04:00
|
|
|
struct bch_inode_unpacked *inode,
|
2024-04-07 18:05:34 -04:00
|
|
|
enum btree_iter_update_trigger_flags flags)
|
2019-10-01 16:51:57 -04:00
|
|
|
{
|
|
|
|
struct bkey_inode_buf *inode_p;
|
|
|
|
|
|
|
|
inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p));
|
|
|
|
if (IS_ERR(inode_p))
|
|
|
|
return PTR_ERR(inode_p);
|
|
|
|
|
2022-10-21 13:21:03 -04:00
|
|
|
bch2_inode_pack_inlined(inode_p, inode);
|
bcachefs: Start using bpos.snapshot field
This patch starts treating the bpos.snapshot field like part of the key
in the btree code:
* bpos_successor() and bpos_predecessor() now include the snapshot field
* Keys in btrees that will be using snapshots (extents, inodes, dirents
and xattrs) now always have their snapshot field set to U32_MAX
The btree iterator code gets a new flag, BTREE_ITER_ALL_SNAPSHOTS, that
determines whether we're iterating over keys in all snapshots or not -
internally, this controlls whether bkey_(successor|predecessor)
increment/decrement the snapshot field, or only the higher bits of the
key.
We add a new member to struct btree_iter, iter->snapshot: when
BTREE_ITER_ALL_SNAPSHOTS is not set, iter->pos.snapshot should always
equal iter->snapshot, which will be 0 for btrees that don't use
snapshots, and alsways U32_MAX for btrees that will use snapshots
(until we enable snapshot creation).
This patch also introduces a new metadata version number, and compat
code for reading from/writing to older versions - this isn't a forced
upgrade (yet).
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2021-03-24 18:02:16 -04:00
|
|
|
inode_p->inode.k.p.snapshot = iter->snapshot;
|
2023-11-02 15:28:15 -04:00
|
|
|
return bch2_trans_update(trans, iter, &inode_p->inode.k_i, flags);
|
2019-10-01 16:51:57 -04:00
|
|
|
}
|
|
|
|
|
2024-09-30 00:00:33 -04:00
|
|
|
int __bch2_fsck_write_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode)
|
2024-02-01 07:35:28 -05:00
|
|
|
{
|
|
|
|
struct bkey_inode_buf *inode_p =
|
|
|
|
bch2_trans_kmalloc(trans, sizeof(*inode_p));
|
|
|
|
|
|
|
|
if (IS_ERR(inode_p))
|
|
|
|
return PTR_ERR(inode_p);
|
|
|
|
|
|
|
|
bch2_inode_pack(inode_p, inode);
|
2024-09-30 00:00:33 -04:00
|
|
|
inode_p->inode.k.p.snapshot = inode->bi_snapshot;
|
2024-02-01 07:35:28 -05:00
|
|
|
|
|
|
|
return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes,
|
|
|
|
&inode_p->inode.k_i,
|
2024-04-07 18:05:34 -04:00
|
|
|
BTREE_UPDATE_internal_snapshot_node);
|
2024-02-01 07:35:28 -05:00
|
|
|
}
|
|
|
|
|
2024-09-30 00:00:33 -04:00
|
|
|
int bch2_fsck_write_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode)
|
2024-02-01 07:35:28 -05:00
|
|
|
{
|
|
|
|
int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
|
2024-09-30 00:00:33 -04:00
|
|
|
__bch2_fsck_write_inode(trans, inode));
|
2024-02-01 07:35:28 -05:00
|
|
|
bch_err_fn(trans->c, ret);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2022-10-21 13:21:03 -04:00
|
|
|
struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k)
|
|
|
|
{
|
|
|
|
struct bch_inode_unpacked u;
|
|
|
|
struct bkey_inode_buf *inode_p;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
if (!bkey_is_inode(&k->k))
|
|
|
|
return ERR_PTR(-ENOENT);
|
|
|
|
|
|
|
|
inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p));
|
|
|
|
if (IS_ERR(inode_p))
|
|
|
|
return ERR_CAST(inode_p);
|
|
|
|
|
|
|
|
ret = bch2_inode_unpack(bkey_i_to_s_c(k), &u);
|
|
|
|
if (ret)
|
|
|
|
return ERR_PTR(ret);
|
|
|
|
|
|
|
|
bch2_inode_pack(inode_p, &u);
|
|
|
|
return &inode_p->inode.k_i;
|
|
|
|
}
|
|
|
|
|
2024-08-12 21:31:25 -04:00
|
|
|
static int __bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k,
|
|
|
|
enum bch_validate_flags flags)
|
2017-03-16 22:18:50 -08:00
|
|
|
{
|
2021-03-16 00:42:25 -04:00
|
|
|
struct bch_inode_unpacked unpacked;
|
2023-10-24 20:44:36 -04:00
|
|
|
int ret = 0;
|
2017-03-16 22:18:50 -08:00
|
|
|
|
2024-08-12 21:31:25 -04:00
|
|
|
bkey_fsck_err_on(k.k->p.inode,
|
|
|
|
c, inode_pos_inode_nonzero,
|
2023-10-24 20:44:36 -04:00
|
|
|
"nonzero k.p.inode");
|
2017-03-16 22:18:50 -08:00
|
|
|
|
2024-08-12 21:31:25 -04:00
|
|
|
bkey_fsck_err_on(k.k->p.offset < BLOCKDEV_INODE_MAX,
|
|
|
|
c, inode_pos_blockdev_range,
|
2023-10-24 20:44:36 -04:00
|
|
|
"fs inode in blockdev range");
|
2017-03-16 22:18:50 -08:00
|
|
|
|
2024-08-12 21:31:25 -04:00
|
|
|
bkey_fsck_err_on(bch2_inode_unpack(k, &unpacked),
|
|
|
|
c, inode_unpack_error,
|
2023-10-24 20:44:36 -04:00
|
|
|
"invalid variable length fields");
|
2021-10-29 21:14:23 -04:00
|
|
|
|
2024-08-12 21:31:25 -04:00
|
|
|
bkey_fsck_err_on(unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1,
|
|
|
|
c, inode_checksum_type_invalid,
|
2023-10-24 20:44:36 -04:00
|
|
|
"invalid data checksum type (%u >= %u",
|
|
|
|
unpacked.bi_data_checksum, BCH_CSUM_OPT_NR + 1);
|
2021-10-29 21:14:23 -04:00
|
|
|
|
2023-10-24 20:44:36 -04:00
|
|
|
bkey_fsck_err_on(unpacked.bi_compression &&
|
2024-08-12 21:31:25 -04:00
|
|
|
!bch2_compression_opt_valid(unpacked.bi_compression - 1),
|
|
|
|
c, inode_compression_type_invalid,
|
2023-10-24 20:44:36 -04:00
|
|
|
"invalid compression opt %u", unpacked.bi_compression - 1);
|
2021-10-29 21:14:23 -04:00
|
|
|
|
2023-11-02 11:42:48 -04:00
|
|
|
bkey_fsck_err_on((unpacked.bi_flags & BCH_INODE_unlinked) &&
|
2024-08-12 21:31:25 -04:00
|
|
|
unpacked.bi_nlink != 0,
|
|
|
|
c, inode_unlinked_but_nlink_nonzero,
|
2023-10-24 20:44:36 -04:00
|
|
|
"flagged as unlinked but bi_nlink != 0");
|
2021-10-29 21:14:23 -04:00
|
|
|
|
2024-08-12 21:31:25 -04:00
|
|
|
bkey_fsck_err_on(unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode),
|
|
|
|
c, inode_subvol_root_but_not_dir,
|
2023-10-24 20:44:36 -04:00
|
|
|
"subvolume root but not a directory");
|
|
|
|
fsck_err:
|
|
|
|
return ret;
|
2021-10-29 21:14:23 -04:00
|
|
|
}
|
|
|
|
|
2024-08-12 21:31:25 -04:00
|
|
|
int bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k,
|
|
|
|
enum bch_validate_flags flags)
|
2021-10-29 21:14:23 -04:00
|
|
|
{
|
2022-04-03 17:50:01 -04:00
|
|
|
struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
|
2023-10-24 20:44:36 -04:00
|
|
|
int ret = 0;
|
2021-10-29 21:14:23 -04:00
|
|
|
|
2024-10-17 22:55:59 -04:00
|
|
|
bkey_fsck_err_on(INODEv1_STR_HASH(inode.v) >= BCH_STR_HASH_NR,
|
2024-08-12 21:31:25 -04:00
|
|
|
c, inode_str_hash_invalid,
|
2023-10-24 20:44:36 -04:00
|
|
|
"invalid str hash type (%llu >= %u)",
|
2024-10-17 22:55:59 -04:00
|
|
|
INODEv1_STR_HASH(inode.v), BCH_STR_HASH_NR);
|
2017-03-16 22:18:50 -08:00
|
|
|
|
2024-08-12 21:31:25 -04:00
|
|
|
ret = __bch2_inode_validate(c, k, flags);
|
2023-10-24 20:44:36 -04:00
|
|
|
fsck_err:
|
|
|
|
return ret;
|
2022-04-03 17:50:01 -04:00
|
|
|
}
|
2017-03-16 22:18:50 -08:00
|
|
|
|
2024-08-12 21:31:25 -04:00
|
|
|
int bch2_inode_v2_validate(struct bch_fs *c, struct bkey_s_c k,
|
|
|
|
enum bch_validate_flags flags)
|
2022-04-03 17:50:01 -04:00
|
|
|
{
|
|
|
|
struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k);
|
2023-10-24 20:44:36 -04:00
|
|
|
int ret = 0;
|
2017-03-16 22:18:50 -08:00
|
|
|
|
2024-08-12 21:31:25 -04:00
|
|
|
bkey_fsck_err_on(INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR,
|
|
|
|
c, inode_str_hash_invalid,
|
2023-10-24 20:44:36 -04:00
|
|
|
"invalid str hash type (%llu >= %u)",
|
|
|
|
INODEv2_STR_HASH(inode.v), BCH_STR_HASH_NR);
|
2021-03-16 00:42:25 -04:00
|
|
|
|
2024-08-12 21:31:25 -04:00
|
|
|
ret = __bch2_inode_validate(c, k, flags);
|
2023-10-24 20:44:36 -04:00
|
|
|
fsck_err:
|
|
|
|
return ret;
|
2017-03-16 22:18:50 -08:00
|
|
|
}
|
|
|
|
|
2024-08-12 21:31:25 -04:00
|
|
|
int bch2_inode_v3_validate(struct bch_fs *c, struct bkey_s_c k,
|
|
|
|
enum bch_validate_flags flags)
|
2022-10-21 13:21:03 -04:00
|
|
|
{
|
|
|
|
struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k);
|
2023-10-24 20:44:36 -04:00
|
|
|
int ret = 0;
|
2022-10-21 13:21:03 -04:00
|
|
|
|
2023-10-24 20:44:36 -04:00
|
|
|
bkey_fsck_err_on(INODEv3_FIELDS_START(inode.v) < INODEv3_FIELDS_START_INITIAL ||
|
2024-08-12 21:31:25 -04:00
|
|
|
INODEv3_FIELDS_START(inode.v) > bkey_val_u64s(inode.k),
|
|
|
|
c, inode_v3_fields_start_bad,
|
2023-10-24 20:44:36 -04:00
|
|
|
"invalid fields_start (got %llu, min %u max %zu)",
|
|
|
|
INODEv3_FIELDS_START(inode.v),
|
|
|
|
INODEv3_FIELDS_START_INITIAL,
|
|
|
|
bkey_val_u64s(inode.k));
|
2022-10-21 13:21:03 -04:00
|
|
|
|
2024-08-12 21:31:25 -04:00
|
|
|
bkey_fsck_err_on(INODEv3_STR_HASH(inode.v) >= BCH_STR_HASH_NR,
|
|
|
|
c, inode_str_hash_invalid,
|
2023-10-24 20:44:36 -04:00
|
|
|
"invalid str hash type (%llu >= %u)",
|
|
|
|
INODEv3_STR_HASH(inode.v), BCH_STR_HASH_NR);
|
2022-10-21 13:21:03 -04:00
|
|
|
|
2024-08-12 21:31:25 -04:00
|
|
|
ret = __bch2_inode_validate(c, k, flags);
|
2023-10-24 20:44:36 -04:00
|
|
|
fsck_err:
|
|
|
|
return ret;
|
2022-10-21 13:21:03 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
static void __bch2_inode_unpacked_to_text(struct printbuf *out,
|
|
|
|
struct bch_inode_unpacked *inode)
|
2021-07-30 17:59:37 -04:00
|
|
|
{
|
2024-04-18 08:50:55 +08:00
|
|
|
prt_printf(out, "\n");
|
2024-01-21 12:19:01 -05:00
|
|
|
printbuf_indent_add(out, 2);
|
2024-04-10 16:08:24 -04:00
|
|
|
prt_printf(out, "mode=%o\n", inode->bi_mode);
|
2023-11-02 11:42:48 -04:00
|
|
|
|
|
|
|
prt_str(out, "flags=");
|
|
|
|
prt_bitflags(out, bch2_inode_flag_strs, inode->bi_flags & ((1U << 20) - 1));
|
2024-04-18 08:50:55 +08:00
|
|
|
prt_printf(out, "(%x)\n", inode->bi_flags);
|
2023-11-02 11:42:48 -04:00
|
|
|
|
2024-04-10 16:08:24 -04:00
|
|
|
prt_printf(out, "journal_seq=%llu\n", inode->bi_journal_seq);
|
2024-09-28 14:44:06 -04:00
|
|
|
prt_printf(out, "hash_seed=%llx\n", inode->bi_hash_seed);
|
|
|
|
prt_printf(out, "hash_type=");
|
|
|
|
bch2_prt_str_hash_type(out, INODE_STR_HASH(inode));
|
|
|
|
prt_newline(out);
|
2024-04-10 16:08:24 -04:00
|
|
|
prt_printf(out, "bi_size=%llu\n", inode->bi_size);
|
|
|
|
prt_printf(out, "bi_sectors=%llu\n", inode->bi_sectors);
|
|
|
|
prt_printf(out, "bi_version=%llu\n", inode->bi_version);
|
2021-07-30 17:59:37 -04:00
|
|
|
|
|
|
|
#define x(_name, _bits) \
|
2024-04-10 16:08:24 -04:00
|
|
|
prt_printf(out, #_name "=%llu\n", (u64) inode->_name);
|
2022-10-21 13:21:03 -04:00
|
|
|
BCH_INODE_FIELDS_v3()
|
2021-07-30 17:59:37 -04:00
|
|
|
#undef x
|
2024-05-26 22:20:34 -04:00
|
|
|
|
|
|
|
bch2_printbuf_strip_trailing_newline(out);
|
2024-01-21 12:19:01 -05:00
|
|
|
printbuf_indent_sub(out, 2);
|
2021-07-30 17:59:37 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
void bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode)
|
|
|
|
{
|
2024-09-23 22:06:04 -04:00
|
|
|
prt_printf(out, "inum: %llu:%u ", inode->bi_inum, inode->bi_snapshot);
|
2021-07-30 17:59:37 -04:00
|
|
|
__bch2_inode_unpacked_to_text(out, inode);
|
|
|
|
}
|
|
|
|
|
2022-10-21 13:21:03 -04:00
|
|
|
void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
|
2017-03-16 22:18:50 -08:00
|
|
|
{
|
2021-10-29 21:14:23 -04:00
|
|
|
struct bch_inode_unpacked inode;
|
2017-03-16 22:18:50 -08:00
|
|
|
|
2021-10-29 21:14:23 -04:00
|
|
|
if (bch2_inode_unpack(k, &inode)) {
|
2023-02-03 21:01:40 -05:00
|
|
|
prt_printf(out, "(unpack error)");
|
2018-11-01 15:10:01 -04:00
|
|
|
return;
|
|
|
|
}
|
2017-03-16 22:18:50 -08:00
|
|
|
|
2021-10-29 21:14:23 -04:00
|
|
|
__bch2_inode_unpacked_to_text(out, &inode);
|
2018-11-01 15:10:01 -04:00
|
|
|
}
|
|
|
|
|
2023-07-17 00:56:29 -04:00
|
|
|
static inline u64 bkey_inode_flags(struct bkey_s_c k)
|
|
|
|
{
|
|
|
|
switch (k.k->type) {
|
|
|
|
case KEY_TYPE_inode:
|
|
|
|
return le32_to_cpu(bkey_s_c_to_inode(k).v->bi_flags);
|
|
|
|
case KEY_TYPE_inode_v2:
|
|
|
|
return le64_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_flags);
|
|
|
|
case KEY_TYPE_inode_v3:
|
|
|
|
return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_flags);
|
|
|
|
default:
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
bcachefs: bcachefs_metadata_version_inode_has_child_snapshots
There's an inherent race in taking a snapshot while an unlinked file is
open, and then reattaching it in the child snapshot.
In the interior snapshot node the file will appear unlinked, as though
it should be deleted - it's not referenced by anything in that snapshot
- but we can't delete it, because the file data is referenced by the
child snapshot.
This was being handled incorrectly with
propagate_key_to_snapshot_leaves() - but that doesn't resolve the
fundamental inconsistency of "this file looks like it should be deleted
according to normal rules, but - ".
To fix this, we need to fix the rule for when an inode is deleted. The
previous rule, ignoring snapshots (there was no well-defined rule
for inodes with snapshots) was:
Unlinked, non open files are deleted, either at recovery time or
during online fsck
The new rule is:
Unlinked, non open files, that do not exist in child snapshots, are
deleted.
To make this work transactionally, we add a new inode flag,
BCH_INODE_has_child_snapshot; it overrides BCH_INODE_unlinked when
considering whether to delete an inode, or put it on the deleted list.
For transactional consistency, clearing it is handled by the inode trigger:
when deleting an inode we check if there are parent inodes which can now
have the BCH_INODE_has_child_snapshot flag cleared.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2024-09-29 22:11:37 -04:00
|
|
|
/*
 * bkey_inode_flags_set - store bi_flags into an inode key of any version
 * @k:	key to modify
 * @f:	new flags value, in host byte order
 *
 * KEY_TYPE_inode has a 32 bit flags field, so @f is converted with
 * cpu_to_le32() there. Any non-inode key type hits BUG().
 */
static inline void bkey_inode_flags_set(struct bkey_s k, u64 f)
{
	switch (k.k->type) {
	case KEY_TYPE_inode:
		bkey_s_to_inode(k).v->bi_flags = cpu_to_le32(f);
		break;
	case KEY_TYPE_inode_v2:
		bkey_s_to_inode_v2(k).v->bi_flags = cpu_to_le64(f);
		break;
	case KEY_TYPE_inode_v3:
		bkey_s_to_inode_v3(k).v->bi_flags = cpu_to_le64(f);
		break;
	default:
		BUG();
	}
}
|
|
|
|
|
|
|
|
static inline bool bkey_is_unlinked_inode(struct bkey_s_c k)
|
|
|
|
{
|
|
|
|
unsigned f = bkey_inode_flags(k) & BCH_INODE_unlinked;
|
|
|
|
|
|
|
|
return (f & BCH_INODE_unlinked) && !(f & BCH_INODE_has_child_snapshot);
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct bkey_s_c
|
|
|
|
bch2_bkey_get_iter_snapshot_parent(struct btree_trans *trans, struct btree_iter *iter,
|
|
|
|
enum btree_id btree, struct bpos pos,
|
|
|
|
unsigned flags)
|
|
|
|
{
|
|
|
|
struct bch_fs *c = trans->c;
|
|
|
|
struct bkey_s_c k;
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
for_each_btree_key_upto_norestart(trans, *iter, btree,
|
|
|
|
bpos_successor(pos),
|
|
|
|
SPOS(pos.inode, pos.offset, U32_MAX),
|
|
|
|
flags|BTREE_ITER_all_snapshots, k, ret)
|
|
|
|
if (bch2_snapshot_is_ancestor(c, pos.snapshot, k.k->p.snapshot))
|
|
|
|
return k;
|
|
|
|
|
|
|
|
bch2_trans_iter_exit(trans, iter);
|
|
|
|
return ret ? bkey_s_c_err(ret) : bkey_s_c_null;
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct bkey_s_c
|
|
|
|
bch2_inode_get_iter_snapshot_parent(struct btree_trans *trans, struct btree_iter *iter,
|
|
|
|
struct bpos pos, unsigned flags)
|
|
|
|
{
|
|
|
|
struct bkey_s_c k;
|
|
|
|
again:
|
|
|
|
k = bch2_bkey_get_iter_snapshot_parent(trans, iter, BTREE_ID_inodes, pos, flags);
|
|
|
|
if (!k.k ||
|
|
|
|
bkey_err(k) ||
|
|
|
|
bkey_is_inode(k.k))
|
|
|
|
return k;
|
|
|
|
|
|
|
|
bch2_trans_iter_exit(trans, iter);
|
|
|
|
pos = k.k->p;
|
|
|
|
goto again;
|
|
|
|
}
|
|
|
|
|
|
|
|
int __bch2_inode_has_child_snapshots(struct btree_trans *trans, struct bpos pos)
|
2023-07-17 00:56:29 -04:00
|
|
|
{
|
bcachefs: bcachefs_metadata_version_inode_has_child_snapshots
There's an inherent race in taking a snapshot while an unlinked file is
open, and then reattaching it in the child snapshot.
In the interior snapshot node the file will appear unlinked, as though
it should be deleted - it's not referenced by anything in that snapshot
- but we can't delete it, because the file data is referenced by the
child snapshot.
This was being handled incorrectly with
propagate_key_to_snapshot_leaves() - but that doesn't resolve the
fundamental inconsistency of "this file looks like it should be deleted
according to normal rules, but - ".
To fix this, we need to fix the rule for when an inode is deleted. The
previous rule, ignoring snapshots (there was no well-defined rule
for with snapshots) was:
Unlinked, non open files are deleted, either at recovery time or
during online fsck
The new rule is:
Unlinked, non open files, that do not exist in child snapshots, are
deleted.
To make this work transactionally, we add a new inode flag,
BCH_INODE_has_child_snapshot; it overrides BCH_INODE_unlinked when
considering whether to delete an inode, or put it on the deleted list.
For transactional consistency, clearing it handled by the inode trigger:
when deleting an inode we check if there are parent inodes which can now
have the BCH_INODE_has_child_snapshot flag cleared.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2024-09-29 22:11:37 -04:00
|
|
|
struct bch_fs *c = trans->c;
|
|
|
|
struct btree_iter iter;
|
|
|
|
struct bkey_s_c k;
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
for_each_btree_key_upto_norestart(trans, iter,
|
|
|
|
BTREE_ID_inodes, POS(0, pos.offset), bpos_predecessor(pos),
|
|
|
|
BTREE_ITER_all_snapshots|
|
|
|
|
BTREE_ITER_with_updates, k, ret)
|
|
|
|
if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot) &&
|
|
|
|
bkey_is_inode(k.k)) {
|
|
|
|
ret = 1;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
bch2_trans_iter_exit(trans, &iter);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int update_inode_has_children(struct btree_trans *trans,
|
|
|
|
struct bkey_s k,
|
|
|
|
bool have_child)
|
|
|
|
{
|
|
|
|
if (!have_child) {
|
|
|
|
int ret = bch2_inode_has_child_snapshots(trans, k.k->p);
|
|
|
|
if (ret)
|
|
|
|
return ret < 0 ? ret : 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
u64 f = bkey_inode_flags(k.s_c);
|
|
|
|
if (have_child != !!(f & BCH_INODE_has_child_snapshot))
|
|
|
|
bkey_inode_flags_set(k, f ^ BCH_INODE_has_child_snapshot);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int update_parent_inode_has_children(struct btree_trans *trans, struct bpos pos,
|
|
|
|
bool have_child)
|
|
|
|
{
|
|
|
|
struct btree_iter iter;
|
|
|
|
struct bkey_s_c k = bch2_inode_get_iter_snapshot_parent(trans,
|
|
|
|
&iter, pos, BTREE_ITER_with_updates);
|
|
|
|
int ret = bkey_err(k);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
if (!k.k)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (!have_child) {
|
|
|
|
ret = bch2_inode_has_child_snapshots(trans, k.k->p);
|
|
|
|
if (ret) {
|
|
|
|
ret = ret < 0 ? ret : 0;
|
|
|
|
goto err;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
u64 f = bkey_inode_flags(k);
|
|
|
|
if (have_child != !!(f & BCH_INODE_has_child_snapshot)) {
|
|
|
|
struct bkey_i *update = bch2_bkey_make_mut(trans, &iter, &k,
|
|
|
|
BTREE_UPDATE_internal_snapshot_node);
|
|
|
|
ret = PTR_ERR_OR_ZERO(update);
|
|
|
|
if (ret)
|
|
|
|
goto err;
|
|
|
|
|
|
|
|
bkey_inode_flags_set(bkey_i_to_s(update), f ^ BCH_INODE_has_child_snapshot);
|
|
|
|
}
|
|
|
|
err:
|
|
|
|
bch2_trans_iter_exit(trans, &iter);
|
|
|
|
return ret;
|
2023-07-17 00:56:29 -04:00
|
|
|
}
|
|
|
|
|
2023-12-28 00:05:54 -05:00
|
|
|
int bch2_trigger_inode(struct btree_trans *trans,
|
|
|
|
enum btree_id btree_id, unsigned level,
|
|
|
|
struct bkey_s_c old,
|
|
|
|
struct bkey_s new,
|
bcachefs: Fix type of flags parameter for some ->trigger() implementations
When building with clang's -Wincompatible-function-pointer-types-strict
(a warning designed to catch potential kCFI failures at build time),
there are several warnings along the lines of:
fs/bcachefs/bkey_methods.c:118:2: error: incompatible function pointer types initializing 'int (*)(struct btree_trans *, enum btree_id, unsigned int, struct bkey_s_c, struct bkey_s, enum btree_iter_update_trigger_flags)' with an expression of type 'int (struct btree_trans *, enum btree_id, unsigned int, struct bkey_s_c, struct bkey_s, unsigned int)' [-Werror,-Wincompatible-function-pointer-types-strict]
118 | BCH_BKEY_TYPES()
| ^~~~~~~~~~~~~~~~
fs/bcachefs/bcachefs_format.h:394:2: note: expanded from macro 'BCH_BKEY_TYPES'
394 | x(inode, 8) \
| ^~~~~~~~~~~~~~~~~~~~~~~~~~
fs/bcachefs/bkey_methods.c:117:41: note: expanded from macro 'x'
117 | #define x(name, nr) [KEY_TYPE_##name] = bch2_bkey_ops_##name,
| ^~~~~~~~~~~~~~~~~~~~
<scratch space>:277:1: note: expanded from here
277 | bch2_bkey_ops_inode
| ^~~~~~~~~~~~~~~~~~~
fs/bcachefs/inode.h:26:13: note: expanded from macro 'bch2_bkey_ops_inode'
26 | .trigger = bch2_trigger_inode, \
| ^~~~~~~~~~~~~~~~~~
There are several functions that did not have their flags parameter
converted to 'enum btree_iter_update_trigger_flags' in the recent
unification, which will cause kCFI failures at runtime because the
types, while ABI compatible (hence no warning from the non-strict
version of this warning), do not match exactly.
Fix up these functions (as well as a few other obvious functions that
should have it, even if there are no warnings currently) to resolve the
warnings and potential kCFI runtime failures.
Fixes: 31e4ef3280c8 ("bcachefs: iter/update/trigger/str_hash flag cleanup")
Signed-off-by: Nathan Chancellor <nathan@kernel.org>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2024-04-23 11:58:09 -07:00
|
|
|
enum btree_iter_update_trigger_flags flags)
|
2023-07-17 00:41:48 -04:00
|
|
|
{
|
bcachefs: bcachefs_metadata_version_inode_has_child_snapshots
There's an inherent race in taking a snapshot while an unlinked file is
open, and then reattaching it in the child snapshot.
In the interior snapshot node the file will appear unlinked, as though
it should be deleted - it's not referenced by anything in that snapshot
- but we can't delete it, because the file data is referenced by the
child snapshot.
This was being handled incorrectly with
propagate_key_to_snapshot_leaves() - but that doesn't resolve the
fundamental inconsistency of "this file looks like it should be deleted
according to normal rules, but - ".
To fix this, we need to fix the rule for when an inode is deleted. The
previous rule, ignoring snapshots (there was no well-defined rule
for with snapshots) was:
Unlinked, non open files are deleted, either at recovery time or
during online fsck
The new rule is:
Unlinked, non open files, that do not exist in child snapshots, are
deleted.
To make this work transactionally, we add a new inode flag,
BCH_INODE_has_child_snapshot; it overrides BCH_INODE_unlinked when
considering whether to delete an inode, or put it on the deleted list.
For transactional consistency, clearing it handled by the inode trigger:
when deleting an inode we check if there are parent inodes which can now
have the BCH_INODE_has_child_snapshot flag cleared.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2024-09-29 22:11:37 -04:00
|
|
|
struct bch_fs *c = trans->c;
|
|
|
|
|
2024-04-07 18:05:34 -04:00
|
|
|
if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) {
|
2023-12-28 00:05:54 -05:00
|
|
|
BUG_ON(!trans->journal_res.seq);
|
|
|
|
bkey_s_to_inode_v3(new).v->bi_journal_seq = cpu_to_le64(trans->journal_res.seq);
|
2023-07-17 00:41:48 -04:00
|
|
|
}
|
|
|
|
|
2024-02-11 22:48:05 -05:00
|
|
|
s64 nr = bkey_is_inode(new.k) - bkey_is_inode(old.k);
|
|
|
|
if ((flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) && nr) {
|
|
|
|
struct disk_accounting_pos acc = { .type = BCH_DISK_ACCOUNTING_nr_inodes };
|
|
|
|
int ret = bch2_disk_accounting_mod(trans, &acc, &nr, 1, flags & BTREE_TRIGGER_gc);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
}
|
2023-07-17 00:41:48 -04:00
|
|
|
|
bcachefs: bcachefs_metadata_version_inode_has_child_snapshots
There's an inherent race in taking a snapshot while an unlinked file is
open, and then reattaching it in the child snapshot.
In the interior snapshot node the file will appear unlinked, as though
it should be deleted - it's not referenced by anything in that snapshot
- but we can't delete it, because the file data is referenced by the
child snapshot.
This was being handled incorrectly with
propagate_key_to_snapshot_leaves() - but that doesn't resolve the
fundamental inconsistency of "this file looks like it should be deleted
according to normal rules, but - ".
To fix this, we need to fix the rule for when an inode is deleted. The
previous rule, ignoring snapshots (there was no well-defined rule
for with snapshots) was:
Unlinked, non open files are deleted, either at recovery time or
during online fsck
The new rule is:
Unlinked, non open files, that do not exist in child snapshots, are
deleted.
To make this work transactionally, we add a new inode flag,
BCH_INODE_has_child_snapshot; it overrides BCH_INODE_unlinked when
considering whether to delete an inode, or put it on the deleted list.
For transactional consistency, clearing it handled by the inode trigger:
when deleting an inode we check if there are parent inodes which can now
have the BCH_INODE_has_child_snapshot flag cleared.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2024-09-29 22:11:37 -04:00
|
|
|
if (flags & BTREE_TRIGGER_transactional) {
|
|
|
|
int unlinked_delta = (int) bkey_is_unlinked_inode(new.s_c) -
|
|
|
|
(int) bkey_is_unlinked_inode(old);
|
|
|
|
if (unlinked_delta) {
|
|
|
|
int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes,
|
|
|
|
new.k->p, unlinked_delta > 0);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we're creating or deleting an inode at this snapshot ID,
|
|
|
|
* and there might be an inode in a parent snapshot ID, we might
|
|
|
|
* need to set or clear the has_child_snapshot flag on the
|
|
|
|
* parent.
|
|
|
|
*/
|
|
|
|
int deleted_delta = (int) bkey_is_inode(new.k) -
|
|
|
|
(int) bkey_is_inode(old.k);
|
|
|
|
if (deleted_delta &&
|
|
|
|
bch2_snapshot_parent(c, new.k->p.snapshot)) {
|
|
|
|
int ret = update_parent_inode_has_children(trans, new.k->p,
|
|
|
|
deleted_delta > 0);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* When an inode is first updated in a new snapshot, we may need
|
|
|
|
* to clear has_child_snapshot
|
|
|
|
*/
|
|
|
|
if (deleted_delta > 0) {
|
|
|
|
int ret = update_inode_has_children(trans, new, false);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
}
|
2023-07-17 00:41:48 -04:00
|
|
|
}
|
2023-12-28 00:05:54 -05:00
|
|
|
|
2023-07-17 00:41:48 -04:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2024-08-12 21:31:25 -04:00
|
|
|
int bch2_inode_generation_validate(struct bch_fs *c, struct bkey_s_c k,
|
|
|
|
enum bch_validate_flags flags)
|
2018-11-01 15:10:01 -04:00
|
|
|
{
|
2023-10-24 20:44:36 -04:00
|
|
|
int ret = 0;
|
2018-11-01 15:10:01 -04:00
|
|
|
|
2024-08-12 21:31:25 -04:00
|
|
|
bkey_fsck_err_on(k.k->p.inode,
|
|
|
|
c, inode_pos_inode_nonzero,
|
2023-10-24 20:44:36 -04:00
|
|
|
"nonzero k.p.inode");
|
|
|
|
fsck_err:
|
|
|
|
return ret;
|
2018-11-01 15:10:01 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c,
|
|
|
|
struct bkey_s_c k)
|
|
|
|
{
|
2019-06-24 17:55:15 -04:00
|
|
|
struct bkey_s_c_inode_generation gen = bkey_s_c_to_inode_generation(k);
|
|
|
|
|
2023-02-03 21:01:40 -05:00
|
|
|
prt_printf(out, "generation: %u", le32_to_cpu(gen.v->bi_generation));
|
2017-03-16 22:18:50 -08:00
|
|
|
}
|
|
|
|
|
2019-10-02 18:35:36 -04:00
|
|
|
void bch2_inode_init_early(struct bch_fs *c,
|
|
|
|
struct bch_inode_unpacked *inode_u)
|
2017-03-16 22:18:50 -08:00
|
|
|
{
|
2019-10-04 15:58:43 -04:00
|
|
|
enum bch_str_hash_type str_hash =
|
|
|
|
bch2_str_hash_opt_to_type(c, c->opts.str_hash);
|
2017-03-16 22:18:50 -08:00
|
|
|
|
|
|
|
memset(inode_u, 0, sizeof(*inode_u));
|
|
|
|
|
2024-10-17 22:55:59 -04:00
|
|
|
SET_INODE_STR_HASH(inode_u, str_hash);
|
|
|
|
get_random_bytes(&inode_u->bi_hash_seed, sizeof(inode_u->bi_hash_seed));
|
2019-10-02 18:35:36 -04:00
|
|
|
}
|
2017-03-16 22:18:50 -08:00
|
|
|
|
2019-10-02 18:35:36 -04:00
|
|
|
void bch2_inode_init_late(struct bch_inode_unpacked *inode_u, u64 now,
|
|
|
|
uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
|
|
|
|
struct bch_inode_unpacked *parent)
|
|
|
|
{
|
2017-03-16 22:18:50 -08:00
|
|
|
inode_u->bi_mode = mode;
|
|
|
|
inode_u->bi_uid = uid;
|
|
|
|
inode_u->bi_gid = gid;
|
|
|
|
inode_u->bi_dev = rdev;
|
|
|
|
inode_u->bi_atime = now;
|
|
|
|
inode_u->bi_mtime = now;
|
|
|
|
inode_u->bi_ctime = now;
|
|
|
|
inode_u->bi_otime = now;
|
|
|
|
|
2019-10-02 18:35:36 -04:00
|
|
|
if (parent && parent->bi_mode & S_ISGID) {
|
|
|
|
inode_u->bi_gid = parent->bi_gid;
|
|
|
|
if (S_ISDIR(mode))
|
|
|
|
inode_u->bi_mode |= S_ISGID;
|
|
|
|
}
|
|
|
|
|
2017-03-16 22:18:50 -08:00
|
|
|
if (parent) {
|
2018-12-13 08:24:21 -05:00
|
|
|
#define x(_name, ...) inode_u->bi_##_name = parent->bi_##_name;
|
|
|
|
BCH_INODE_OPTS()
|
2018-12-13 06:01:30 -05:00
|
|
|
#undef x
|
2017-03-16 22:18:50 -08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-10-02 18:35:36 -04:00
|
|
|
void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
|
|
|
|
uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
|
|
|
|
struct bch_inode_unpacked *parent)
|
|
|
|
{
|
|
|
|
bch2_inode_init_early(c, inode_u);
|
|
|
|
bch2_inode_init_late(inode_u, bch2_current_time(c),
|
|
|
|
uid, gid, mode, rdev, parent);
|
|
|
|
}
|
|
|
|
|
2017-03-16 22:18:50 -08:00
|
|
|
static inline u32 bkey_generation(struct bkey_s_c k)
|
|
|
|
{
|
|
|
|
switch (k.k->type) {
|
2018-11-01 15:10:01 -04:00
|
|
|
case KEY_TYPE_inode:
|
2021-10-29 21:14:23 -04:00
|
|
|
case KEY_TYPE_inode_v2:
|
2017-03-16 22:18:50 -08:00
|
|
|
BUG();
|
2018-11-01 15:10:01 -04:00
|
|
|
case KEY_TYPE_inode_generation:
|
2017-03-16 22:18:50 -08:00
|
|
|
return le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation);
|
|
|
|
default:
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-03-16 00:28:17 -04:00
|
|
|
/*
|
|
|
|
* This just finds an empty slot:
|
|
|
|
*/
|
2021-08-30 15:18:31 -04:00
|
|
|
int bch2_inode_create(struct btree_trans *trans,
|
|
|
|
struct btree_iter *iter,
|
|
|
|
struct bch_inode_unpacked *inode_u,
|
|
|
|
u32 snapshot, u64 cpu)
|
2020-10-27 18:56:21 -04:00
|
|
|
{
|
|
|
|
struct bch_fs *c = trans->c;
|
|
|
|
struct bkey_s_c k;
|
2021-03-15 19:18:30 -04:00
|
|
|
u64 min, max, start, pos, *hint;
|
2021-05-14 20:02:44 -04:00
|
|
|
int ret = 0;
|
2021-05-27 20:20:20 -04:00
|
|
|
unsigned bits = (c->opts.inodes_32bit ? 31 : 63);
|
2020-11-02 23:51:33 -05:00
|
|
|
|
2021-05-27 20:20:20 -04:00
|
|
|
if (c->opts.shard_inode_numbers) {
|
|
|
|
bits -= c->inode_shard_bits;
|
2020-11-02 23:51:33 -05:00
|
|
|
|
2021-05-27 20:20:20 -04:00
|
|
|
min = (cpu << bits);
|
|
|
|
max = (cpu << bits) | ~(ULLONG_MAX << bits);
|
|
|
|
|
|
|
|
min = max_t(u64, min, BLOCKDEV_INODE_MAX);
|
|
|
|
hint = c->unused_inode_hints + cpu;
|
|
|
|
} else {
|
|
|
|
min = BLOCKDEV_INODE_MAX;
|
|
|
|
max = ~(ULLONG_MAX << bits);
|
|
|
|
hint = c->unused_inode_hints;
|
|
|
|
}
|
2020-11-02 23:51:33 -05:00
|
|
|
|
|
|
|
start = READ_ONCE(*hint);
|
|
|
|
|
|
|
|
if (start >= max || start < min)
|
|
|
|
start = min;
|
2021-03-15 19:18:30 -04:00
|
|
|
|
|
|
|
pos = start;
|
2021-08-30 15:18:31 -04:00
|
|
|
bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, POS(0, pos),
|
2024-04-07 18:05:34 -04:00
|
|
|
BTREE_ITER_all_snapshots|
|
|
|
|
BTREE_ITER_intent);
|
2020-11-02 23:51:33 -05:00
|
|
|
again:
|
2021-03-15 19:18:30 -04:00
|
|
|
while ((k = bch2_btree_iter_peek(iter)).k &&
|
|
|
|
!(ret = bkey_err(k)) &&
|
2022-11-24 03:12:22 -05:00
|
|
|
bkey_lt(k.k->p, POS(0, max))) {
|
2023-01-09 02:25:08 -05:00
|
|
|
if (pos < iter->pos.offset)
|
|
|
|
goto found_slot;
|
2017-03-16 22:18:50 -08:00
|
|
|
|
2020-11-02 23:51:33 -05:00
|
|
|
/*
|
2021-03-15 19:18:30 -04:00
|
|
|
* We don't need to iterate over keys in every snapshot once
|
|
|
|
* we've found just one:
|
2020-11-02 23:51:33 -05:00
|
|
|
*/
|
2021-03-15 19:18:30 -04:00
|
|
|
pos = iter->pos.offset + 1;
|
|
|
|
bch2_btree_iter_set_pos(iter, POS(0, pos));
|
|
|
|
}
|
|
|
|
|
2023-01-09 02:25:08 -05:00
|
|
|
if (!ret && pos < max)
|
|
|
|
goto found_slot;
|
2020-10-27 18:56:21 -04:00
|
|
|
|
2021-03-15 19:18:30 -04:00
|
|
|
if (!ret && start == min)
|
2022-09-18 17:10:33 -04:00
|
|
|
ret = -BCH_ERR_ENOSPC_inode_create;
|
2020-10-27 18:56:21 -04:00
|
|
|
|
2021-03-15 19:18:30 -04:00
|
|
|
if (ret) {
|
2021-08-30 15:18:31 -04:00
|
|
|
bch2_trans_iter_exit(trans, iter);
|
|
|
|
return ret;
|
2020-11-02 23:51:33 -05:00
|
|
|
}
|
2017-03-16 22:18:50 -08:00
|
|
|
|
2021-03-15 19:18:30 -04:00
|
|
|
/* Retry from start */
|
|
|
|
pos = start = min;
|
|
|
|
bch2_btree_iter_set_pos(iter, POS(0, pos));
|
|
|
|
goto again;
|
2020-11-02 23:51:33 -05:00
|
|
|
found_slot:
|
2021-03-15 19:18:30 -04:00
|
|
|
bch2_btree_iter_set_pos(iter, SPOS(0, pos, snapshot));
|
|
|
|
k = bch2_btree_iter_peek_slot(iter);
|
|
|
|
ret = bkey_err(k);
|
|
|
|
if (ret) {
|
2021-08-30 15:18:31 -04:00
|
|
|
bch2_trans_iter_exit(trans, iter);
|
|
|
|
return ret;
|
2021-03-15 19:18:30 -04:00
|
|
|
}
|
|
|
|
|
2020-11-02 23:51:33 -05:00
|
|
|
*hint = k.k->p.offset;
|
|
|
|
inode_u->bi_inum = k.k->p.offset;
|
|
|
|
inode_u->bi_generation = bkey_generation(k);
|
2021-08-30 15:18:31 -04:00
|
|
|
return 0;
|
2017-03-16 22:18:50 -08:00
|
|
|
}
|
|
|
|
|
2021-03-16 00:28:17 -04:00
|
|
|
static int bch2_inode_delete_keys(struct btree_trans *trans,
|
|
|
|
subvol_inum inum, enum btree_id id)
|
|
|
|
{
|
2021-12-27 18:25:23 -05:00
|
|
|
struct btree_iter iter;
|
|
|
|
struct bkey_s_c k;
|
|
|
|
struct bkey_i delete;
|
2023-09-27 14:44:56 -04:00
|
|
|
struct bpos end = POS(inum.inum, U64_MAX);
|
2021-12-27 18:25:23 -05:00
|
|
|
u32 snapshot;
|
2021-03-16 00:28:17 -04:00
|
|
|
int ret = 0;
|
|
|
|
|
2021-12-27 18:25:23 -05:00
|
|
|
/*
|
2023-01-06 06:29:04 -05:00
|
|
|
* We're never going to be deleting partial extents, no need to use an
|
|
|
|
* extent iterator:
|
2021-12-27 18:25:23 -05:00
|
|
|
*/
|
|
|
|
bch2_trans_iter_init(trans, &iter, id, POS(inum.inum, 0),
|
2024-04-07 18:05:34 -04:00
|
|
|
BTREE_ITER_intent);
|
2021-03-16 00:28:17 -04:00
|
|
|
|
2021-12-27 18:25:23 -05:00
|
|
|
while (1) {
|
2021-03-16 00:28:17 -04:00
|
|
|
bch2_trans_begin(trans);
|
|
|
|
|
|
|
|
ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
|
|
|
|
if (ret)
|
2021-12-27 18:25:23 -05:00
|
|
|
goto err;
|
2021-03-16 00:28:17 -04:00
|
|
|
|
2021-12-27 18:25:23 -05:00
|
|
|
bch2_btree_iter_set_snapshot(&iter, snapshot);
|
2021-03-16 00:28:17 -04:00
|
|
|
|
2023-09-27 14:44:56 -04:00
|
|
|
k = bch2_btree_iter_peek_upto(&iter, end);
|
2021-03-16 00:28:17 -04:00
|
|
|
ret = bkey_err(k);
|
|
|
|
if (ret)
|
|
|
|
goto err;
|
|
|
|
|
2022-03-11 12:31:52 -05:00
|
|
|
if (!k.k)
|
2021-12-27 18:25:23 -05:00
|
|
|
break;
|
|
|
|
|
2021-03-16 00:28:17 -04:00
|
|
|
bkey_init(&delete.k);
|
|
|
|
delete.k.p = iter.pos;
|
|
|
|
|
2024-04-07 18:05:34 -04:00
|
|
|
if (iter.flags & BTREE_ITER_is_extents)
|
2023-09-27 14:44:56 -04:00
|
|
|
bch2_key_resize(&delete.k,
|
|
|
|
bpos_min(end, k.k->p).offset -
|
|
|
|
iter.pos.offset);
|
|
|
|
|
2021-03-16 00:28:17 -04:00
|
|
|
ret = bch2_trans_update(trans, &iter, &delete, 0) ?:
|
2021-12-27 18:25:23 -05:00
|
|
|
bch2_trans_commit(trans, NULL, NULL,
|
2023-11-11 16:31:50 -05:00
|
|
|
BCH_TRANS_COMMIT_no_enospc);
|
2021-03-16 00:28:17 -04:00
|
|
|
err:
|
2022-07-17 23:06:38 -04:00
|
|
|
if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
|
2021-12-27 18:25:23 -05:00
|
|
|
break;
|
2021-03-16 00:28:17 -04:00
|
|
|
}
|
|
|
|
|
2021-12-27 18:25:23 -05:00
|
|
|
bch2_trans_iter_exit(trans, &iter);
|
2021-03-16 00:28:17 -04:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2022-01-12 02:13:21 -05:00
|
|
|
/*
 * Delete inode @inum: first its extents, xattrs and dirents, then the inode
 * key itself, which is replaced by an inode_generation key carrying the old
 * generation number plus one. Finally calls
 * delete_ancestor_snapshot_inodes() for the deleted position.
 *
 * Returns 0 on success or a negative error code; -EIO if the inode key was
 * not found.
 */
int bch2_inode_rm(struct bch_fs *c, subvol_inum inum)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter = { NULL };
	struct bkey_i_inode_generation delete;
	struct bch_inode_unpacked inode_u;
	struct bkey_s_c k;
	u32 snapshot;
	int ret;

	/*
	 * If this was a directory, there shouldn't be any real dirents left -
	 * but there could be whiteouts (from hash collisions) that we should
	 * delete:
	 *
	 * XXX: the dirent code ideally would delete whiteouts when they're no
	 * longer needed
	 */
	ret = bch2_inode_delete_keys(trans, inum, BTREE_ID_extents) ?:
	      bch2_inode_delete_keys(trans, inum, BTREE_ID_xattrs) ?:
	      bch2_inode_delete_keys(trans, inum, BTREE_ID_dirents);
	if (ret)
		goto err;
retry:
	bch2_trans_begin(trans);

	ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
	if (ret)
		goto err;

	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
			       SPOS(0, inum.inum, snapshot),
			       BTREE_ITER_intent|BTREE_ITER_cached);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (!bkey_is_inode(k.k)) {
		bch2_fs_inconsistent(c,
				     "inode %llu:%u not found when deleting",
				     inum.inum, snapshot);
		ret = -EIO;
		goto err;
	}

	bch2_inode_unpack(k, &inode_u);

	/* Replace the inode with a generation key, bumping the generation */
	bkey_inode_generation_init(&delete.k_i);
	delete.k.p		= iter.pos;
	delete.v.bi_generation	= cpu_to_le32(inode_u.bi_generation + 1);

	ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
	      bch2_trans_commit(trans, NULL, NULL,
				BCH_TRANS_COMMIT_no_enospc);
err:
	bch2_trans_iter_exit(trans, &iter);
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		goto retry;

	if (!ret)
		ret = delete_ancestor_snapshot_inodes(trans, SPOS(0, inum.inum, snapshot));

	bch2_trans_put(trans);
	return ret;
}
|
|
|
|
|
2023-09-10 22:05:50 -04:00
|
|
|
int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *trans,
|
|
|
|
subvol_inum inum,
|
|
|
|
struct bch_inode_unpacked *inode)
|
|
|
|
{
|
|
|
|
struct btree_iter iter;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
ret = bch2_inode_peek_nowarn(trans, &iter, inode, inum, 0);
|
|
|
|
if (!ret)
|
|
|
|
bch2_trans_iter_exit(trans, &iter);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2021-11-06 00:03:40 -04:00
|
|
|
int bch2_inode_find_by_inum_trans(struct btree_trans *trans,
|
|
|
|
subvol_inum inum,
|
|
|
|
struct bch_inode_unpacked *inode)
|
2017-03-16 22:18:50 -08:00
|
|
|
{
|
2021-03-16 00:28:17 -04:00
|
|
|
struct btree_iter iter;
|
2019-10-25 19:06:26 -04:00
|
|
|
int ret;
|
2017-03-16 22:18:50 -08:00
|
|
|
|
2021-03-16 00:28:17 -04:00
|
|
|
ret = bch2_inode_peek(trans, &iter, inode, inum, 0);
|
|
|
|
if (!ret)
|
|
|
|
bch2_trans_iter_exit(trans, &iter);
|
2019-03-31 17:37:30 -04:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2021-03-16 00:28:17 -04:00
|
|
|
/*
 * Convenience wrapper: runs bch2_inode_find_by_inum_trans() via
 * bch2_trans_do() for callers that only have the filesystem, not a
 * transaction.
 */
int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum,
			    struct bch_inode_unpacked *inode)
{
	int ret = bch2_trans_do(c, bch2_inode_find_by_inum_trans(trans, inum, inode));

	return ret;
}
|
2022-06-23 18:26:01 -04:00
|
|
|
|
|
|
|
int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi)
|
|
|
|
{
|
2023-11-02 11:42:48 -04:00
|
|
|
if (bi->bi_flags & BCH_INODE_unlinked)
|
|
|
|
bi->bi_flags &= ~BCH_INODE_unlinked;
|
2022-06-23 18:26:01 -04:00
|
|
|
else {
|
|
|
|
if (bi->bi_nlink == U32_MAX)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
bi->bi_nlink++;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
void bch2_inode_nlink_dec(struct btree_trans *trans, struct bch_inode_unpacked *bi)
|
|
|
|
{
|
2023-11-02 11:42:48 -04:00
|
|
|
if (bi->bi_nlink && (bi->bi_flags & BCH_INODE_unlinked)) {
|
2022-06-23 18:26:01 -04:00
|
|
|
bch2_trans_inconsistent(trans, "inode %llu unlinked but link count nonzero",
|
|
|
|
bi->bi_inum);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2023-11-02 11:42:48 -04:00
|
|
|
if (bi->bi_flags & BCH_INODE_unlinked) {
|
2022-06-23 18:26:01 -04:00
|
|
|
bch2_trans_inconsistent(trans, "inode %llu link count underflow", bi->bi_inum);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (bi->bi_nlink)
|
|
|
|
bi->bi_nlink--;
|
|
|
|
else
|
2023-11-02 11:42:48 -04:00
|
|
|
bi->bi_flags |= BCH_INODE_unlinked;
|
2022-06-23 18:26:01 -04:00
|
|
|
}
|
2022-11-23 20:28:15 -05:00
|
|
|
|
|
|
|
struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *inode)
|
|
|
|
{
|
|
|
|
struct bch_opts ret = { 0 };
|
|
|
|
#define x(_name, _bits) \
|
|
|
|
if (inode->bi_##_name) \
|
|
|
|
opt_set(ret, _name, inode->bi_##_name - 1);
|
|
|
|
BCH_INODE_OPTS()
|
|
|
|
#undef x
|
|
|
|
return ret;
|
|
|
|
}
|
2022-11-23 20:14:55 -05:00
|
|
|
|
|
|
|
void bch2_inode_opts_get(struct bch_io_opts *opts, struct bch_fs *c,
|
|
|
|
struct bch_inode_unpacked *inode)
|
|
|
|
{
|
|
|
|
#define x(_name, _bits) opts->_name = inode_opt_get(c, inode, _name);
|
|
|
|
BCH_INODE_OPTS()
|
|
|
|
#undef x
|
bcachefs: Nocow support
This adds support for nocow mode, where we do writes in-place when
possible. Patch components:
- New boolean filesystem and inode option, nocow: note that when nocow
is enabled, data checksumming and compression are implicitly disabled
- To prevent in-place writes from racing with data moves
(data_update.c) or bucket reuse (i.e. a bucket being reused and
re-allocated while a nocow write is in flight, we have a new locking
mechanism.
Buckets can be locked for either data update or data move, using a
fixed size hash table of two_state_shared locks. We don't have any
chaining, meaning updates and moves to different buckets that hash to
the same lock will wait unnecessarily - we'll want to watch for this
becoming an issue.
- The allocator path also needs to check for in-place writes in flight
to a given bucket before giving it out: thus we add another counter
to bucket_alloc_state so we can track this.
- Fsync now may need to issue cache flushes to block devices instead of
flushing the journal. We add a device bitmask to bch_inode_info,
ei_devs_need_flush, which tracks devices that need to have flushes
issued - note that this will lead to unnecessary flushes when other
codepaths have already issued flushes, we may want to replace this with
a sequence number.
- New nocow write path: look up extents, and if they're writable write
to them - otherwise fall back to the normal COW write path.
XXX: switch to sequence numbers instead of bitmask for devs needing
journal flush
XXX: ei_quota_lock being a mutex means bch2_nocow_write_done() needs to
run in process context - see if we can improve this
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2022-11-02 17:12:00 -04:00
|
|
|
|
|
|
|
if (opts->nocow)
|
|
|
|
opts->compression = opts->background_compression = opts->data_checksum = opts->erasure_code = 0;
|
2022-11-23 20:14:55 -05:00
|
|
|
}
|
2023-07-21 03:20:08 -04:00
|
|
|
|
2023-10-20 14:05:31 -04:00
|
|
|
/*
 * Look up inode @inum (retrying on transaction restart via
 * lockrestart_do()) and fill @opts with its IO options.
 *
 * Returns 0 on success or the error from the inode lookup, in which case
 * @opts is not touched.
 */
int bch2_inum_opts_get(struct btree_trans *trans, subvol_inum inum, struct bch_io_opts *opts)
{
	struct bch_inode_unpacked inode;
	int ret;

	ret = lockrestart_do(trans, bch2_inode_find_by_inum_trans(trans, inum, &inode));
	if (!ret)
		bch2_inode_opts_get(opts, trans->c, &inode);

	return ret;
}
|
|
|
|
|
bcachefs: bcachefs_metadata_version_inode_has_child_snapshots
There's an inherent race in taking a snapshot while an unlinked file is
open, and then reattaching it in the child snapshot.
In the interior snapshot node the file will appear unlinked, as though
it should be deleted - it's not referenced by anything in that snapshot
- but we can't delete it, because the file data is referenced by the
child snapshot.
This was being handled incorrectly with
propagate_key_to_snapshot_leaves() - but that doesn't resolve the
fundamental inconsistency of "this file looks like it should be deleted
according to normal rules, but - ".
To fix this, we need to fix the rule for when an inode is deleted. The
previous rule, ignoring snapshots (there was no well-defined rule
for inodes with snapshots) was:
Unlinked, non open files are deleted, either at recovery time or
during online fsck
The new rule is:
Unlinked, non open files, that do not exist in child snapshots, are
deleted.
To make this work transactionally, we add a new inode flag,
BCH_INODE_has_child_snapshot; it overrides BCH_INODE_unlinked when
considering whether to delete an inode, or put it on the deleted list.
For transactional consistency, clearing it is handled by the inode trigger:
when deleting an inode we check if there are parent inodes which can now
have the BCH_INODE_has_child_snapshot flag cleared.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2024-09-29 22:11:37 -04:00
|
|
|
/*
 * Delete inode @inum in snapshot @snapshot together with its extents, dirents
 * and xattrs in that snapshot.
 *
 * The inode key itself is not simply removed: it is overwritten with an
 * inode_generation key holding the old bi_generation + 1.
 *
 * Return value: a negative error code on failure. On success this returns
 * -BCH_ERR_transaction_restart_nested instead of 0, i.e. the result is never
 * 0 and callers must treat that particular code as "deleted".
 */
static noinline int __bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
{
	struct bch_fs *c = trans->c;
	/* Starts out zeroed because the err path exits it even if it was never set up */
	struct btree_iter iter = { NULL };
	struct bkey_i_inode_generation delete;
	struct bch_inode_unpacked inode_u;
	struct bkey_s_c k;
	int ret;

	/* Drop everything keyed by this inode; redo the whole chain on a nested restart */
	do {
		ret   = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
						      SPOS(inum, 0, snapshot),
						      SPOS(inum, U64_MAX, snapshot),
						      0, NULL) ?:
			bch2_btree_delete_range_trans(trans, BTREE_ID_dirents,
						      SPOS(inum, 0, snapshot),
						      SPOS(inum, U64_MAX, snapshot),
						      0, NULL) ?:
			bch2_btree_delete_range_trans(trans, BTREE_ID_xattrs,
						      SPOS(inum, 0, snapshot),
						      SPOS(inum, U64_MAX, snapshot),
						      0, NULL);
	} while (ret == -BCH_ERR_transaction_restart_nested);
	if (ret)
		goto err;
retry:
	bch2_trans_begin(trans);

	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
			       SPOS(0, inum, snapshot), BTREE_ITER_intent);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (!bkey_is_inode(k.k)) {
		bch2_fs_inconsistent(c,
				     "inode %llu:%u not found when deleting",
				     inum, snapshot);
		ret = -EIO;
		goto err;
	}

	/*
	 * Unpacking can fail; don't derive the new generation number from an
	 * inode_u that was never filled in.
	 */
	ret = bch2_inode_unpack(k, &inode_u);
	if (ret)
		goto err;

	/* Subvolume root? */
	if (inode_u.bi_subvol)
		bch_warn(c, "deleting inode %llu marked as unlinked, but also a subvolume root!?", inode_u.bi_inum);

	/* Replace the inode with a generation marker one past the old generation */
	bkey_inode_generation_init(&delete.k_i);
	delete.k.p = iter.pos;
	delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1);

	ret   = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
		bch2_trans_commit(trans, NULL, NULL,
				  BCH_TRANS_COMMIT_no_enospc);
err:
	bch2_trans_iter_exit(trans, &iter);
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		goto retry;

	/* Success is reported as a nested restart, never as 0 */
	return ret ?: -BCH_ERR_transaction_restart_nested;
}
|
2023-07-17 00:56:29 -04:00
|
|
|
|
bcachefs: bcachefs_metadata_version_inode_has_child_snapshots
There's an inherent race in taking a snapshot while an unlinked file is
open, and then reattaching it in the child snapshot.
In the interior snapshot node the file will appear unlinked, as though
it should be deleted - it's not referenced by anything in that snapshot
- but we can't delete it, because the file data is referenced by the
child snapshot.
This was being handled incorrectly with
propagate_key_to_snapshot_leaves() - but that doesn't resolve the
fundamental inconsistency of "this file looks like it should be deleted
according to normal rules, but - ".
To fix this, we need to fix the rule for when an inode is deleted. The
previous rule, ignoring snapshots (there was no well-defined rule
for with snapshots) was:
Unlinked, non open files are deleted, either at recovery time or
during online fsck
The new rule is:
Unlinked, non open files, that do not exist in child snapshots, are
deleted.
To make this work transactionally, we add a new inode flag,
BCH_INODE_has_child_snapshot; it overrides BCH_INODE_unlinked when
considering whether to delete an inode, or put it on the deleted list.
For transactional consistency, clearing it is handled by the inode trigger:
when deleting an inode we check if there are parent inodes which can now
have the BCH_INODE_has_child_snapshot flag cleared.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2024-09-29 22:11:37 -04:00
|
|
|
/*
|
|
|
|
* After deleting an inode, there may be versions in older snapshots that should
|
|
|
|
* also be deleted - if they're not referenced by sibling snapshots and not open
|
|
|
|
* in other subvolumes:
|
|
|
|
*/
|
|
|
|
static int delete_ancestor_snapshot_inodes(struct btree_trans *trans, struct bpos pos)
|
|
|
|
{
|
|
|
|
struct btree_iter iter;
|
|
|
|
struct bkey_s_c k;
|
|
|
|
int ret;
|
|
|
|
next_parent:
|
|
|
|
ret = lockrestart_do(trans,
|
|
|
|
bkey_err(k = bch2_inode_get_iter_snapshot_parent(trans, &iter, pos, 0)));
|
|
|
|
if (ret || !k.k)
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
bool unlinked = bkey_is_unlinked_inode(k);
|
|
|
|
pos = k.k->p;
|
|
|
|
bch2_trans_iter_exit(trans, &iter);
|
|
|
|
|
|
|
|
if (!unlinked)
|
|
|
|
return 0;
|
|
|
|
|
2024-10-02 21:23:41 -04:00
|
|
|
ret = lockrestart_do(trans, bch2_inode_or_descendents_is_open(trans, pos));
|
|
|
|
if (ret)
|
|
|
|
return ret < 0 ? ret : 0;
|
bcachefs: bcachefs_metadata_version_inode_has_child_snapshots
There's an inherent race in taking a snapshot while an unlinked file is
open, and then reattaching it in the child snapshot.
In the interior snapshot node the file will appear unlinked, as though
it should be deleted - it's not referenced by anything in that snapshot
- but we can't delete it, because the file data is referenced by the
child snapshot.
This was being handled incorrectly with
propagate_key_to_snapshot_leaves() - but that doesn't resolve the
fundamental inconsistency of "this file looks like it should be deleted
according to normal rules, but - ".
To fix this, we need to fix the rule for when an inode is deleted. The
previous rule, ignoring snapshots (there was no well-defined rule
for with snapshots) was:
Unlinked, non open files are deleted, either at recovery time or
during online fsck
The new rule is:
Unlinked, non open files, that do not exist in child snapshots, are
deleted.
To make this work transactionally, we add a new inode flag,
BCH_INODE_has_child_snapshot; it overrides BCH_INODE_unlinked when
considering whether to delete an inode, or put it on the deleted list.
For transactional consistency, clearing it handled by the inode trigger:
when deleting an inode we check if there are parent inodes which can now
have the BCH_INODE_has_child_snapshot flag cleared.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2024-09-29 22:11:37 -04:00
|
|
|
|
|
|
|
ret = __bch2_inode_rm_snapshot(trans, pos.offset, pos.snapshot);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
goto next_parent;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Delete inode @inum in @snapshot; if that returns 0, go on to delete
 * now-unneeded versions of the inode in ancestor snapshots.
 *
 * NOTE(review): __bch2_inode_rm_snapshot() as written in this file ends with
 * "return ret ?: -BCH_ERR_transaction_restart_nested", i.e. it never returns
 * 0, so the ancestor walk below looks unreachable from here - confirm whether
 * that is intended before relying on it.
 */
int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
{
	int ret = __bch2_inode_rm_snapshot(trans, inum, snapshot);

	if (ret)
		return ret;

	return delete_ancestor_snapshot_inodes(trans, SPOS(0, inum, snapshot));
}
|
|
|
|
|
2023-11-02 15:28:15 -04:00
|
|
|
static int may_delete_deleted_inode(struct btree_trans *trans,
|
|
|
|
struct btree_iter *iter,
|
|
|
|
struct bpos pos,
|
|
|
|
bool *need_another_pass)
|
2023-07-17 00:56:29 -04:00
|
|
|
{
|
|
|
|
struct bch_fs *c = trans->c;
|
2023-11-02 15:28:15 -04:00
|
|
|
struct btree_iter inode_iter;
|
2023-07-17 00:56:29 -04:00
|
|
|
struct bkey_s_c k;
|
|
|
|
struct bch_inode_unpacked inode;
|
bcachefs: bcachefs_metadata_version_inode_has_child_snapshots
There's an inherent race in taking a snapshot while an unlinked file is
open, and then reattaching it in the child snapshot.
In the interior snapshot node the file will appear unlinked, as though
it should be deleted - it's not referenced by anything in that snapshot
- but we can't delete it, because the file data is referenced by the
child snapshot.
This was being handled incorrectly with
propagate_key_to_snapshot_leaves() - but that doesn't resolve the
fundamental inconsistency of "this file looks like it should be deleted
according to normal rules, but - ".
To fix this, we need to fix the rule for when an inode is deleted. The
previous rule, ignoring snapshots (there was no well-defined rule
for with snapshots) was:
Unlinked, non open files are deleted, either at recovery time or
during online fsck
The new rule is:
Unlinked, non open files, that do not exist in child snapshots, are
deleted.
To make this work transactionally, we add a new inode flag,
BCH_INODE_has_child_snapshot; it overrides BCH_INODE_unlinked when
considering whether to delete an inode, or put it on the deleted list.
For transactional consistency, clearing it handled by the inode trigger:
when deleting an inode we check if there are parent inodes which can now
have the BCH_INODE_has_child_snapshot flag cleared.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2024-09-29 22:11:37 -04:00
|
|
|
struct printbuf buf = PRINTBUF;
|
2023-07-17 00:56:29 -04:00
|
|
|
int ret;
|
|
|
|
|
2024-04-07 18:05:34 -04:00
|
|
|
k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, pos, BTREE_ITER_cached);
|
2023-07-17 00:56:29 -04:00
|
|
|
ret = bkey_err(k);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
ret = bkey_is_inode(k.k) ? 0 : -BCH_ERR_ENOENT_inode;
|
2024-02-08 21:10:32 -05:00
|
|
|
if (fsck_err_on(!bkey_is_inode(k.k),
|
|
|
|
trans, deleted_inode_missing,
|
2023-07-17 00:56:29 -04:00
|
|
|
"nonexistent inode %llu:%u in deleted_inodes btree",
|
|
|
|
pos.offset, pos.snapshot))
|
|
|
|
goto delete;
|
|
|
|
|
|
|
|
ret = bch2_inode_unpack(k, &inode);
|
|
|
|
if (ret)
|
2023-11-02 15:28:15 -04:00
|
|
|
goto out;
|
2023-07-17 00:56:29 -04:00
|
|
|
|
2023-12-07 12:39:13 -05:00
|
|
|
if (S_ISDIR(inode.bi_mode)) {
|
2024-02-13 16:20:04 +08:00
|
|
|
ret = bch2_empty_dir_snapshot(trans, pos.offset, 0, pos.snapshot);
|
|
|
|
if (fsck_err_on(bch2_err_matches(ret, ENOTEMPTY),
|
2024-02-08 21:10:32 -05:00
|
|
|
trans, deleted_inode_is_dir,
|
2023-12-07 12:39:13 -05:00
|
|
|
"non empty directory %llu:%u in deleted_inodes btree",
|
|
|
|
pos.offset, pos.snapshot))
|
|
|
|
goto delete;
|
|
|
|
if (ret)
|
|
|
|
goto out;
|
|
|
|
}
|
2023-08-12 12:34:47 -04:00
|
|
|
|
2024-02-08 21:10:32 -05:00
|
|
|
if (fsck_err_on(!(inode.bi_flags & BCH_INODE_unlinked),
|
|
|
|
trans, deleted_inode_not_unlinked,
|
2023-07-17 00:56:29 -04:00
|
|
|
"non-deleted inode %llu:%u in deleted_inodes btree",
|
|
|
|
pos.offset, pos.snapshot))
|
|
|
|
goto delete;
|
|
|
|
|
bcachefs: bcachefs_metadata_version_inode_has_child_snapshots
There's an inherent race in taking a snapshot while an unlinked file is
open, and then reattaching it in the child snapshot.
In the interior snapshot node the file will appear unlinked, as though
it should be deleted - it's not referenced by anything in that snapshot
- but we can't delete it, because the file data is referenced by the
child snapshot.
This was being handled incorrectly with
propagate_key_to_snapshot_leaves() - but that doesn't resolve the
fundamental inconsistency of "this file looks like it should be deleted
according to normal rules, but - ".
To fix this, we need to fix the rule for when an inode is deleted. The
previous rule, ignoring snapshots (there was no well-defined rule
for with snapshots) was:
Unlinked, non open files are deleted, either at recovery time or
during online fsck
The new rule is:
Unlinked, non open files, that do not exist in child snapshots, are
deleted.
To make this work transactionally, we add a new inode flag,
BCH_INODE_has_child_snapshot; it overrides BCH_INODE_unlinked when
considering whether to delete an inode, or put it on the deleted list.
For transactional consistency, clearing it handled by the inode trigger:
when deleting an inode we check if there are parent inodes which can now
have the BCH_INODE_has_child_snapshot flag cleared.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2024-09-29 22:11:37 -04:00
|
|
|
if (fsck_err_on(inode.bi_flags & BCH_INODE_has_child_snapshot,
|
|
|
|
trans, deleted_inode_has_child_snapshots,
|
|
|
|
"inode with child snapshots %llu:%u in deleted_inodes btree",
|
|
|
|
pos.offset, pos.snapshot))
|
|
|
|
goto delete;
|
|
|
|
|
|
|
|
ret = bch2_inode_has_child_snapshots(trans, k.k->p);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
if (ret) {
|
|
|
|
if (fsck_err(trans, inode_has_child_snapshots_wrong,
|
|
|
|
"inode has_child_snapshots flag wrong (should be set)\n%s",
|
|
|
|
(printbuf_reset(&buf),
|
|
|
|
bch2_inode_unpacked_to_text(&buf, &inode),
|
|
|
|
buf.buf))) {
|
|
|
|
inode.bi_flags |= BCH_INODE_has_child_snapshot;
|
|
|
|
ret = __bch2_fsck_write_inode(trans, &inode);
|
|
|
|
if (ret)
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
goto delete;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
2024-09-26 16:19:58 -04:00
|
|
|
if (test_bit(BCH_FS_clean_recovery, &c->flags) &&
|
2024-02-08 21:10:32 -05:00
|
|
|
!fsck_err(trans, deleted_inode_but_clean,
|
2023-11-02 15:28:15 -04:00
|
|
|
"filesystem marked as clean but have deleted inode %llu:%u",
|
|
|
|
pos.offset, pos.snapshot)) {
|
|
|
|
ret = 0;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = 1;
|
|
|
|
out:
|
2023-07-17 00:56:29 -04:00
|
|
|
fsck_err:
|
2023-11-02 15:28:15 -04:00
|
|
|
bch2_trans_iter_exit(trans, &inode_iter);
|
bcachefs: bcachefs_metadata_version_inode_has_child_snapshots
There's an inherent race in taking a snapshot while an unlinked file is
open, and then reattaching it in the child snapshot.
In the interior snapshot node the file will appear unlinked, as though
it should be deleted - it's not referenced by anything in that snapshot
- but we can't delete it, because the file data is referenced by the
child snapshot.
This was being handled incorrectly with
propagate_key_to_snapshot_leaves() - but that doesn't resolve the
fundamental inconsistency of "this file looks like it should be deleted
according to normal rules, but - ".
To fix this, we need to fix the rule for when an inode is deleted. The
previous rule, ignoring snapshots (there was no well-defined rule
for with snapshots) was:
Unlinked, non open files are deleted, either at recovery time or
during online fsck
The new rule is:
Unlinked, non open files, that do not exist in child snapshots, are
deleted.
To make this work transactionally, we add a new inode flag,
BCH_INODE_has_child_snapshot; it overrides BCH_INODE_unlinked when
considering whether to delete an inode, or put it on the deleted list.
For transactional consistency, clearing it handled by the inode trigger:
when deleting an inode we check if there are parent inodes which can now
have the BCH_INODE_has_child_snapshot flag cleared.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2024-09-29 22:11:37 -04:00
|
|
|
printbuf_exit(&buf);
|
2023-07-17 00:56:29 -04:00
|
|
|
return ret;
|
|
|
|
delete:
|
2024-02-08 19:10:19 -05:00
|
|
|
ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, pos, false);
|
2023-11-02 15:28:15 -04:00
|
|
|
goto out;
|
2023-07-17 00:56:29 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
int bch2_delete_dead_inodes(struct bch_fs *c)
|
|
|
|
{
|
2023-09-12 17:16:02 -04:00
|
|
|
struct btree_trans *trans = bch2_trans_get(c);
|
2023-11-02 15:28:15 -04:00
|
|
|
bool need_another_pass;
|
2023-07-17 00:56:29 -04:00
|
|
|
int ret;
|
2023-11-02 15:28:15 -04:00
|
|
|
again:
|
2024-03-10 23:00:23 -04:00
|
|
|
/*
|
|
|
|
* if we ran check_inodes() unlinked inodes will have already been
|
|
|
|
* cleaned up but the write buffer will be out of sync; therefore we
|
|
|
|
* alway need a write buffer flush
|
|
|
|
*/
|
|
|
|
ret = bch2_btree_write_buffer_flush_sync(trans);
|
|
|
|
if (ret)
|
|
|
|
goto err;
|
|
|
|
|
2023-11-02 15:28:15 -04:00
|
|
|
need_another_pass = false;
|
2023-07-17 00:56:29 -04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Weird transaction restart handling here because on successful delete,
|
|
|
|
* bch2_inode_rm_snapshot() will return a nested transaction restart,
|
|
|
|
* but we can't retry because the btree write buffer won't have been
|
|
|
|
* flushed and we'd spin:
|
|
|
|
*/
|
2023-12-07 23:28:26 -05:00
|
|
|
ret = for_each_btree_key_commit(trans, iter, BTREE_ID_deleted_inodes, POS_MIN,
|
2024-04-07 18:05:34 -04:00
|
|
|
BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
|
2023-12-07 23:28:26 -05:00
|
|
|
NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
|
|
|
|
ret = may_delete_deleted_inode(trans, &iter, k.k->p, &need_another_pass);
|
|
|
|
if (ret > 0) {
|
2023-11-02 15:28:15 -04:00
|
|
|
bch_verbose(c, "deleting unlinked inode %llu:%u", k.k->p.offset, k.k->p.snapshot);
|
|
|
|
|
2023-09-12 17:16:02 -04:00
|
|
|
ret = bch2_inode_rm_snapshot(trans, k.k->p.offset, k.k->p.snapshot);
|
2023-12-07 23:28:26 -05:00
|
|
|
/*
|
|
|
|
* We don't want to loop here: a transaction restart
|
|
|
|
* error here means we handled a transaction restart and
|
|
|
|
* we're actually done, but if we loop we'll retry the
|
|
|
|
* same key because the write buffer hasn't been flushed
|
|
|
|
* yet
|
|
|
|
*/
|
|
|
|
if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
|
|
|
|
ret = 0;
|
|
|
|
continue;
|
|
|
|
}
|
2023-07-17 00:56:29 -04:00
|
|
|
}
|
2023-12-07 23:28:26 -05:00
|
|
|
|
|
|
|
ret;
|
|
|
|
}));
|
2023-11-02 15:28:15 -04:00
|
|
|
|
2024-03-10 23:00:23 -04:00
|
|
|
if (!ret && need_another_pass)
|
2023-11-02 15:28:15 -04:00
|
|
|
goto again;
|
2023-07-17 00:56:29 -04:00
|
|
|
err:
|
2023-09-12 17:16:02 -04:00
|
|
|
bch2_trans_put(trans);
|
2023-07-17 00:56:29 -04:00
|
|
|
return ret;
|
|
|
|
}
|