mirror of
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-09-18 22:14:16 +00:00
bcachefs: Fixes for going RO
Now that interior btree updates are fully transactional, we don't need to write out alloc info in a loop. However, interior btree updates do put more things in the journal, so we still need a loop in the RO sequence.

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
This commit is contained in:
parent
baeed3c3c0
commit
039fc4c522
5 changed files with 64 additions and 37 deletions
|
@ -869,6 +869,15 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans,
|
||||||
if (!invalidating_cached_data)
|
if (!invalidating_cached_data)
|
||||||
goto out;
|
goto out;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If the read-only path is trying to shut down, we can't be generating
|
||||||
|
* new btree updates:
|
||||||
|
*/
|
||||||
|
if (test_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags)) {
|
||||||
|
ret = 1;
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
|
||||||
BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8);
|
BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8);
|
||||||
|
|
||||||
bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b));
|
bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b));
|
||||||
|
@ -956,7 +965,7 @@ out:
|
||||||
percpu_up_read(&c->mark_lock);
|
percpu_up_read(&c->mark_lock);
|
||||||
}
|
}
|
||||||
|
|
||||||
return ret;
|
return ret < 0 ? ret : 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
|
static bool bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
|
||||||
|
|
|
@ -482,6 +482,7 @@ enum {
|
||||||
BCH_FS_ALLOC_CLEAN,
|
BCH_FS_ALLOC_CLEAN,
|
||||||
BCH_FS_ALLOCATOR_STARTED,
|
BCH_FS_ALLOCATOR_STARTED,
|
||||||
BCH_FS_ALLOCATOR_RUNNING,
|
BCH_FS_ALLOCATOR_RUNNING,
|
||||||
|
BCH_FS_ALLOCATOR_STOPPING,
|
||||||
BCH_FS_INITIAL_GC_DONE,
|
BCH_FS_INITIAL_GC_DONE,
|
||||||
BCH_FS_FSCK_DONE,
|
BCH_FS_FSCK_DONE,
|
||||||
BCH_FS_STARTED,
|
BCH_FS_STARTED,
|
||||||
|
|
|
@ -413,10 +413,12 @@ journal_get_next_pin(struct journal *j, u64 max_seq, u64 *seq)
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void journal_flush_pins(struct journal *j, u64 seq_to_flush,
|
/* returns true if we did work */
|
||||||
|
static bool journal_flush_pins(struct journal *j, u64 seq_to_flush,
|
||||||
unsigned min_nr)
|
unsigned min_nr)
|
||||||
{
|
{
|
||||||
struct journal_entry_pin *pin;
|
struct journal_entry_pin *pin;
|
||||||
|
bool ret = false;
|
||||||
u64 seq;
|
u64 seq;
|
||||||
|
|
||||||
lockdep_assert_held(&j->reclaim_lock);
|
lockdep_assert_held(&j->reclaim_lock);
|
||||||
|
@ -431,7 +433,10 @@ static void journal_flush_pins(struct journal *j, u64 seq_to_flush,
|
||||||
BUG_ON(j->flush_in_progress != pin);
|
BUG_ON(j->flush_in_progress != pin);
|
||||||
j->flush_in_progress = NULL;
|
j->flush_in_progress = NULL;
|
||||||
wake_up(&j->pin_flush_wait);
|
wake_up(&j->pin_flush_wait);
|
||||||
|
ret = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -523,7 +528,8 @@ void bch2_journal_reclaim_work(struct work_struct *work)
|
||||||
mutex_unlock(&j->reclaim_lock);
|
mutex_unlock(&j->reclaim_lock);
|
||||||
}
|
}
|
||||||
|
|
||||||
static int journal_flush_done(struct journal *j, u64 seq_to_flush)
|
static int journal_flush_done(struct journal *j, u64 seq_to_flush,
|
||||||
|
bool *did_work)
|
||||||
{
|
{
|
||||||
int ret;
|
int ret;
|
||||||
|
|
||||||
|
@ -533,7 +539,7 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush)
|
||||||
|
|
||||||
mutex_lock(&j->reclaim_lock);
|
mutex_lock(&j->reclaim_lock);
|
||||||
|
|
||||||
journal_flush_pins(j, seq_to_flush, 0);
|
*did_work = journal_flush_pins(j, seq_to_flush, 0);
|
||||||
|
|
||||||
spin_lock(&j->lock);
|
spin_lock(&j->lock);
|
||||||
/*
|
/*
|
||||||
|
@ -551,12 +557,17 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush)
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
void bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
|
bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
|
||||||
{
|
{
|
||||||
if (!test_bit(JOURNAL_STARTED, &j->flags))
|
bool did_work = false;
|
||||||
return;
|
|
||||||
|
|
||||||
closure_wait_event(&j->async_wait, journal_flush_done(j, seq_to_flush));
|
if (!test_bit(JOURNAL_STARTED, &j->flags))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
closure_wait_event(&j->async_wait,
|
||||||
|
journal_flush_done(j, seq_to_flush, &did_work));
|
||||||
|
|
||||||
|
return did_work;
|
||||||
}
|
}
|
||||||
|
|
||||||
int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
|
int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
|
||||||
|
|
|
@ -53,11 +53,11 @@ void bch2_journal_do_discards(struct journal *);
|
||||||
void bch2_journal_reclaim(struct journal *);
|
void bch2_journal_reclaim(struct journal *);
|
||||||
void bch2_journal_reclaim_work(struct work_struct *);
|
void bch2_journal_reclaim_work(struct work_struct *);
|
||||||
|
|
||||||
void bch2_journal_flush_pins(struct journal *, u64);
|
bool bch2_journal_flush_pins(struct journal *, u64);
|
||||||
|
|
||||||
static inline void bch2_journal_flush_all_pins(struct journal *j)
|
static inline bool bch2_journal_flush_all_pins(struct journal *j)
|
||||||
{
|
{
|
||||||
bch2_journal_flush_pins(j, U64_MAX);
|
return bch2_journal_flush_pins(j, U64_MAX);
|
||||||
}
|
}
|
||||||
|
|
||||||
int bch2_journal_flush_device_pins(struct journal *, int);
|
int bch2_journal_flush_device_pins(struct journal *, int);
|
||||||
|
|
|
@ -175,7 +175,7 @@ struct bch_fs *bch2_uuid_to_fs(__uuid_t uuid)
|
||||||
static void __bch2_fs_read_only(struct bch_fs *c)
|
static void __bch2_fs_read_only(struct bch_fs *c)
|
||||||
{
|
{
|
||||||
struct bch_dev *ca;
|
struct bch_dev *ca;
|
||||||
bool wrote;
|
bool wrote = false;
|
||||||
unsigned i, clean_passes = 0;
|
unsigned i, clean_passes = 0;
|
||||||
int ret;
|
int ret;
|
||||||
|
|
||||||
|
@ -200,39 +200,46 @@ static void __bch2_fs_read_only(struct bch_fs *c)
|
||||||
goto nowrote_alloc;
|
goto nowrote_alloc;
|
||||||
|
|
||||||
bch_verbose(c, "writing alloc info");
|
bch_verbose(c, "writing alloc info");
|
||||||
|
/*
|
||||||
|
* This should normally just be writing the bucket read/write clocks:
|
||||||
|
*/
|
||||||
|
ret = bch2_stripes_write(c, BTREE_INSERT_NOCHECK_RW, &wrote) ?:
|
||||||
|
bch2_alloc_write(c, BTREE_INSERT_NOCHECK_RW, &wrote);
|
||||||
|
bch_verbose(c, "writing alloc info complete");
|
||||||
|
|
||||||
|
if (ret && !test_bit(BCH_FS_EMERGENCY_RO, &c->flags))
|
||||||
|
bch2_fs_inconsistent(c, "error writing out alloc info %i", ret);
|
||||||
|
|
||||||
|
if (ret)
|
||||||
|
goto nowrote_alloc;
|
||||||
|
|
||||||
|
bch_verbose(c, "flushing journal and stopping allocators");
|
||||||
|
|
||||||
|
bch2_journal_flush_all_pins(&c->journal);
|
||||||
|
set_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags);
|
||||||
|
|
||||||
do {
|
do {
|
||||||
wrote = false;
|
clean_passes++;
|
||||||
|
|
||||||
ret = bch2_stripes_write(c, BTREE_INSERT_NOCHECK_RW, &wrote) ?:
|
if (bch2_journal_flush_all_pins(&c->journal))
|
||||||
bch2_alloc_write(c, BTREE_INSERT_NOCHECK_RW, &wrote);
|
clean_passes = 0;
|
||||||
|
|
||||||
if (ret && !test_bit(BCH_FS_EMERGENCY_RO, &c->flags))
|
|
||||||
bch2_fs_inconsistent(c, "error writing out alloc info %i", ret);
|
|
||||||
|
|
||||||
if (ret)
|
|
||||||
goto nowrote_alloc;
|
|
||||||
|
|
||||||
for_each_member_device(ca, c, i)
|
|
||||||
bch2_dev_allocator_quiesce(c, ca);
|
|
||||||
|
|
||||||
bch2_journal_flush_all_pins(&c->journal);
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* We need to explicitly wait on btree interior updates to complete
|
* In flight interior btree updates will generate more journal
|
||||||
* before stopping the journal, flushing all journal pins isn't
|
* updates and btree updates (alloc btree):
|
||||||
* sufficient, because in the BTREE_INTERIOR_UPDATING_ROOT case btree
|
|
||||||
* interior updates have to drop their journal pin before they're
|
|
||||||
* fully complete:
|
|
||||||
*/
|
*/
|
||||||
closure_wait_event(&c->btree_interior_update_wait,
|
if (bch2_btree_interior_updates_nr_pending(c)) {
|
||||||
!bch2_btree_interior_updates_nr_pending(c));
|
closure_wait_event(&c->btree_interior_update_wait,
|
||||||
|
!bch2_btree_interior_updates_nr_pending(c));
|
||||||
|
clean_passes = 0;
|
||||||
|
}
|
||||||
flush_work(&c->btree_interior_update_work);
|
flush_work(&c->btree_interior_update_work);
|
||||||
|
|
||||||
clean_passes = wrote ? 0 : clean_passes + 1;
|
if (bch2_journal_flush_all_pins(&c->journal))
|
||||||
|
clean_passes = 0;
|
||||||
} while (clean_passes < 2);
|
} while (clean_passes < 2);
|
||||||
|
bch_verbose(c, "flushing journal and stopping allocators complete");
|
||||||
|
|
||||||
bch_verbose(c, "writing alloc info complete");
|
|
||||||
set_bit(BCH_FS_ALLOC_CLEAN, &c->flags);
|
set_bit(BCH_FS_ALLOC_CLEAN, &c->flags);
|
||||||
nowrote_alloc:
|
nowrote_alloc:
|
||||||
closure_wait_event(&c->btree_interior_update_wait,
|
closure_wait_event(&c->btree_interior_update_wait,
|
||||||
|
@ -243,11 +250,10 @@ nowrote_alloc:
|
||||||
bch2_dev_allocator_stop(ca);
|
bch2_dev_allocator_stop(ca);
|
||||||
|
|
||||||
clear_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags);
|
clear_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags);
|
||||||
|
clear_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags);
|
||||||
|
|
||||||
bch2_fs_journal_stop(&c->journal);
|
bch2_fs_journal_stop(&c->journal);
|
||||||
|
|
||||||
/* XXX: mark super that alloc info is persistent */
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* the journal kicks off btree writes via reclaim - wait for in flight
|
* the journal kicks off btree writes via reclaim - wait for in flight
|
||||||
* writes after stopping journal:
|
* writes after stopping journal:
|
||||||
|
|
Loading…
Add table
Reference in a new issue