// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2023-2025 Christoph Hellwig.
 * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
 */
#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_trans.h"
#include "xfs_icache.h"
#include "xfs_rmap.h"
#include "xfs_rtbitmap.h"
#include "xfs_rtrmap_btree.h"
#include "xfs_zone_alloc.h"
#include "xfs_zone_priv.h"
#include "xfs_zones.h"
#include "xfs_trace.h"

/*
 * Implement Garbage Collection (GC) of partially used zones.
 *
 * To support the purely sequential writes in each zone, zoned XFS needs to be
 * able to move data remaining in a zone out of it to reset the zone to prepare
 * for writing to it again.
 *
 * This is done by the GC thread implemented in this file. To support that a
 * number of zones (XFS_GC_ZONES) is reserved from the user visible capacity to
 * write the garbage collected data into.
 *
 * Whenever the available space is below the chosen threshold, the GC thread
 * looks for potential non-empty but not fully used zones that are worth
 * reclaiming. Once found the rmap for the victim zone is queried, and after
 * a bit of sorting to reduce fragmentation, the still live extents are read
 * into memory and written to the GC target zone, and the bmap btree of the
 * files is updated to point to the new location. To avoid taking the IOLOCK
 * and MMAPLOCK for the entire GC process and thus affecting the latency of
 * user reads and writes to the files, the GC writes are speculative and the
 * I/O completion checks that no other writes happened for the affected regions
 * before remapping.
 *
 * Once a zone does not contain any valid data, be that through GC or user
 * block removal, it is queued for a zone reset. The reset operation
 * carefully ensures that the RT device cache is flushed and all transactions
 * referencing the rmap have been committed to disk.
 */

/*
 * Size of each GC scratch pad. This is also the upper bound for each
 * GC I/O, which helps to keep latency down.
 */
#define XFS_GC_CHUNK_SIZE	SZ_1M

/*
 * Scratchpad data to read GCed data into.
 *
 * The offset member tracks where the next allocation starts, and freed tracks
 * the amount of space that is not used anymore.
 */
#define XFS_ZONE_GC_NR_SCRATCH	2
struct xfs_zone_scratch {
	struct folio		*folio;
	unsigned int		offset;
	unsigned int		freed;
};

/*
 * Chunk that is read and written for each GC operation.
 *
 * Note that for writes to actual zoned devices, the chunk can be split when
 * reaching the hardware limit.
 */
struct xfs_gc_bio {
	struct xfs_zone_gc_data	*data;

	/*
	 * Entry into the reading/writing/resetting list. Only accessed from
	 * the GC thread, so no locking needed.
	 */
	struct list_head	entry;

	/*
	 * State of this gc_bio. Done means the current I/O completed.
	 * Set from the bio end I/O handler, read from the GC thread.
	 */
	enum {
		XFS_GC_BIO_NEW,
		XFS_GC_BIO_DONE,
	} state;

	/*
	 * Pointer to the inode and byte range in the inode that this
	 * GC chunk is operating on.
	 */
	struct xfs_inode	*ip;
	loff_t			offset;
	unsigned int		len;

	/*
	 * Existing startblock (in the zone to be freed) and newly assigned
	 * daddr in the zone GCed into.
	 */
	xfs_fsblock_t		old_startblock;
	xfs_daddr_t		new_daddr;
	struct xfs_zone_scratch	*scratch;

	/* Are we writing to a sequential write required zone? */
	bool			is_seq;

	/* Open Zone being written to */
	struct xfs_open_zone	*oz;

	/* Bio used for reads and writes, including the bvec used by it */
	struct bio_vec		bv;
	struct bio		bio;	/* must be last */
};

#define XFS_ZONE_GC_RECS	1024

/* iterator, needs to be reinitialized for each victim zone */
struct xfs_zone_gc_iter {
	struct xfs_rtgroup	*victim_rtg;
	unsigned int		rec_count;
	unsigned int		rec_idx;
	xfs_agblock_t		next_startblock;
	struct xfs_rmap_irec	*recs;
};

/*
 * Per-mount GC state.
 */
struct xfs_zone_gc_data {
	struct xfs_mount	*mp;

	/* bioset used to allocate the gc_bios */
	struct bio_set		bio_set;

	/*
	 * Scratchpads used, and index to indicate which one is in use.
	 */
	struct xfs_zone_scratch	scratch[XFS_ZONE_GC_NR_SCRATCH];
	unsigned int		scratch_idx;

	/*
	 * List of bios currently being read, written and reset.
	 * These lists are only accessed by the GC thread itself, and must only
	 * be processed in order.
	 */
	struct list_head	reading;
	struct list_head	writing;
	struct list_head	resetting;

	/*
	 * Iterator for the victim zone.
	 */
	struct xfs_zone_gc_iter	iter;
};

/*
 * We aim to keep enough zones free in stock to fully use the open zone limit
 * for data placement purposes. Additionally, the m_zonegc_low_space tunable
 * can be set to make sure a fraction of the unused blocks are available for
 * writing.
 */
bool
xfs_zoned_need_gc(
	struct xfs_mount	*mp)
{
	s64			available, free, threshold;
	s32			remainder;

	if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE))
		return false;

	available = xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE);

	if (available <
	    mp->m_groups[XG_TYPE_RTG].blocks *
	    (mp->m_max_open_zones - XFS_OPEN_GC_ZONES))
		return true;

	free = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS);

	threshold = div_s64_rem(free, 100, &remainder);
	threshold = threshold * mp->m_zonegc_low_space +
		    remainder * div_s64(mp->m_zonegc_low_space, 100);

	if (available < threshold)
		return true;

	return false;
}

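/*
 * Allocate and initialize the per-mount GC state: the rmap record buffer,
 * the bioset used for the gc_bios, the scratch folios and the chunk lists.
 */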
static struct xfs_zone_gc_data *
xfs_zone_gc_data_alloc(
	struct xfs_mount	*mp)
{
	struct xfs_zone_gc_data	*data;
	int			i;

	data = kzalloc(sizeof(*data), GFP_KERNEL);
	if (!data)
		return NULL;
	data->iter.recs = kcalloc(XFS_ZONE_GC_RECS, sizeof(*data->iter.recs),
			GFP_KERNEL);
	if (!data->iter.recs)
		goto out_free_data;

	/*
	 * We actually only need a single bio_vec. It would be nice to have
	 * a flag that only allocates the inline bvecs and not the separate
	 * bvec pool.
	 */
	if (bioset_init(&data->bio_set, 16, offsetof(struct xfs_gc_bio, bio),
			BIOSET_NEED_BVECS))
		goto out_free_recs;
	for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++) {
		data->scratch[i].folio =
			folio_alloc(GFP_KERNEL, get_order(XFS_GC_CHUNK_SIZE));
		if (!data->scratch[i].folio)
			goto out_free_scratch;
	}
	INIT_LIST_HEAD(&data->reading);
	INIT_LIST_HEAD(&data->writing);
	INIT_LIST_HEAD(&data->resetting);
	data->mp = mp;
	return data;

out_free_scratch:
	while (--i >= 0)
		folio_put(data->scratch[i].folio);
	bioset_exit(&data->bio_set);
out_free_recs:
	kfree(data->iter.recs);
out_free_data:
	kfree(data);
	return NULL;
}

static void
xfs_zone_gc_data_free(
	struct xfs_zone_gc_data	*data)
{
	int			i;

	for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++)
		folio_put(data->scratch[i].folio);
	bioset_exit(&data->bio_set);
	kfree(data->iter.recs);
	kfree(data);
}

static void
xfs_zone_gc_iter_init(
	struct xfs_zone_gc_iter	*iter,
	struct xfs_rtgroup	*victim_rtg)

{
	iter->next_startblock = 0;
	iter->rec_count = 0;
	iter->rec_idx = 0;
	iter->victim_rtg = victim_rtg;
}

/*
 * Query the rmap of the victim zone to gather the records to evacuate.
 */
static int
xfs_zone_gc_query_cb(
	struct xfs_btree_cur	*cur,
	const struct xfs_rmap_irec *irec,
	void			*private)
{
	struct xfs_zone_gc_iter	*iter = private;

	ASSERT(!XFS_RMAP_NON_INODE_OWNER(irec->rm_owner));
	ASSERT(!xfs_is_sb_inum(cur->bc_mp, irec->rm_owner));
	ASSERT(!(irec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)));

	iter->recs[iter->rec_count] = *irec;
	if (++iter->rec_count == XFS_ZONE_GC_RECS) {
		iter->next_startblock =
			irec->rm_startblock + irec->rm_blockcount;
		return 1;
	}
	return 0;
}

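/*
 * Order rmap records by owning inode first and file offset second, so that
 * contiguous file data ends up adjacent in the GC target zone.
 */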
static int
xfs_zone_gc_rmap_rec_cmp(
	const void		*a,
	const void		*b)
{
	const struct xfs_rmap_irec *reca = a;
	const struct xfs_rmap_irec *recb = b;
	int			diff;

	diff = cmp_int(reca->rm_owner, recb->rm_owner);
	if (diff)
		return diff;
	return cmp_int(reca->rm_offset, recb->rm_offset);
}

static int
xfs_zone_gc_query(
	struct xfs_mount	*mp,
	struct xfs_zone_gc_iter	*iter)
{
	struct xfs_rtgroup	*rtg = iter->victim_rtg;
	struct xfs_rmap_irec	ri_low = { };
	struct xfs_rmap_irec	ri_high;
	struct xfs_btree_cur	*cur;
	struct xfs_trans	*tp;
	int			error;

	ASSERT(iter->next_startblock <= rtg_blocks(rtg));
	if (iter->next_startblock == rtg_blocks(rtg))
		goto done;

	ASSERT(iter->next_startblock < rtg_blocks(rtg));
	ri_low.rm_startblock = iter->next_startblock;
	memset(&ri_high, 0xFF, sizeof(ri_high));

	iter->rec_idx = 0;
	iter->rec_count = 0;

	tp = xfs_trans_alloc_empty(mp);
	xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
	cur = xfs_rtrmapbt_init_cursor(tp, rtg);
	error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
			xfs_zone_gc_query_cb, iter);
	xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
	xfs_btree_del_cursor(cur, error < 0 ? error : 0);
	xfs_trans_cancel(tp);

	if (error < 0)
		return error;

	/*
	 * Sort the rmap records by inode number and increasing offset to
	 * defragment the mappings.
	 *
	 * This could be further enhanced by an even bigger look ahead window,
	 * but that's better left until we have better detection of changes to
	 * inode mapping to avoid the potential of GCing already dead data.
	 */
	sort(iter->recs, iter->rec_count, sizeof(iter->recs[0]),
			xfs_zone_gc_rmap_rec_cmp, NULL);

	if (error == 0) {
		/*
		 * We finished iterating through the zone.
		 */
		iter->next_startblock = rtg_blocks(rtg);
		if (iter->rec_count == 0)
			goto done;
	}

	return 0;
done:
	xfs_rtgroup_rele(iter->victim_rtg);
	iter->victim_rtg = NULL;
	return 0;
}

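/*
 * Return the next rmap record to garbage collect and grab a reference to the
 * inode owning it. Records for inodes that have already been deleted, or that
 * are not regular realtime files, are skipped.
 */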
static bool
xfs_zone_gc_iter_next(
	struct xfs_mount	*mp,
	struct xfs_zone_gc_iter	*iter,
	struct xfs_rmap_irec	*chunk_rec,
	struct xfs_inode	**ipp)
{
	struct xfs_rmap_irec	*irec;
	int			error;

	if (!iter->victim_rtg)
		return false;

retry:
	if (iter->rec_idx == iter->rec_count) {
		error = xfs_zone_gc_query(mp, iter);
		if (error)
			goto fail;
		if (!iter->victim_rtg)
			return false;
	}

	irec = &iter->recs[iter->rec_idx];
	error = xfs_iget(mp, NULL, irec->rm_owner,
			XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE, 0, ipp);
	if (error) {
		/*
		 * If the inode was already deleted, skip over it.
		 */
		if (error == -ENOENT) {
			iter->rec_idx++;
			goto retry;
		}
		goto fail;
	}

	if (!S_ISREG(VFS_I(*ipp)->i_mode) || !XFS_IS_REALTIME_INODE(*ipp)) {
		iter->rec_idx++;
		xfs_irele(*ipp);
		goto retry;
	}

	*chunk_rec = *irec;
	return true;

fail:
	xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
	return false;
}

static void
xfs_zone_gc_iter_advance(
	struct xfs_zone_gc_iter	*iter,
	xfs_extlen_t		count_fsb)
{
	struct xfs_rmap_irec	*irec = &iter->recs[iter->rec_idx];

	irec->rm_offset += count_fsb;
	irec->rm_startblock += count_fsb;
	irec->rm_blockcount -= count_fsb;
	if (!irec->rm_blockcount)
		iter->rec_idx++;
}

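/*
 * Scan one used-space bucket and return the zone in it with the fewest used
 * blocks, which makes it the cheapest one to evacuate.
 */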
static struct xfs_rtgroup *
xfs_zone_gc_pick_victim_from(
	struct xfs_mount	*mp,
	uint32_t		bucket)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;
	uint32_t		victim_used = U32_MAX;
	struct xfs_rtgroup	*victim_rtg = NULL;
	uint32_t		bit;

	if (!zi->zi_used_bucket_entries[bucket])
		return NULL;

	for_each_set_bit(bit, zi->zi_used_bucket_bitmap[bucket],
			mp->m_sb.sb_rgcount) {
		struct xfs_rtgroup *rtg = xfs_rtgroup_grab(mp, bit);

		if (!rtg)
			continue;

		/* skip zones that are just waiting for a reset */
		if (rtg_rmap(rtg)->i_used_blocks == 0 ||
		    rtg_rmap(rtg)->i_used_blocks >= victim_used) {
			xfs_rtgroup_rele(rtg);
			continue;
		}

		if (victim_rtg)
			xfs_rtgroup_rele(victim_rtg);
		victim_rtg = rtg;
		victim_used = rtg_rmap(rtg)->i_used_blocks;

		/*
		 * Any zone that is less than 1 percent used is fair game for
		 * instant reclaim. All of these zones are in the last
		 * bucket, so avoid the expensive division for the zones
		 * in the other buckets.
		 */
		if (bucket == 0 &&
		    rtg_rmap(rtg)->i_used_blocks < rtg_blocks(rtg) / 100)
			break;
	}

	return victim_rtg;
}

/*
 * Iterate through all zones marked as reclaimable and find a candidate to
 * reclaim.
 */
static bool
xfs_zone_gc_select_victim(
	struct xfs_zone_gc_data	*data)
{
	struct xfs_zone_gc_iter	*iter = &data->iter;
	struct xfs_mount	*mp = data->mp;
	struct xfs_zone_info	*zi = mp->m_zone_info;
	struct xfs_rtgroup	*victim_rtg = NULL;
	unsigned int		bucket;

	if (xfs_is_shutdown(mp))
		return false;

	if (iter->victim_rtg)
		return true;

	/*
	 * Don't start new work if we are asked to stop or park.
	 */
	if (kthread_should_stop() || kthread_should_park())
		return false;

	if (!xfs_zoned_need_gc(mp))
		return false;

	spin_lock(&zi->zi_used_buckets_lock);
	for (bucket = 0; bucket < XFS_ZONE_USED_BUCKETS; bucket++) {
		victim_rtg = xfs_zone_gc_pick_victim_from(mp, bucket);
		if (victim_rtg)
			break;
	}
	spin_unlock(&zi->zi_used_buckets_lock);

	if (!victim_rtg)
		return false;

	trace_xfs_zone_gc_select_victim(victim_rtg, bucket);
	xfs_zone_gc_iter_init(iter, victim_rtg);
	return true;
}

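/*
 * Take the open zone with the least allocated space away from the regular
 * writers and dedicate it to GC.
 */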
static struct xfs_open_zone *
xfs_zone_gc_steal_open(
	struct xfs_zone_info	*zi)
{
	struct xfs_open_zone	*oz, *found = NULL;

	spin_lock(&zi->zi_open_zones_lock);
	list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) {
		if (!found || oz->oz_allocated < found->oz_allocated)
			found = oz;
	}

	if (found) {
		found->oz_is_gc = true;
		list_del_init(&found->oz_entry);
		zi->zi_nr_open_zones--;
	}

	spin_unlock(&zi->zi_open_zones_lock);
	return found;
}

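/*
 * Open a new GC target zone once all writes to the current one have completed
 * and install it as zi_open_gc_zone.
 */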
static struct xfs_open_zone *
xfs_zone_gc_select_target(
	struct xfs_mount	*mp)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;
	struct xfs_open_zone	*oz = zi->zi_open_gc_zone;

	/*
	 * We need to wait for pending writes to finish.
	 */
	if (oz && oz->oz_written < rtg_blocks(oz->oz_rtg))
		return NULL;

	ASSERT(zi->zi_nr_open_zones <=
		mp->m_max_open_zones - XFS_OPEN_GC_ZONES);
	oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true);
	if (oz)
		trace_xfs_zone_gc_target_opened(oz->oz_rtg);
	spin_lock(&zi->zi_open_zones_lock);
	zi->zi_open_gc_zone = oz;
	spin_unlock(&zi->zi_open_zones_lock);
	return oz;
}

/*
 * Ensure we have a valid open zone to write the GC data to.
 *
 * If the current target zone has space keep writing to it, else first wait for
 * all pending writes and then pick a new one.
 */
static struct xfs_open_zone *
xfs_zone_gc_ensure_target(
	struct xfs_mount	*mp)
{
	struct xfs_open_zone	*oz = mp->m_zone_info->zi_open_gc_zone;

	if (!oz || oz->oz_allocated == rtg_blocks(oz->oz_rtg))
		return xfs_zone_gc_select_target(mp);
	return oz;
}

static unsigned int
xfs_zone_gc_scratch_available(
	struct xfs_zone_gc_data	*data)
{
	return XFS_GC_CHUNK_SIZE - data->scratch[data->scratch_idx].offset;
}

static bool
xfs_zone_gc_space_available(
	struct xfs_zone_gc_data	*data)
{
	struct xfs_open_zone	*oz;

	oz = xfs_zone_gc_ensure_target(data->mp);
	if (!oz)
		return false;
	return oz->oz_allocated < rtg_blocks(oz->oz_rtg) &&
		xfs_zone_gc_scratch_available(data);
}

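/*
 * I/O completion handler shared by all GC bios: mark the chunk as done and
 * wake the GC thread to process the completion.
 */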
static void
xfs_zone_gc_end_io(
	struct bio		*bio)
{
	struct xfs_gc_bio	*chunk =
		container_of(bio, struct xfs_gc_bio, bio);
	struct xfs_zone_gc_data	*data = chunk->data;

	WRITE_ONCE(chunk->state, XFS_GC_BIO_DONE);
	wake_up_process(data->mp->m_zone_info->zi_gc_thread);
}

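/*
 * Allocate blocks in the GC target zone for the next chunk, trimming the
 * request to the available scratch space, the space left in the zone and the
 * reserved free space. Returns a referenced open zone, or NULL if nothing can
 * be allocated right now.
 */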
static struct xfs_open_zone *
xfs_zone_gc_alloc_blocks(
	struct xfs_zone_gc_data	*data,
	xfs_extlen_t		*count_fsb,
	xfs_daddr_t		*daddr,
	bool			*is_seq)
{
	struct xfs_mount	*mp = data->mp;
	struct xfs_open_zone	*oz;

	oz = xfs_zone_gc_ensure_target(mp);
	if (!oz)
		return NULL;

	*count_fsb = min(*count_fsb,
		XFS_B_TO_FSB(mp, xfs_zone_gc_scratch_available(data)));

	/*
	 * Directly allocate GC blocks from the reserved pool.
	 *
	 * If we'd take them from the normal pool we could be stealing blocks
	 * from a regular writer, which would then have to wait for GC and
	 * deadlock.
	 */
	spin_lock(&mp->m_sb_lock);
	*count_fsb = min(*count_fsb,
			rtg_blocks(oz->oz_rtg) - oz->oz_allocated);
	*count_fsb = min3(*count_fsb,
		mp->m_free[XC_FREE_RTEXTENTS].res_avail,
		mp->m_free[XC_FREE_RTAVAILABLE].res_avail);
	mp->m_free[XC_FREE_RTEXTENTS].res_avail -= *count_fsb;
	mp->m_free[XC_FREE_RTAVAILABLE].res_avail -= *count_fsb;
	spin_unlock(&mp->m_sb_lock);

	if (!*count_fsb)
		return NULL;

	*daddr = xfs_gbno_to_daddr(&oz->oz_rtg->rtg_group, 0);
	*is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *daddr);
	if (!*is_seq)
		*daddr += XFS_FSB_TO_BB(mp, oz->oz_allocated);
	oz->oz_allocated += *count_fsb;
	atomic_inc(&oz->oz_ref);
	return oz;
}

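/*
 * Start garbage collecting the next chunk: allocate space in the target zone,
 * set up the gc_bio tracking structure and submit the read of the still live
 * data from the victim zone into the scratch folio.
 */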
static bool
xfs_zone_gc_start_chunk(
	struct xfs_zone_gc_data	*data)
{
	struct xfs_zone_gc_iter	*iter = &data->iter;
	struct xfs_mount	*mp = data->mp;
	struct block_device	*bdev = mp->m_rtdev_targp->bt_bdev;
	struct xfs_open_zone	*oz;
	struct xfs_rmap_irec	irec;
	struct xfs_gc_bio	*chunk;
	struct xfs_inode	*ip;
	struct bio		*bio;
	xfs_daddr_t		daddr;
	bool			is_seq;

	if (xfs_is_shutdown(mp))
		return false;

	if (!xfs_zone_gc_iter_next(mp, iter, &irec, &ip))
		return false;
	oz = xfs_zone_gc_alloc_blocks(data, &irec.rm_blockcount, &daddr,
			&is_seq);
	if (!oz) {
		xfs_irele(ip);
		return false;
	}

	bio = bio_alloc_bioset(bdev, 1, REQ_OP_READ, GFP_NOFS, &data->bio_set);

	chunk = container_of(bio, struct xfs_gc_bio, bio);
	chunk->ip = ip;
	chunk->offset = XFS_FSB_TO_B(mp, irec.rm_offset);
	chunk->len = XFS_FSB_TO_B(mp, irec.rm_blockcount);
	chunk->old_startblock =
		xfs_rgbno_to_rtb(iter->victim_rtg, irec.rm_startblock);
	chunk->new_daddr = daddr;
	chunk->is_seq = is_seq;
	chunk->scratch = &data->scratch[data->scratch_idx];
	chunk->data = data;
	chunk->oz = oz;

	bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock);
	bio->bi_end_io = xfs_zone_gc_end_io;
	bio_add_folio_nofail(bio, chunk->scratch->folio, chunk->len,
			chunk->scratch->offset);
	chunk->scratch->offset += chunk->len;
	if (chunk->scratch->offset == XFS_GC_CHUNK_SIZE) {
		data->scratch_idx =
			(data->scratch_idx + 1) % XFS_ZONE_GC_NR_SCRATCH;
	}
	WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
	list_add_tail(&chunk->entry, &data->reading);
	xfs_zone_gc_iter_advance(iter, irec.rm_blockcount);

	submit_bio(bio);
	return true;
}

static void
xfs_zone_gc_free_chunk(
	struct xfs_gc_bio	*chunk)
{
	list_del(&chunk->entry);
	xfs_open_zone_put(chunk->oz);
	xfs_irele(chunk->ip);
	bio_put(&chunk->bio);
}

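/*
 * Submit the write for a GC chunk, switching to a zone append operation when
 * the target is a sequential write required zone.
 */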
static void
xfs_zone_gc_submit_write(
	struct xfs_zone_gc_data	*data,
	struct xfs_gc_bio	*chunk)
{
	if (chunk->is_seq) {
		chunk->bio.bi_opf &= ~REQ_OP_WRITE;
		chunk->bio.bi_opf |= REQ_OP_ZONE_APPEND;
	}
	chunk->bio.bi_iter.bi_sector = chunk->new_daddr;
	chunk->bio.bi_end_io = xfs_zone_gc_end_io;
	submit_bio(&chunk->bio);
}

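/*
 * Split a GC write that is larger than the zone append limit of the target
 * device into a separate chunk that tracks its own portion of the inode's
 * byte range. Returns the split-off chunk, or NULL if no split is needed.
 */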
static struct xfs_gc_bio *
xfs_zone_gc_split_write(
	struct xfs_zone_gc_data	*data,
	struct xfs_gc_bio	*chunk)
{
	struct queue_limits	*lim =
		&bdev_get_queue(chunk->bio.bi_bdev)->limits;
	struct xfs_gc_bio	*split_chunk;
	int			split_sectors;
	unsigned int		split_len;
	struct bio		*split;
	unsigned int		nsegs;

	if (!chunk->is_seq)
		return NULL;

	split_sectors = bio_split_rw_at(&chunk->bio, lim, &nsegs,
			lim->max_zone_append_sectors << SECTOR_SHIFT);
	if (!split_sectors)
		return NULL;

	/* ensure the split chunk is still block size aligned */
	split_sectors = ALIGN_DOWN(split_sectors << SECTOR_SHIFT,
			data->mp->m_sb.sb_blocksize) >> SECTOR_SHIFT;
	split_len = split_sectors << SECTOR_SHIFT;

	split = bio_split(&chunk->bio, split_sectors, GFP_NOFS, &data->bio_set);
	split_chunk = container_of(split, struct xfs_gc_bio, bio);
	split_chunk->data = data;
	ihold(VFS_I(chunk->ip));
	split_chunk->ip = chunk->ip;
	split_chunk->is_seq = chunk->is_seq;
	split_chunk->scratch = chunk->scratch;
	split_chunk->offset = chunk->offset;
	split_chunk->len = split_len;
	split_chunk->old_startblock = chunk->old_startblock;
	split_chunk->new_daddr = chunk->new_daddr;
	split_chunk->oz = chunk->oz;
	atomic_inc(&chunk->oz->oz_ref);

	chunk->offset += split_len;
	chunk->len -= split_len;
	chunk->old_startblock += XFS_B_TO_FSB(data->mp, split_len);

	/* add right before the original chunk */
	WRITE_ONCE(split_chunk->state, XFS_GC_BIO_NEW);
	list_add_tail(&split_chunk->entry, &chunk->entry);
	return split_chunk;
}

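/*
 * Turn a chunk whose read has completed into a write to the GC target zone,
 * reusing the bio and the data sitting in the scratch folio.
 */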
static void
xfs_zone_gc_write_chunk(
	struct xfs_gc_bio	*chunk)
{
	struct xfs_zone_gc_data	*data = chunk->data;
	struct xfs_mount	*mp = chunk->ip->i_mount;
	phys_addr_t		bvec_paddr =
		bvec_phys(bio_first_bvec_all(&chunk->bio));
	struct xfs_gc_bio	*split_chunk;

	if (chunk->bio.bi_status)
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
	if (xfs_is_shutdown(mp)) {
		xfs_zone_gc_free_chunk(chunk);
		return;
	}

	WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
	list_move_tail(&chunk->entry, &data->writing);

	bio_reset(&chunk->bio, mp->m_rtdev_targp->bt_bdev, REQ_OP_WRITE);
	bio_add_folio_nofail(&chunk->bio, chunk->scratch->folio, chunk->len,
			offset_in_folio(chunk->scratch->folio, bvec_paddr));

	while ((split_chunk = xfs_zone_gc_split_write(data, chunk)))
		xfs_zone_gc_submit_write(data, split_chunk);
	xfs_zone_gc_submit_write(data, chunk);
}

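/*
 * Complete a GC write: return the scratch space, wait out racing direct I/O
 * and layouts, and remap the file range to its new location unless another
 * write changed the mapping in the meantime.
 */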
static void
xfs_zone_gc_finish_chunk(
	struct xfs_gc_bio	*chunk)
{
	uint			iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
	struct xfs_inode	*ip = chunk->ip;
	struct xfs_mount	*mp = ip->i_mount;
	int			error;

	if (chunk->bio.bi_status)
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
	if (xfs_is_shutdown(mp)) {
		xfs_zone_gc_free_chunk(chunk);
		return;
	}

	chunk->scratch->freed += chunk->len;
	if (chunk->scratch->freed == chunk->scratch->offset) {
		chunk->scratch->offset = 0;
		chunk->scratch->freed = 0;
	}

	/*
	 * Cycle through the iolock and wait for direct I/O and layouts to
	 * ensure no one is reading from the old mapping before it goes away.
	 *
	 * Note that xfs_zoned_end_io() below checks that no other writer raced
	 * with us to update the mapping by checking that the old startblock
	 * didn't change.
	 */
	xfs_ilock(ip, iolock);
	error = xfs_break_layouts(VFS_I(ip), &iolock, BREAK_UNMAP);
	if (!error)
		inode_dio_wait(VFS_I(ip));
	xfs_iunlock(ip, iolock);
	if (error)
		goto free;

	if (chunk->is_seq)
		chunk->new_daddr = chunk->bio.bi_iter.bi_sector;
	error = xfs_zoned_end_io(ip, chunk->offset, chunk->len,
			chunk->new_daddr, chunk->oz, chunk->old_startblock);
free:
	if (error)
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
	xfs_zone_gc_free_chunk(chunk);
}

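/*
 * Handle completion of a zone reset: mark the zone free again and wake up
 * anyone waiting for free zones.
 */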
static void
xfs_zone_gc_finish_reset(
	struct xfs_gc_bio	*chunk)
{
	struct xfs_rtgroup	*rtg = chunk->bio.bi_private;
	struct xfs_mount	*mp = rtg_mount(rtg);
	struct xfs_zone_info	*zi = mp->m_zone_info;

	if (chunk->bio.bi_status) {
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
		goto out;
	}

	xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_FREE);
	atomic_inc(&zi->zi_nr_free_zones);

	xfs_zoned_add_available(mp, rtg_blocks(rtg));

	wake_up_all(&zi->zi_zone_wait);
out:
	list_del(&chunk->entry);
	bio_put(&chunk->bio);
}

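/*
 * Prepare a bio to reset an empty zone. Conventional zones are discarded
 * instead of reset; if the device does not support discard either, return
 * false so that the caller can complete the bio without submitting it.
 */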
static bool
xfs_zone_gc_prepare_reset(
	struct bio		*bio,
	struct xfs_rtgroup	*rtg)
{
	trace_xfs_zone_reset(rtg);

	ASSERT(rtg_rmap(rtg)->i_used_blocks == 0);
	bio->bi_iter.bi_sector = xfs_gbno_to_daddr(&rtg->rtg_group, 0);
	if (!bdev_zone_is_seq(bio->bi_bdev, bio->bi_iter.bi_sector)) {
		if (!bdev_max_discard_sectors(bio->bi_bdev))
			return false;
		bio->bi_opf = REQ_OP_DISCARD | REQ_SYNC;
		bio->bi_iter.bi_size =
			XFS_FSB_TO_B(rtg_mount(rtg), rtg_blocks(rtg));
	}

	return true;
}

int
xfs_zone_gc_reset_sync(
	struct xfs_rtgroup	*rtg)
{
	int			error = 0;
	struct bio		bio;

	bio_init(&bio, rtg_mount(rtg)->m_rtdev_targp->bt_bdev, NULL, 0,
			REQ_OP_ZONE_RESET);
	if (xfs_zone_gc_prepare_reset(&bio, rtg))
		error = submit_bio_wait(&bio);
	bio_uninit(&bio);

	return error;
}

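/*
 * Reset all zones on the passed-in list. The rt device cache is flushed and
 * the rmap inode of each zone is forced to the log first so that no
 * transaction still references the blocks that are about to be reset.
 */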
static void
xfs_zone_gc_reset_zones(
	struct xfs_zone_gc_data	*data,
	struct xfs_group	*reset_list)
{
	struct xfs_group	*next = reset_list;

	if (blkdev_issue_flush(data->mp->m_rtdev_targp->bt_bdev) < 0) {
		xfs_force_shutdown(data->mp, SHUTDOWN_META_IO_ERROR);
		return;
	}

	do {
		struct xfs_rtgroup	*rtg = to_rtg(next);
		struct xfs_gc_bio	*chunk;
		struct bio		*bio;

		xfs_log_force_inode(rtg_rmap(rtg));

		next = rtg_group(rtg)->xg_next_reset;
		rtg_group(rtg)->xg_next_reset = NULL;

		bio = bio_alloc_bioset(rtg_mount(rtg)->m_rtdev_targp->bt_bdev,
				0, REQ_OP_ZONE_RESET, GFP_NOFS, &data->bio_set);
		bio->bi_private = rtg;
		bio->bi_end_io = xfs_zone_gc_end_io;

		chunk = container_of(bio, struct xfs_gc_bio, bio);
		chunk->data = data;
		WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
		list_add_tail(&chunk->entry, &data->resetting);

		/*
		 * Also use the bio to drive the state machine when neither
		 * zone reset nor discard is supported to keep things simple.
		 */
		if (xfs_zone_gc_prepare_reset(bio, rtg))
			submit_bio(bio);
		else
			bio_endio(bio);
	} while (next);
}

/*
 * Handle the work to read and write data for GC and to reset the zones,
 * including handling all completions.
 *
 * Note that the order of the chunks is preserved so that we don't undo the
 * optimal order established by xfs_zone_gc_query().
 */
static bool
xfs_zone_gc_handle_work(
	struct xfs_zone_gc_data	*data)
{
	struct xfs_zone_info	*zi = data->mp->m_zone_info;
	struct xfs_gc_bio	*chunk, *next;
	struct xfs_group	*reset_list;
	struct blk_plug		plug;

	spin_lock(&zi->zi_reset_list_lock);
	reset_list = zi->zi_reset_list;
	zi->zi_reset_list = NULL;
	spin_unlock(&zi->zi_reset_list_lock);

	if (!xfs_zone_gc_select_victim(data) ||
	    !xfs_zone_gc_space_available(data)) {
		if (list_empty(&data->reading) &&
		    list_empty(&data->writing) &&
		    list_empty(&data->resetting) &&
		    !reset_list)
			return false;
	}

	__set_current_state(TASK_RUNNING);
	try_to_freeze();

	if (reset_list)
		xfs_zone_gc_reset_zones(data, reset_list);

	list_for_each_entry_safe(chunk, next, &data->resetting, entry) {
		if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
			break;
		xfs_zone_gc_finish_reset(chunk);
	}

	list_for_each_entry_safe(chunk, next, &data->writing, entry) {
		if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
			break;
		xfs_zone_gc_finish_chunk(chunk);
	}

	blk_start_plug(&plug);
	list_for_each_entry_safe(chunk, next, &data->reading, entry) {
		if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
			break;
		xfs_zone_gc_write_chunk(chunk);
	}
	blk_finish_plug(&plug);

	blk_start_plug(&plug);
	while (xfs_zone_gc_start_chunk(data))
		;
	blk_finish_plug(&plug);
	return true;
}

/*
 * Note that the current GC algorithm would break reflinks and thus duplicate
 * data that was shared by multiple owners before. Because of that reflinks
 * are currently not supported on zoned file systems and can't be created or
 * mounted.
 */
static int
xfs_zoned_gcd(
	void			*private)
{
	struct xfs_zone_gc_data	*data = private;
	struct xfs_mount	*mp = data->mp;
	struct xfs_zone_info	*zi = mp->m_zone_info;
	unsigned int		nofs_flag;

	nofs_flag = memalloc_nofs_save();
	set_freezable();

	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);
		xfs_set_zonegc_running(mp);
		if (xfs_zone_gc_handle_work(data))
			continue;

		if (list_empty(&data->reading) &&
		    list_empty(&data->writing) &&
		    list_empty(&data->resetting) &&
		    !zi->zi_reset_list) {
			xfs_clear_zonegc_running(mp);
			xfs_zoned_resv_wake_all(mp);

			if (kthread_should_stop()) {
				__set_current_state(TASK_RUNNING);
				break;
			}

			if (kthread_should_park()) {
				__set_current_state(TASK_RUNNING);
				kthread_parkme();
				continue;
			}
		}

		schedule();
	}
	xfs_clear_zonegc_running(mp);

	if (data->iter.victim_rtg)
		xfs_rtgroup_rele(data->iter.victim_rtg);

	memalloc_nofs_restore(nofs_flag);
	xfs_zone_gc_data_free(data);
	return 0;
}

void
xfs_zone_gc_start(
	struct xfs_mount	*mp)
{
	if (xfs_has_zoned(mp))
		kthread_unpark(mp->m_zone_info->zi_gc_thread);
}

void
xfs_zone_gc_stop(
	struct xfs_mount	*mp)
{
	if (xfs_has_zoned(mp))
		kthread_park(mp->m_zone_info->zi_gc_thread);
}

int
xfs_zone_gc_mount(
	struct xfs_mount	*mp)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;
	struct xfs_zone_gc_data	*data;
	struct xfs_open_zone	*oz;
	int			error;

	/*
	 * If there are no free zones available for GC, pick the open zone with
	 * the least used space to GC into. This should only happen after an
	 * unclean shutdown near ENOSPC while GC was ongoing.
	 *
	 * We also need to do this for the first gc zone allocation if we
	 * unmounted while at the open limit.
	 */
	if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_FREE) ||
	    zi->zi_nr_open_zones == mp->m_max_open_zones)
		oz = xfs_zone_gc_steal_open(zi);
	else
		oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true);
	if (!oz) {
		xfs_warn(mp, "unable to allocate a zone for gc");
		error = -EIO;
		goto out;
	}

	trace_xfs_zone_gc_target_opened(oz->oz_rtg);
	zi->zi_open_gc_zone = oz;

	data = xfs_zone_gc_data_alloc(mp);
	if (!data) {
		error = -ENOMEM;
		goto out_put_gc_zone;
	}

	mp->m_zone_info->zi_gc_thread = kthread_create(xfs_zoned_gcd, data,
			"xfs-zone-gc/%s", mp->m_super->s_id);
	if (IS_ERR(mp->m_zone_info->zi_gc_thread)) {
		xfs_warn(mp, "unable to create zone gc thread");
		error = PTR_ERR(mp->m_zone_info->zi_gc_thread);
		goto out_free_gc_data;
	}

	/* xfs_zone_gc_start will unpark for rw mounts */
	kthread_park(mp->m_zone_info->zi_gc_thread);
	return 0;

out_free_gc_data:
	kfree(data);
out_put_gc_zone:
	xfs_open_zone_put(zi->zi_open_gc_zone);
out:
	return error;
}

void
xfs_zone_gc_unmount(
	struct xfs_mount	*mp)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;

	kthread_stop(zi->zi_gc_thread);
	if (zi->zi_open_gc_zone)
		xfs_open_zone_put(zi->zi_open_gc_zone);
}