
This member just tracks how much space we handed out for sequential write required zones. Only for conventional zones is it actually the pointer where things are written; otherwise zone append manages that.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Carlos Maiolino <cem@kernel.org>
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2023-2025 Christoph Hellwig.
 * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
 */
#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_trans.h"
#include "xfs_icache.h"
#include "xfs_rmap.h"
#include "xfs_rtbitmap.h"
#include "xfs_rtrmap_btree.h"
#include "xfs_zone_alloc.h"
#include "xfs_zone_priv.h"
#include "xfs_zones.h"
#include "xfs_trace.h"

/*
 * Implement Garbage Collection (GC) of partially used zones.
 *
 * To support the purely sequential writes in each zone, zoned XFS needs to be
 * able to move data remaining in a zone out of it to reset the zone to prepare
 * for writing to it again.
 *
 * This is done by the GC thread implemented in this file.  To support that a
 * number of zones (XFS_GC_ZONES) is reserved from the user visible capacity to
 * write the garbage collected data into.
 *
 * Whenever the available space is below the chosen threshold, the GC thread
 * looks for potential non-empty but not fully used zones that are worth
 * reclaiming.  Once found, the rmap for the victim zone is queried, and after
 * a bit of sorting to reduce fragmentation, the still live extents are read
 * into memory and written to the GC target zone, and the bmap btree of the
 * files is updated to point to the new location.  To avoid taking the IOLOCK
 * and MMAPLOCK for the entire GC process and thus affecting the latency of
 * user reads and writes to the files, the GC writes are speculative and the
 * I/O completion checks that no other writes happened for the affected regions
 * before remapping.
 *
 * Once a zone does not contain any valid data, be that through GC or user
 * block removal, it is queued for a zone reset.  The reset operation
 * carefully ensures that the RT device cache is flushed and all transactions
 * referencing the rmap have been committed to disk.
 */

/*
 * Size of each GC scratch pad.  This is also the upper bound for each
 * GC I/O, which helps to keep latency down.
 */
#define XFS_GC_CHUNK_SIZE	SZ_1M

/*
 * Scratchpad data to read GCed data into.
 *
 * The offset member tracks where the next allocation starts, and freed tracks
 * the amount of space that is not used anymore.
 */
#define XFS_ZONE_GC_NR_SCRATCH	2
struct xfs_zone_scratch {
	struct folio		*folio;
	unsigned int		offset;
	unsigned int		freed;
};

/*
 * Chunk that is read and written for each GC operation.
 *
 * Note that for writes to actual zoned devices, the chunk can be split when
 * reaching the hardware limit.
 */
struct xfs_gc_bio {
	struct xfs_zone_gc_data	*data;

	/*
	 * Entry into the reading/writing/resetting list.  Only accessed from
	 * the GC thread, so no locking needed.
	 */
	struct list_head	entry;

	/*
	 * State of this gc_bio.  Done means the current I/O completed.
	 * Set from the bio end I/O handler, read from the GC thread.
	 */
	enum {
		XFS_GC_BIO_NEW,
		XFS_GC_BIO_DONE,
	} state;

	/*
	 * Pointer to the inode and byte range in the inode that this
	 * GC chunk is operating on.
	 */
	struct xfs_inode	*ip;
	loff_t			offset;
	unsigned int		len;

	/*
	 * Existing startblock (in the zone to be freed) and newly assigned
	 * daddr in the zone GCed into.
	 */
	xfs_fsblock_t		old_startblock;
	xfs_daddr_t		new_daddr;
	struct xfs_zone_scratch	*scratch;

	/* Are we writing to a sequential write required zone? */
	bool			is_seq;

	/* Open Zone being written to */
	struct xfs_open_zone	*oz;

	/* Bio used for reads and writes, including the bvec used by it */
	struct bio_vec		bv;
	struct bio		bio;	/* must be last */
};

#define XFS_ZONE_GC_RECS	1024

/* iterator, needs to be reinitialized for each victim zone */
struct xfs_zone_gc_iter {
	struct xfs_rtgroup	*victim_rtg;
	unsigned int		rec_count;
	unsigned int		rec_idx;
	xfs_agblock_t		next_startblock;
	struct xfs_rmap_irec	*recs;
};

/*
 * Per-mount GC state.
 */
struct xfs_zone_gc_data {
	struct xfs_mount	*mp;

	/* bioset used to allocate the gc_bios */
	struct bio_set		bio_set;

	/*
	 * Scratchpads used, and index to indicate which one is used.
	 */
	struct xfs_zone_scratch	scratch[XFS_ZONE_GC_NR_SCRATCH];
	unsigned int		scratch_idx;

	/*
	 * List of bios currently being read, written and reset.
	 * These lists are only accessed by the GC thread itself, and must only
	 * be processed in order.
	 */
	struct list_head	reading;
	struct list_head	writing;
	struct list_head	resetting;

	/*
	 * Iterator for the victim zone.
	 */
	struct xfs_zone_gc_iter	iter;
};

/*
 * We aim to keep enough zones free in stock to fully use the open zone limit
 * for data placement purposes.  Additionally, the m_zonegc_low_space tunable
 * can be set to make sure a fraction of the unused blocks are available for
 * writing.
 */
bool
xfs_zoned_need_gc(
	struct xfs_mount	*mp)
{
	s64			available, free, threshold;
	s32			remainder;

	if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE))
		return false;

	available = xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE);

	if (available <
	    mp->m_groups[XG_TYPE_RTG].blocks *
	    (mp->m_max_open_zones - XFS_OPEN_GC_ZONES))
		return true;

	free = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS);

	threshold = div_s64_rem(free, 100, &remainder);
	threshold = threshold * mp->m_zonegc_low_space +
		    remainder * div_s64(mp->m_zonegc_low_space, 100);

	if (available < threshold)
		return true;

	return false;
}

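/*
 * Allocate the per-mount GC state, including the rmap record buffer, the
 * bioset used for all GC bios and the scratch folios.
 */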
static struct xfs_zone_gc_data *
xfs_zone_gc_data_alloc(
	struct xfs_mount	*mp)
{
	struct xfs_zone_gc_data	*data;
	int			i;

	data = kzalloc(sizeof(*data), GFP_KERNEL);
	if (!data)
		return NULL;
	data->iter.recs = kcalloc(XFS_ZONE_GC_RECS, sizeof(*data->iter.recs),
			GFP_KERNEL);
	if (!data->iter.recs)
		goto out_free_data;

	/*
	 * We actually only need a single bio_vec.  It would be nice to have
	 * a flag that only allocates the inline bvecs and not the separate
	 * bvec pool.
	 */
	if (bioset_init(&data->bio_set, 16, offsetof(struct xfs_gc_bio, bio),
			BIOSET_NEED_BVECS))
		goto out_free_recs;
	for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++) {
		data->scratch[i].folio =
			folio_alloc(GFP_KERNEL, get_order(XFS_GC_CHUNK_SIZE));
		if (!data->scratch[i].folio)
			goto out_free_scratch;
	}
	INIT_LIST_HEAD(&data->reading);
	INIT_LIST_HEAD(&data->writing);
	INIT_LIST_HEAD(&data->resetting);
	data->mp = mp;
	return data;

out_free_scratch:
	while (--i >= 0)
		folio_put(data->scratch[i].folio);
	bioset_exit(&data->bio_set);
out_free_recs:
	kfree(data->iter.recs);
out_free_data:
	kfree(data);
	return NULL;
}

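/* Free everything allocated by xfs_zone_gc_data_alloc(). */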
static void
xfs_zone_gc_data_free(
	struct xfs_zone_gc_data	*data)
{
	int			i;

	for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++)
		folio_put(data->scratch[i].folio);
	bioset_exit(&data->bio_set);
	kfree(data->iter.recs);
	kfree(data);
}

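/* (Re)initialize the iterator for a newly selected victim zone. */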
static void
xfs_zone_gc_iter_init(
	struct xfs_zone_gc_iter	*iter,
	struct xfs_rtgroup	*victim_rtg)
{
	iter->next_startblock = 0;
	iter->rec_count = 0;
	iter->rec_idx = 0;
	iter->victim_rtg = victim_rtg;
}

/*
 * Query the rmap of the victim zone to gather the records to evacuate.
 */
static int
xfs_zone_gc_query_cb(
	struct xfs_btree_cur	*cur,
	const struct xfs_rmap_irec *irec,
	void			*private)
{
	struct xfs_zone_gc_iter	*iter = private;

	ASSERT(!XFS_RMAP_NON_INODE_OWNER(irec->rm_owner));
	ASSERT(!xfs_is_sb_inum(cur->bc_mp, irec->rm_owner));
	ASSERT(!(irec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)));

	iter->recs[iter->rec_count] = *irec;
	if (++iter->rec_count == XFS_ZONE_GC_RECS) {
		iter->next_startblock =
			irec->rm_startblock + irec->rm_blockcount;
		return 1;
	}
	return 0;
}

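/* Sort rmap records by owning inode first and file offset second. */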
static int
xfs_zone_gc_rmap_rec_cmp(
	const void		*a,
	const void		*b)
{
	const struct xfs_rmap_irec *reca = a;
	const struct xfs_rmap_irec *recb = b;
	int			diff;

	diff = cmp_int(reca->rm_owner, recb->rm_owner);
	if (diff)
		return diff;
	return cmp_int(reca->rm_offset, recb->rm_offset);
}

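/*
 * Fill the iterator with the next batch of rmap records from the victim
 * zone, and release the victim once it has been fully iterated.
 */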
static int
xfs_zone_gc_query(
	struct xfs_mount	*mp,
	struct xfs_zone_gc_iter	*iter)
{
	struct xfs_rtgroup	*rtg = iter->victim_rtg;
	struct xfs_rmap_irec	ri_low = { };
	struct xfs_rmap_irec	ri_high;
	struct xfs_btree_cur	*cur;
	struct xfs_trans	*tp;
	int			error;

	ASSERT(iter->next_startblock <= rtg_blocks(rtg));
	if (iter->next_startblock == rtg_blocks(rtg))
		goto done;

	ASSERT(iter->next_startblock < rtg_blocks(rtg));
	ri_low.rm_startblock = iter->next_startblock;
	memset(&ri_high, 0xFF, sizeof(ri_high));

	iter->rec_idx = 0;
	iter->rec_count = 0;

	tp = xfs_trans_alloc_empty(mp);
	xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
	cur = xfs_rtrmapbt_init_cursor(tp, rtg);
	error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
			xfs_zone_gc_query_cb, iter);
	xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
	xfs_btree_del_cursor(cur, error < 0 ? error : 0);
	xfs_trans_cancel(tp);

	if (error < 0)
		return error;

	/*
	 * Sort the rmap records by inode number and increasing offset to
	 * defragment the mappings.
	 *
	 * This could be further enhanced by an even bigger look ahead window,
	 * but that's better left until we have better detection of changes to
	 * inode mapping to avoid the potential of GCing already dead data.
	 */
	sort(iter->recs, iter->rec_count, sizeof(iter->recs[0]),
			xfs_zone_gc_rmap_rec_cmp, NULL);

	if (error == 0) {
		/*
		 * We finished iterating through the zone.
		 */
		iter->next_startblock = rtg_blocks(rtg);
		if (iter->rec_count == 0)
			goto done;
	}

	return 0;
done:
	xfs_rtgroup_rele(iter->victim_rtg);
	iter->victim_rtg = NULL;
	return 0;
}

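/*
 * Find the next record to GC and grab a reference to the inode owning it.
 * Records for already deleted inodes and for inodes that are not regular
 * realtime files are skipped.
 */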
static bool
xfs_zone_gc_iter_next(
	struct xfs_mount	*mp,
	struct xfs_zone_gc_iter	*iter,
	struct xfs_rmap_irec	*chunk_rec,
	struct xfs_inode	**ipp)
{
	struct xfs_rmap_irec	*irec;
	int			error;

	if (!iter->victim_rtg)
		return false;

retry:
	if (iter->rec_idx == iter->rec_count) {
		error = xfs_zone_gc_query(mp, iter);
		if (error)
			goto fail;
		if (!iter->victim_rtg)
			return false;
	}

	irec = &iter->recs[iter->rec_idx];
	error = xfs_iget(mp, NULL, irec->rm_owner,
			XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE, 0, ipp);
	if (error) {
		/*
		 * If the inode was already deleted, skip over it.
		 */
		if (error == -ENOENT) {
			iter->rec_idx++;
			goto retry;
		}
		goto fail;
	}

	if (!S_ISREG(VFS_I(*ipp)->i_mode) || !XFS_IS_REALTIME_INODE(*ipp)) {
		iter->rec_idx++;
		xfs_irele(*ipp);
		goto retry;
	}

	*chunk_rec = *irec;
	return true;

fail:
	xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
	return false;
}

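/* Advance the iterator past the blocks just submitted for GC. */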
static void
xfs_zone_gc_iter_advance(
	struct xfs_zone_gc_iter	*iter,
	xfs_extlen_t		count_fsb)
{
	struct xfs_rmap_irec	*irec = &iter->recs[iter->rec_idx];

	irec->rm_offset += count_fsb;
	irec->rm_startblock += count_fsb;
	irec->rm_blockcount -= count_fsb;
	if (!irec->rm_blockcount)
		iter->rec_idx++;
}

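/*
 * Pick the zone with the least used blocks from the given used-space bucket,
 * skipping zones that are only waiting for a reset.
 */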
static struct xfs_rtgroup *
xfs_zone_gc_pick_victim_from(
	struct xfs_mount	*mp,
	uint32_t		bucket)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;
	uint32_t		victim_used = U32_MAX;
	struct xfs_rtgroup	*victim_rtg = NULL;
	uint32_t		bit;

	if (!zi->zi_used_bucket_entries[bucket])
		return NULL;

	for_each_set_bit(bit, zi->zi_used_bucket_bitmap[bucket],
			mp->m_sb.sb_rgcount) {
		struct xfs_rtgroup *rtg = xfs_rtgroup_grab(mp, bit);

		if (!rtg)
			continue;

		/* skip zones that are just waiting for a reset */
		if (rtg_rmap(rtg)->i_used_blocks == 0 ||
		    rtg_rmap(rtg)->i_used_blocks >= victim_used) {
			xfs_rtgroup_rele(rtg);
			continue;
		}

		if (victim_rtg)
			xfs_rtgroup_rele(victim_rtg);
		victim_rtg = rtg;
		victim_used = rtg_rmap(rtg)->i_used_blocks;

		/*
		 * Any zone that is less than 1 percent used is fair game for
		 * instant reclaim.  All of these zones are in the last
		 * bucket, so avoid the expensive division for the zones
		 * in the other buckets.
		 */
		if (bucket == 0 &&
		    rtg_rmap(rtg)->i_used_blocks < rtg_blocks(rtg) / 100)
			break;
	}

	return victim_rtg;
}

/*
 * Iterate through all zones marked as reclaimable and find a candidate to
 * reclaim.
 */
static bool
xfs_zone_gc_select_victim(
	struct xfs_zone_gc_data	*data)
{
	struct xfs_zone_gc_iter	*iter = &data->iter;
	struct xfs_mount	*mp = data->mp;
	struct xfs_zone_info	*zi = mp->m_zone_info;
	struct xfs_rtgroup	*victim_rtg = NULL;
	unsigned int		bucket;

	if (xfs_is_shutdown(mp))
		return false;

	if (iter->victim_rtg)
		return true;

	/*
	 * Don't start new work if we are asked to stop or park.
	 */
	if (kthread_should_stop() || kthread_should_park())
		return false;

	if (!xfs_zoned_need_gc(mp))
		return false;

	spin_lock(&zi->zi_used_buckets_lock);
	for (bucket = 0; bucket < XFS_ZONE_USED_BUCKETS; bucket++) {
		victim_rtg = xfs_zone_gc_pick_victim_from(mp, bucket);
		if (victim_rtg)
			break;
	}
	spin_unlock(&zi->zi_used_buckets_lock);

	if (!victim_rtg)
		return false;

	trace_xfs_zone_gc_select_victim(victim_rtg, bucket);
	xfs_zone_gc_iter_init(iter, victim_rtg);
	return true;
}

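/*
 * Remove the open zone with the least allocated space from the list of
 * regular open zones so that it can be used as the GC target.
 */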
static struct xfs_open_zone *
xfs_zone_gc_steal_open(
	struct xfs_zone_info	*zi)
{
	struct xfs_open_zone	*oz, *found = NULL;

	spin_lock(&zi->zi_open_zones_lock);
	list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) {
		if (!found || oz->oz_allocated < found->oz_allocated)
			found = oz;
	}

	if (found) {
		found->oz_is_gc = true;
		list_del_init(&found->oz_entry);
		zi->zi_nr_open_zones--;
	}

	spin_unlock(&zi->zi_open_zones_lock);
	return found;
}

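/*
 * Open a new GC target zone, but only once all writes to the previous one
 * have completed.
 */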
static struct xfs_open_zone *
xfs_zone_gc_select_target(
	struct xfs_mount	*mp)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;
	struct xfs_open_zone	*oz = zi->zi_open_gc_zone;

	/*
	 * We need to wait for pending writes to finish.
	 */
	if (oz && oz->oz_written < rtg_blocks(oz->oz_rtg))
		return NULL;

	ASSERT(zi->zi_nr_open_zones <=
		mp->m_max_open_zones - XFS_OPEN_GC_ZONES);
	oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true);
	if (oz)
		trace_xfs_zone_gc_target_opened(oz->oz_rtg);
	spin_lock(&zi->zi_open_zones_lock);
	zi->zi_open_gc_zone = oz;
	spin_unlock(&zi->zi_open_zones_lock);
	return oz;
}

/*
 * Ensure we have a valid open zone to write the GC data to.
 *
 * If the current target zone has space keep writing to it, else first wait for
 * all pending writes and then pick a new one.
 */
static struct xfs_open_zone *
xfs_zone_gc_ensure_target(
	struct xfs_mount	*mp)
{
	struct xfs_open_zone	*oz = mp->m_zone_info->zi_open_gc_zone;

	if (!oz || oz->oz_allocated == rtg_blocks(oz->oz_rtg))
		return xfs_zone_gc_select_target(mp);
	return oz;
}

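/* Space left in the scratch folio currently used for reads. */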
static unsigned int
xfs_zone_gc_scratch_available(
	struct xfs_zone_gc_data	*data)
{
	return XFS_GC_CHUNK_SIZE - data->scratch[data->scratch_idx].offset;
}

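/*
 * Check that there is both a GC target zone with unallocated blocks and
 * scratch space left to read into.
 */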
static bool
xfs_zone_gc_space_available(
	struct xfs_zone_gc_data	*data)
{
	struct xfs_open_zone	*oz;

	oz = xfs_zone_gc_ensure_target(data->mp);
	if (!oz)
		return false;
	return oz->oz_allocated < rtg_blocks(oz->oz_rtg) &&
		xfs_zone_gc_scratch_available(data);
}

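/*
 * I/O completion handler for all GC bios: mark the chunk as done and wake
 * the GC thread to process it.
 */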
static void
xfs_zone_gc_end_io(
	struct bio		*bio)
{
	struct xfs_gc_bio	*chunk =
		container_of(bio, struct xfs_gc_bio, bio);
	struct xfs_zone_gc_data	*data = chunk->data;

	WRITE_ONCE(chunk->state, XFS_GC_BIO_DONE);
	wake_up_process(data->mp->m_zone_info->zi_gc_thread);
}

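/*
 * Hand out blocks in the GC target zone for the next chunk, bounded by the
 * scratch space, the space left in the target zone and the reserved free
 * space pools.
 */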
static struct xfs_open_zone *
xfs_zone_gc_alloc_blocks(
	struct xfs_zone_gc_data	*data,
	xfs_extlen_t		*count_fsb,
	xfs_daddr_t		*daddr,
	bool			*is_seq)
{
	struct xfs_mount	*mp = data->mp;
	struct xfs_open_zone	*oz;

	oz = xfs_zone_gc_ensure_target(mp);
	if (!oz)
		return NULL;

	*count_fsb = min(*count_fsb,
		XFS_B_TO_FSB(mp, xfs_zone_gc_scratch_available(data)));

	/*
	 * Directly allocate GC blocks from the reserved pool.
	 *
	 * If we'd take them from the normal pool we could be stealing blocks
	 * from a regular writer, which would then have to wait for GC and
	 * deadlock.
	 */
	spin_lock(&mp->m_sb_lock);
	*count_fsb = min(*count_fsb,
			rtg_blocks(oz->oz_rtg) - oz->oz_allocated);
	*count_fsb = min3(*count_fsb,
		mp->m_free[XC_FREE_RTEXTENTS].res_avail,
		mp->m_free[XC_FREE_RTAVAILABLE].res_avail);
	mp->m_free[XC_FREE_RTEXTENTS].res_avail -= *count_fsb;
	mp->m_free[XC_FREE_RTAVAILABLE].res_avail -= *count_fsb;
	spin_unlock(&mp->m_sb_lock);

	if (!*count_fsb)
		return NULL;

	*daddr = xfs_gbno_to_daddr(&oz->oz_rtg->rtg_group, 0);
	*is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *daddr);
	if (!*is_seq)
		*daddr += XFS_FSB_TO_BB(mp, oz->oz_allocated);
	oz->oz_allocated += *count_fsb;
	atomic_inc(&oz->oz_ref);
	return oz;
}

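/*
 * Start reading the next chunk of live data from the victim zone into the
 * scratch folio.
 */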
static bool
xfs_zone_gc_start_chunk(
	struct xfs_zone_gc_data	*data)
{
	struct xfs_zone_gc_iter	*iter = &data->iter;
	struct xfs_mount	*mp = data->mp;
	struct block_device	*bdev = mp->m_rtdev_targp->bt_bdev;
	struct xfs_open_zone	*oz;
	struct xfs_rmap_irec	irec;
	struct xfs_gc_bio	*chunk;
	struct xfs_inode	*ip;
	struct bio		*bio;
	xfs_daddr_t		daddr;
	bool			is_seq;

	if (xfs_is_shutdown(mp))
		return false;

	if (!xfs_zone_gc_iter_next(mp, iter, &irec, &ip))
		return false;
	oz = xfs_zone_gc_alloc_blocks(data, &irec.rm_blockcount, &daddr,
			&is_seq);
	if (!oz) {
		xfs_irele(ip);
		return false;
	}

	bio = bio_alloc_bioset(bdev, 1, REQ_OP_READ, GFP_NOFS, &data->bio_set);

	chunk = container_of(bio, struct xfs_gc_bio, bio);
	chunk->ip = ip;
	chunk->offset = XFS_FSB_TO_B(mp, irec.rm_offset);
	chunk->len = XFS_FSB_TO_B(mp, irec.rm_blockcount);
	chunk->old_startblock =
		xfs_rgbno_to_rtb(iter->victim_rtg, irec.rm_startblock);
	chunk->new_daddr = daddr;
	chunk->is_seq = is_seq;
	chunk->scratch = &data->scratch[data->scratch_idx];
	chunk->data = data;
	chunk->oz = oz;

	bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock);
	bio->bi_end_io = xfs_zone_gc_end_io;
	bio_add_folio_nofail(bio, chunk->scratch->folio, chunk->len,
			chunk->scratch->offset);
	chunk->scratch->offset += chunk->len;
	if (chunk->scratch->offset == XFS_GC_CHUNK_SIZE) {
		data->scratch_idx =
			(data->scratch_idx + 1) % XFS_ZONE_GC_NR_SCRATCH;
	}
	WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
	list_add_tail(&chunk->entry, &data->reading);
	xfs_zone_gc_iter_advance(iter, irec.rm_blockcount);

	submit_bio(bio);
	return true;
}

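/* Drop all references held by a GC chunk and free it. */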
static void
xfs_zone_gc_free_chunk(
	struct xfs_gc_bio	*chunk)
{
	list_del(&chunk->entry);
	xfs_open_zone_put(chunk->oz);
	xfs_irele(chunk->ip);
	bio_put(&chunk->bio);
}

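/*
 * Submit the write for a GC chunk, using a zone append operation for
 * sequential write required zones.
 */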
static void
xfs_zone_gc_submit_write(
	struct xfs_zone_gc_data	*data,
	struct xfs_gc_bio	*chunk)
{
	if (chunk->is_seq) {
		chunk->bio.bi_opf &= ~REQ_OP_WRITE;
		chunk->bio.bi_opf |= REQ_OP_ZONE_APPEND;
	}
	chunk->bio.bi_iter.bi_sector = chunk->new_daddr;
	chunk->bio.bi_end_io = xfs_zone_gc_end_io;
	submit_bio(&chunk->bio);
}

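/*
 * Split off the front of a write that exceeds the hardware zone append limit
 * into a new chunk.  Returns NULL when no split is needed.
 */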
static struct xfs_gc_bio *
xfs_zone_gc_split_write(
	struct xfs_zone_gc_data	*data,
	struct xfs_gc_bio	*chunk)
{
	struct queue_limits	*lim =
		&bdev_get_queue(chunk->bio.bi_bdev)->limits;
	struct xfs_gc_bio	*split_chunk;
	int			split_sectors;
	unsigned int		split_len;
	struct bio		*split;
	unsigned int		nsegs;

	if (!chunk->is_seq)
		return NULL;

	split_sectors = bio_split_rw_at(&chunk->bio, lim, &nsegs,
		lim->max_zone_append_sectors << SECTOR_SHIFT);
	if (!split_sectors)
		return NULL;

	/* ensure the split chunk is still block size aligned */
	split_sectors = ALIGN_DOWN(split_sectors << SECTOR_SHIFT,
			data->mp->m_sb.sb_blocksize) >> SECTOR_SHIFT;
	split_len = split_sectors << SECTOR_SHIFT;

	split = bio_split(&chunk->bio, split_sectors, GFP_NOFS, &data->bio_set);
	split_chunk = container_of(split, struct xfs_gc_bio, bio);
	split_chunk->data = data;
	ihold(VFS_I(chunk->ip));
	split_chunk->ip = chunk->ip;
	split_chunk->is_seq = chunk->is_seq;
	split_chunk->scratch = chunk->scratch;
	split_chunk->offset = chunk->offset;
	split_chunk->len = split_len;
	split_chunk->old_startblock = chunk->old_startblock;
	split_chunk->new_daddr = chunk->new_daddr;
	split_chunk->oz = chunk->oz;
	atomic_inc(&chunk->oz->oz_ref);

	chunk->offset += split_len;
	chunk->len -= split_len;
	chunk->old_startblock += XFS_B_TO_FSB(data->mp, split_len);

	/* add right before the original chunk */
	WRITE_ONCE(split_chunk->state, XFS_GC_BIO_NEW);
	list_add_tail(&split_chunk->entry, &chunk->entry);
	return split_chunk;
}

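/*
 * Turn a completed read chunk into one or more writes to the GC target zone
 * and submit them.
 */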
static void
xfs_zone_gc_write_chunk(
	struct xfs_gc_bio	*chunk)
{
	struct xfs_zone_gc_data	*data = chunk->data;
	struct xfs_mount	*mp = chunk->ip->i_mount;
	phys_addr_t		bvec_paddr =
		bvec_phys(bio_first_bvec_all(&chunk->bio));
	struct xfs_gc_bio	*split_chunk;

	if (chunk->bio.bi_status)
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
	if (xfs_is_shutdown(mp)) {
		xfs_zone_gc_free_chunk(chunk);
		return;
	}

	WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
	list_move_tail(&chunk->entry, &data->writing);

	bio_reset(&chunk->bio, mp->m_rtdev_targp->bt_bdev, REQ_OP_WRITE);
	bio_add_folio_nofail(&chunk->bio, chunk->scratch->folio, chunk->len,
			offset_in_folio(chunk->scratch->folio, bvec_paddr));

	while ((split_chunk = xfs_zone_gc_split_write(data, chunk)))
		xfs_zone_gc_submit_write(data, split_chunk);
	xfs_zone_gc_submit_write(data, chunk);
}

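/*
 * Complete a GC write: return the scratch space and remap the file range to
 * its new location, unless another write raced with us in the meantime.
 */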
static void
xfs_zone_gc_finish_chunk(
	struct xfs_gc_bio	*chunk)
{
	uint			iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
	struct xfs_inode	*ip = chunk->ip;
	struct xfs_mount	*mp = ip->i_mount;
	int			error;

	if (chunk->bio.bi_status)
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
	if (xfs_is_shutdown(mp)) {
		xfs_zone_gc_free_chunk(chunk);
		return;
	}

	chunk->scratch->freed += chunk->len;
	if (chunk->scratch->freed == chunk->scratch->offset) {
		chunk->scratch->offset = 0;
		chunk->scratch->freed = 0;
	}

	/*
	 * Cycle through the iolock and wait for direct I/O and layouts to
	 * ensure no one is reading from the old mapping before it goes away.
	 *
	 * Note that xfs_zoned_end_io() below checks that no other writer raced
	 * with us to update the mapping by checking that the old startblock
	 * didn't change.
	 */
	xfs_ilock(ip, iolock);
	error = xfs_break_layouts(VFS_I(ip), &iolock, BREAK_UNMAP);
	if (!error)
		inode_dio_wait(VFS_I(ip));
	xfs_iunlock(ip, iolock);
	if (error)
		goto free;

	if (chunk->is_seq)
		chunk->new_daddr = chunk->bio.bi_iter.bi_sector;
	error = xfs_zoned_end_io(ip, chunk->offset, chunk->len,
			chunk->new_daddr, chunk->oz, chunk->old_startblock);
free:
	if (error)
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
	xfs_zone_gc_free_chunk(chunk);
}

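/*
 * Complete a zone reset: mark the zone free again and make its blocks
 * available for new allocations.
 */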
static void
xfs_zone_gc_finish_reset(
	struct xfs_gc_bio	*chunk)
{
	struct xfs_rtgroup	*rtg = chunk->bio.bi_private;
	struct xfs_mount	*mp = rtg_mount(rtg);
	struct xfs_zone_info	*zi = mp->m_zone_info;

	if (chunk->bio.bi_status) {
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
		goto out;
	}

	xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_FREE);
	atomic_inc(&zi->zi_nr_free_zones);

	xfs_zoned_add_available(mp, rtg_blocks(rtg));

	wake_up_all(&zi->zi_zone_wait);
out:
	list_del(&chunk->entry);
	bio_put(&chunk->bio);
}

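/*
 * Prepare a bio to reset a zone.  Conventional zones are discarded instead;
 * if discard is not supported either, return false.
 */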
static bool
xfs_zone_gc_prepare_reset(
	struct bio		*bio,
	struct xfs_rtgroup	*rtg)
{
	trace_xfs_zone_reset(rtg);

	ASSERT(rtg_rmap(rtg)->i_used_blocks == 0);
	bio->bi_iter.bi_sector = xfs_gbno_to_daddr(&rtg->rtg_group, 0);
	if (!bdev_zone_is_seq(bio->bi_bdev, bio->bi_iter.bi_sector)) {
		if (!bdev_max_discard_sectors(bio->bi_bdev))
			return false;
		bio->bi_opf = REQ_OP_DISCARD | REQ_SYNC;
		bio->bi_iter.bi_size =
			XFS_FSB_TO_B(rtg_mount(rtg), rtg_blocks(rtg));
	}

	return true;
}

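/* Synchronously reset (or discard) a single zone. */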
int
xfs_zone_gc_reset_sync(
	struct xfs_rtgroup	*rtg)
{
	int			error = 0;
	struct bio		bio;

	bio_init(&bio, rtg_mount(rtg)->m_rtdev_targp->bt_bdev, NULL, 0,
			REQ_OP_ZONE_RESET);
	if (xfs_zone_gc_prepare_reset(&bio, rtg))
		error = submit_bio_wait(&bio);
	bio_uninit(&bio);

	return error;
}

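/*
 * Reset all zones on the passed-in list after flushing the RT device write
 * cache and forcing the log for each zone's rmap inode.
 */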
static void
xfs_zone_gc_reset_zones(
	struct xfs_zone_gc_data	*data,
	struct xfs_group	*reset_list)
{
	struct xfs_group	*next = reset_list;

	if (blkdev_issue_flush(data->mp->m_rtdev_targp->bt_bdev) < 0) {
		xfs_force_shutdown(data->mp, SHUTDOWN_META_IO_ERROR);
		return;
	}

	do {
		struct xfs_rtgroup	*rtg = to_rtg(next);
		struct xfs_gc_bio	*chunk;
		struct bio		*bio;

		xfs_log_force_inode(rtg_rmap(rtg));

		next = rtg_group(rtg)->xg_next_reset;
		rtg_group(rtg)->xg_next_reset = NULL;

		bio = bio_alloc_bioset(rtg_mount(rtg)->m_rtdev_targp->bt_bdev,
				0, REQ_OP_ZONE_RESET, GFP_NOFS, &data->bio_set);
		bio->bi_private = rtg;
		bio->bi_end_io = xfs_zone_gc_end_io;

		chunk = container_of(bio, struct xfs_gc_bio, bio);
		chunk->data = data;
		WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
		list_add_tail(&chunk->entry, &data->resetting);

		/*
		 * Also use the bio to drive the state machine when neither
		 * zone reset nor discard is supported to keep things simple.
		 */
		if (xfs_zone_gc_prepare_reset(bio, rtg))
			submit_bio(bio);
		else
			bio_endio(bio);
	} while (next);
}

/*
 * Handle the work to read and write data for GC and to reset the zones,
 * including handling all completions.
 *
 * Note that the order of the chunks is preserved so that we don't undo the
 * optimal order established by xfs_zone_gc_query().
 */
static bool
xfs_zone_gc_handle_work(
	struct xfs_zone_gc_data	*data)
{
	struct xfs_zone_info	*zi = data->mp->m_zone_info;
	struct xfs_gc_bio	*chunk, *next;
	struct xfs_group	*reset_list;
	struct blk_plug		plug;

	spin_lock(&zi->zi_reset_list_lock);
	reset_list = zi->zi_reset_list;
	zi->zi_reset_list = NULL;
	spin_unlock(&zi->zi_reset_list_lock);

	if (!xfs_zone_gc_select_victim(data) ||
	    !xfs_zone_gc_space_available(data)) {
		if (list_empty(&data->reading) &&
		    list_empty(&data->writing) &&
		    list_empty(&data->resetting) &&
		    !reset_list)
			return false;
	}

	__set_current_state(TASK_RUNNING);
	try_to_freeze();

	if (reset_list)
		xfs_zone_gc_reset_zones(data, reset_list);

	list_for_each_entry_safe(chunk, next, &data->resetting, entry) {
		if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
			break;
		xfs_zone_gc_finish_reset(chunk);
	}

	list_for_each_entry_safe(chunk, next, &data->writing, entry) {
		if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
			break;
		xfs_zone_gc_finish_chunk(chunk);
	}

	blk_start_plug(&plug);
	list_for_each_entry_safe(chunk, next, &data->reading, entry) {
		if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
			break;
		xfs_zone_gc_write_chunk(chunk);
	}
	blk_finish_plug(&plug);

	blk_start_plug(&plug);
	while (xfs_zone_gc_start_chunk(data))
		;
	blk_finish_plug(&plug);
	return true;
}

/*
 * Note that the current GC algorithm would break reflinks and thus duplicate
 * data that was shared by multiple owners before.  Because of that reflinks
 * are currently not supported on zoned file systems and can't be created or
 * mounted.
 */
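/* Main loop of the zone GC background thread. */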
static int
xfs_zoned_gcd(
	void			*private)
{
	struct xfs_zone_gc_data	*data = private;
	struct xfs_mount	*mp = data->mp;
	struct xfs_zone_info	*zi = mp->m_zone_info;
	unsigned int		nofs_flag;

	nofs_flag = memalloc_nofs_save();
	set_freezable();

	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);
		xfs_set_zonegc_running(mp);
		if (xfs_zone_gc_handle_work(data))
			continue;

		if (list_empty(&data->reading) &&
		    list_empty(&data->writing) &&
		    list_empty(&data->resetting) &&
		    !zi->zi_reset_list) {
			xfs_clear_zonegc_running(mp);
			xfs_zoned_resv_wake_all(mp);

			if (kthread_should_stop()) {
				__set_current_state(TASK_RUNNING);
				break;
			}

			if (kthread_should_park()) {
				__set_current_state(TASK_RUNNING);
				kthread_parkme();
				continue;
			}
		}

		schedule();
	}
	xfs_clear_zonegc_running(mp);

	if (data->iter.victim_rtg)
		xfs_rtgroup_rele(data->iter.victim_rtg);

	memalloc_nofs_restore(nofs_flag);
	xfs_zone_gc_data_free(data);
	return 0;
}

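/* Unpark the GC thread so that it can start working again. */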
void
xfs_zone_gc_start(
	struct xfs_mount	*mp)
{
	if (xfs_has_zoned(mp))
		kthread_unpark(mp->m_zone_info->zi_gc_thread);
}

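/* Park the GC thread so that it does not start any new work. */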
void
xfs_zone_gc_stop(
	struct xfs_mount	*mp)
{
	if (xfs_has_zoned(mp))
		kthread_park(mp->m_zone_info->zi_gc_thread);
}

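/*
 * Set up the GC infrastructure at mount time: pick the initial GC target
 * zone and create the (initially parked) GC thread.
 */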
int
xfs_zone_gc_mount(
	struct xfs_mount	*mp)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;
	struct xfs_zone_gc_data	*data;
	struct xfs_open_zone	*oz;
	int			error;

	/*
	 * If there are no free zones available for GC, pick the open zone with
	 * the least used space to GC into.  This should only happen after an
	 * unclean shutdown near ENOSPC while GC was ongoing.
	 *
	 * We also need to do this for the first gc zone allocation if we
	 * unmounted while at the open limit.
	 */
	if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_FREE) ||
	    zi->zi_nr_open_zones == mp->m_max_open_zones)
		oz = xfs_zone_gc_steal_open(zi);
	else
		oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true);
	if (!oz) {
		xfs_warn(mp, "unable to allocate a zone for gc");
		error = -EIO;
		goto out;
	}

	trace_xfs_zone_gc_target_opened(oz->oz_rtg);
	zi->zi_open_gc_zone = oz;

	data = xfs_zone_gc_data_alloc(mp);
	if (!data) {
		error = -ENOMEM;
		goto out_put_gc_zone;
	}

	mp->m_zone_info->zi_gc_thread = kthread_create(xfs_zoned_gcd, data,
			"xfs-zone-gc/%s", mp->m_super->s_id);
	if (IS_ERR(mp->m_zone_info->zi_gc_thread)) {
		xfs_warn(mp, "unable to create zone gc thread");
		error = PTR_ERR(mp->m_zone_info->zi_gc_thread);
		goto out_free_gc_data;
	}

	/* xfs_zone_gc_start will unpark for rw mounts */
	kthread_park(mp->m_zone_info->zi_gc_thread);
	return 0;

out_free_gc_data:
	kfree(data);
out_put_gc_zone:
	xfs_open_zone_put(zi->zi_open_gc_zone);
out:
	return error;
}

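/* Tear down the GC thread and release the GC target zone at unmount time. */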
void
xfs_zone_gc_unmount(
	struct xfs_mount	*mp)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;

	kthread_stop(zi->zi_gc_thread);
	if (zi->zi_open_gc_zone)
		xfs_open_zone_put(zi->zi_open_gc_zone);
}