XFS - new code for 6.15

Signed-off-by: Carlos Maiolino <cem@kernel.org>
 -----BEGIN PGP SIGNATURE-----
 
 iJUEABMJAB0WIQSmtYVZ/MfVMGUq1GNcsMJ8RxYuYwUCZ+Eg9QAKCRBcsMJ8RxYu
 Y707AYCssAqTYkwPm937oACNbNpL3d2Q/3kP6ku+LmEZM+1HlD2K9cwsdqEWBcWw
 oPA4ClwBgKP2dnn66oaaFSxEWMj/1evpbzAqptSKBJys83Ge7PFGiFG4Tyk7AUvl
 kAc1FcIYrQ==
 =9SdI
 -----END PGP SIGNATURE-----

Merge tag 'xfs-6.15-merge' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux

Pull xfs updates from Carlos Maiolino:

 - XFS zoned allocator: Enables XFS to support zoned devices using its
   real-time allocator

 - Use folios/vmalloc for buffer cache backing memory

 - Some code cleanups and bug fixes

* tag 'xfs-6.15-merge' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux: (70 commits)
  xfs: remove the flags argument to xfs_buf_get_uncached
  xfs: remove the flags argument to xfs_buf_read_uncached
  xfs: remove xfs_buf_free_maps
  xfs: remove xfs_buf_get_maps
  xfs: call xfs_buf_alloc_backing_mem from _xfs_buf_alloc
  xfs: remove unnecessary NULL check before kvfree()
  xfs: don't wake zone space waiters without m_zone_info
  xfs: don't increment m_generation for all errors in xfs_growfs_data
  xfs: fix a missing unlock in xfs_growfs_data
  xfs: Remove duplicate xfs_rtbitmap.h header
  xfs: trigger zone GC when out of available rt blocks
  xfs: trace what memory backs a buffer
  xfs: cleanup mapping tmpfs folios into the buffer cache
  xfs: use vmalloc instead of vm_map_area for buffer backing memory
  xfs: buffer items don't straddle pages anymore
  xfs: kill XBF_UNMAPPED
  xfs: convert buffer cache to use high order folios
  xfs: remove the kmalloc to page allocator fallback
  xfs: refactor backing memory allocations for buffers
  xfs: remove xfs_buf_is_vmapped
  ...
Linus Torvalds 2025-03-27 13:07:00 -07:00
commit c148bc7535
82 changed files with 5934 additions and 1502 deletions


@ -64,6 +64,7 @@ xfs-y += $(addprefix libxfs/, \
xfs-$(CONFIG_XFS_RT) += $(addprefix libxfs/, \
xfs_rtbitmap.o \
xfs_rtgroup.o \
xfs_zones.o \
)
# highlevel code
@ -136,7 +137,11 @@ xfs-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \
xfs_quotaops.o
# xfs_rtbitmap is shared with libxfs
xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o
xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o \
xfs_zone_alloc.o \
xfs_zone_gc.o \
xfs_zone_info.o \
xfs_zone_space_resv.o
xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o
xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o


@ -301,7 +301,7 @@ xfs_get_aghdr_buf(
struct xfs_buf *bp;
int error;
error = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, 0, &bp);
error = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, &bp);
if (error)
return error;


@ -34,13 +34,13 @@
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
#include "xfs_refcount.h"
#include "xfs_icache.h"
#include "xfs_iomap.h"
#include "xfs_health.h"
#include "xfs_bmap_item.h"
#include "xfs_symlink_remote.h"
#include "xfs_inode_util.h"
#include "xfs_rtgroup.h"
#include "xfs_zone_alloc.h"
struct kmem_cache *xfs_bmap_intent_cache;
@ -171,18 +171,16 @@ xfs_bmbt_update(
* Compute the worst-case number of indirect blocks that will be used
* for ip's delayed extent of length "len".
*/
STATIC xfs_filblks_t
xfs_filblks_t
xfs_bmap_worst_indlen(
xfs_inode_t *ip, /* incore inode pointer */
xfs_filblks_t len) /* delayed extent length */
struct xfs_inode *ip, /* incore inode pointer */
xfs_filblks_t len) /* delayed extent length */
{
int level; /* btree level number */
int maxrecs; /* maximum record count at this level */
xfs_mount_t *mp; /* mount structure */
xfs_filblks_t rval; /* return value */
struct xfs_mount *mp = ip->i_mount;
int maxrecs = mp->m_bmap_dmxr[0];
int level;
xfs_filblks_t rval;
mp = ip->i_mount;
maxrecs = mp->m_bmap_dmxr[0];
for (level = 0, rval = 0;
level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK);
level++) {
@ -2571,146 +2569,6 @@ done:
#undef PREV
}
/*
* Convert a hole to a delayed allocation.
*/
STATIC void
xfs_bmap_add_extent_hole_delay(
xfs_inode_t *ip, /* incore inode pointer */
int whichfork,
struct xfs_iext_cursor *icur,
xfs_bmbt_irec_t *new) /* new data to add to file extents */
{
struct xfs_ifork *ifp; /* inode fork pointer */
xfs_bmbt_irec_t left; /* left neighbor extent entry */
xfs_filblks_t newlen=0; /* new indirect size */
xfs_filblks_t oldlen=0; /* old indirect size */
xfs_bmbt_irec_t right; /* right neighbor extent entry */
uint32_t state = xfs_bmap_fork_to_state(whichfork);
xfs_filblks_t temp; /* temp for indirect calculations */
ifp = xfs_ifork_ptr(ip, whichfork);
ASSERT(isnullstartblock(new->br_startblock));
/*
* Check and set flags if this segment has a left neighbor
*/
if (xfs_iext_peek_prev_extent(ifp, icur, &left)) {
state |= BMAP_LEFT_VALID;
if (isnullstartblock(left.br_startblock))
state |= BMAP_LEFT_DELAY;
}
/*
* Check and set flags if the current (right) segment exists.
* If it doesn't exist, we're converting the hole at end-of-file.
*/
if (xfs_iext_get_extent(ifp, icur, &right)) {
state |= BMAP_RIGHT_VALID;
if (isnullstartblock(right.br_startblock))
state |= BMAP_RIGHT_DELAY;
}
/*
* Set contiguity flags on the left and right neighbors.
* Don't let extents get too large, even if the pieces are contiguous.
*/
if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) &&
left.br_startoff + left.br_blockcount == new->br_startoff &&
left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
state |= BMAP_LEFT_CONTIG;
if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) &&
new->br_startoff + new->br_blockcount == right.br_startoff &&
new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
(!(state & BMAP_LEFT_CONTIG) ||
(left.br_blockcount + new->br_blockcount +
right.br_blockcount <= XFS_MAX_BMBT_EXTLEN)))
state |= BMAP_RIGHT_CONTIG;
/*
* Switch out based on the contiguity flags.
*/
switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) {
case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
/*
* New allocation is contiguous with delayed allocations
* on the left and on the right.
* Merge all three into a single extent record.
*/
temp = left.br_blockcount + new->br_blockcount +
right.br_blockcount;
oldlen = startblockval(left.br_startblock) +
startblockval(new->br_startblock) +
startblockval(right.br_startblock);
newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
oldlen);
left.br_startblock = nullstartblock(newlen);
left.br_blockcount = temp;
xfs_iext_remove(ip, icur, state);
xfs_iext_prev(ifp, icur);
xfs_iext_update_extent(ip, state, icur, &left);
break;
case BMAP_LEFT_CONTIG:
/*
* New allocation is contiguous with a delayed allocation
* on the left.
* Merge the new allocation with the left neighbor.
*/
temp = left.br_blockcount + new->br_blockcount;
oldlen = startblockval(left.br_startblock) +
startblockval(new->br_startblock);
newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
oldlen);
left.br_blockcount = temp;
left.br_startblock = nullstartblock(newlen);
xfs_iext_prev(ifp, icur);
xfs_iext_update_extent(ip, state, icur, &left);
break;
case BMAP_RIGHT_CONTIG:
/*
* New allocation is contiguous with a delayed allocation
* on the right.
* Merge the new allocation with the right neighbor.
*/
temp = new->br_blockcount + right.br_blockcount;
oldlen = startblockval(new->br_startblock) +
startblockval(right.br_startblock);
newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
oldlen);
right.br_startoff = new->br_startoff;
right.br_startblock = nullstartblock(newlen);
right.br_blockcount = temp;
xfs_iext_update_extent(ip, state, icur, &right);
break;
case 0:
/*
* New allocation is not contiguous with another
* delayed allocation.
* Insert a new entry.
*/
oldlen = newlen = 0;
xfs_iext_insert(ip, icur, new, state);
break;
}
if (oldlen != newlen) {
ASSERT(oldlen > newlen);
xfs_add_fdblocks(ip->i_mount, oldlen - newlen);
/*
* Nothing to do for disk quota accounting here.
*/
xfs_mod_delalloc(ip, 0, (int64_t)newlen - oldlen);
}
}
/*
* Convert a hole to a real allocation.
*/
@ -4039,144 +3897,6 @@ xfs_bmapi_read(
return 0;
}
/*
* Add a delayed allocation extent to an inode. Blocks are reserved from the
* global pool and the extent inserted into the inode in-core extent tree.
*
* On entry, got refers to the first extent beyond the offset of the extent to
* allocate or eof is specified if no such extent exists. On return, got refers
* to the extent record that was inserted to the inode fork.
*
* Note that the allocated extent may have been merged with contiguous extents
* during insertion into the inode fork. Thus, got does not reflect the current
* state of the inode fork on return. If necessary, the caller can use lastx to
* look up the updated record in the inode fork.
*/
int
xfs_bmapi_reserve_delalloc(
struct xfs_inode *ip,
int whichfork,
xfs_fileoff_t off,
xfs_filblks_t len,
xfs_filblks_t prealloc,
struct xfs_bmbt_irec *got,
struct xfs_iext_cursor *icur,
int eof)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
xfs_extlen_t alen;
xfs_extlen_t indlen;
uint64_t fdblocks;
int error;
xfs_fileoff_t aoff;
bool use_cowextszhint =
whichfork == XFS_COW_FORK && !prealloc;
retry:
/*
* Cap the alloc length. Keep track of prealloc so we know whether to
* tag the inode before we return.
*/
aoff = off;
alen = XFS_FILBLKS_MIN(len + prealloc, XFS_MAX_BMBT_EXTLEN);
if (!eof)
alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff);
if (prealloc && alen >= len)
prealloc = alen - len;
/*
* If we're targetting the COW fork but aren't creating a speculative
* posteof preallocation, try to expand the reservation to align with
* the COW extent size hint if there's sufficient free space.
*
* Unlike the data fork, the CoW cancellation functions will free all
* the reservations at inactivation, so we don't require that every
* delalloc reservation have a dirty pagecache.
*/
if (use_cowextszhint) {
struct xfs_bmbt_irec prev;
xfs_extlen_t extsz = xfs_get_cowextsz_hint(ip);
if (!xfs_iext_peek_prev_extent(ifp, icur, &prev))
prev.br_startoff = NULLFILEOFF;
error = xfs_bmap_extsize_align(mp, got, &prev, extsz, 0, eof,
1, 0, &aoff, &alen);
ASSERT(!error);
}
/*
* Make a transaction-less quota reservation for delayed allocation
* blocks. This number gets adjusted later. We return if we haven't
* allocated blocks already inside this loop.
*/
error = xfs_quota_reserve_blkres(ip, alen);
if (error)
goto out;
/*
* Split changing sb for alen and indlen since they could be coming
* from different places.
*/
indlen = (xfs_extlen_t)xfs_bmap_worst_indlen(ip, alen);
ASSERT(indlen > 0);
fdblocks = indlen;
if (XFS_IS_REALTIME_INODE(ip)) {
error = xfs_dec_frextents(mp, xfs_blen_to_rtbxlen(mp, alen));
if (error)
goto out_unreserve_quota;
} else {
fdblocks += alen;
}
error = xfs_dec_fdblocks(mp, fdblocks, false);
if (error)
goto out_unreserve_frextents;
ip->i_delayed_blks += alen;
xfs_mod_delalloc(ip, alen, indlen);
got->br_startoff = aoff;
got->br_startblock = nullstartblock(indlen);
got->br_blockcount = alen;
got->br_state = XFS_EXT_NORM;
xfs_bmap_add_extent_hole_delay(ip, whichfork, icur, got);
/*
* Tag the inode if blocks were preallocated. Note that COW fork
* preallocation can occur at the start or end of the extent, even when
* prealloc == 0, so we must also check the aligned offset and length.
*/
if (whichfork == XFS_DATA_FORK && prealloc)
xfs_inode_set_eofblocks_tag(ip);
if (whichfork == XFS_COW_FORK && (prealloc || aoff < off || alen > len))
xfs_inode_set_cowblocks_tag(ip);
return 0;
out_unreserve_frextents:
if (XFS_IS_REALTIME_INODE(ip))
xfs_add_frextents(mp, xfs_blen_to_rtbxlen(mp, alen));
out_unreserve_quota:
if (XFS_IS_QUOTA_ON(mp))
xfs_quota_unreserve_blkres(ip, alen);
out:
if (error == -ENOSPC || error == -EDQUOT) {
trace_xfs_delalloc_enospc(ip, off, len);
if (prealloc || use_cowextszhint) {
/* retry without any preallocation */
use_cowextszhint = false;
prealloc = 0;
goto retry;
}
}
return error;
}
static int
xfs_bmapi_allocate(
struct xfs_bmalloca *bma)
@ -4948,7 +4668,8 @@ xfs_bmap_del_extent_delay(
int whichfork,
struct xfs_iext_cursor *icur,
struct xfs_bmbt_irec *got,
struct xfs_bmbt_irec *del)
struct xfs_bmbt_irec *del,
uint32_t bflags) /* bmapi flags */
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
@ -5068,10 +4789,18 @@ xfs_bmap_del_extent_delay(
da_diff = da_old - da_new;
fdblocks = da_diff;
if (isrt)
xfs_add_frextents(mp, xfs_blen_to_rtbxlen(mp, del->br_blockcount));
else
if (bflags & XFS_BMAPI_REMAP) {
;
} else if (isrt) {
xfs_rtbxlen_t rtxlen;
rtxlen = xfs_blen_to_rtbxlen(mp, del->br_blockcount);
if (xfs_is_zoned_inode(ip))
xfs_zoned_add_available(mp, rtxlen);
xfs_add_frextents(mp, rtxlen);
} else {
fdblocks += del->br_blockcount;
}
xfs_add_fdblocks(mp, fdblocks);
xfs_mod_delalloc(ip, -(int64_t)del->br_blockcount, -da_diff);
@ -5670,7 +5399,8 @@ __xfs_bunmapi(
delete:
if (wasdel) {
xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, &del);
xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got,
&del, flags);
} else {
error = xfs_bmap_del_extent_real(ip, tp, &icur, cur,
&del, &tmp_logflags, whichfork,


@ -204,7 +204,7 @@ int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_extnum_t nexts, int *done);
void xfs_bmap_del_extent_delay(struct xfs_inode *ip, int whichfork,
struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got,
struct xfs_bmbt_irec *del);
struct xfs_bmbt_irec *del, uint32_t bflags);
void xfs_bmap_del_extent_cow(struct xfs_inode *ip,
struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got,
struct xfs_bmbt_irec *del);
@ -219,10 +219,6 @@ int xfs_bmap_insert_extents(struct xfs_trans *tp, struct xfs_inode *ip,
bool *done, xfs_fileoff_t stop_fsb);
int xfs_bmap_split_extent(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_fileoff_t split_offset);
int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork,
xfs_fileoff_t off, xfs_filblks_t len, xfs_filblks_t prealloc,
struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur,
int eof);
int xfs_bmapi_convert_delalloc(struct xfs_inode *ip, int whichfork,
xfs_off_t offset, struct iomap *iomap, unsigned int *seq);
int xfs_bmap_add_extent_unwritten_real(struct xfs_trans *tp,
@ -233,6 +229,7 @@ xfs_extlen_t xfs_bmapi_minleft(struct xfs_trans *tp, struct xfs_inode *ip,
int fork);
int xfs_bmap_btalloc_low_space(struct xfs_bmalloca *ap,
struct xfs_alloc_arg *args);
xfs_filblks_t xfs_bmap_worst_indlen(struct xfs_inode *ip, xfs_filblks_t len);
enum xfs_bmap_intent_type {
XFS_BMAP_MAP = 1,


@ -178,9 +178,10 @@ typedef struct xfs_sb {
xfs_rgnumber_t sb_rgcount; /* number of realtime groups */
xfs_rtxlen_t sb_rgextents; /* size of a realtime group in rtx */
uint8_t sb_rgblklog; /* rt group number shift */
uint8_t sb_pad[7]; /* zeroes */
xfs_rfsblock_t sb_rtstart; /* start of internal RT section (FSB) */
xfs_filblks_t sb_rtreserved; /* reserved (zoned) RT blocks */
/* must be padded to 64 bit alignment */
} xfs_sb_t;
@ -270,9 +271,10 @@ struct xfs_dsb {
__be64 sb_metadirino; /* metadata directory tree root */
__be32 sb_rgcount; /* # of realtime groups */
__be32 sb_rgextents; /* size of rtgroup in rtx */
__u8 sb_rgblklog; /* rt group number shift */
__u8 sb_pad[7]; /* zeroes */
__be64 sb_rtstart; /* start of internal RT section (FSB) */
__be64 sb_rtreserved; /* reserved (zoned) RT blocks */
/*
* The size of this structure must be padded to 64 bit alignment.
@ -395,6 +397,9 @@ xfs_sb_has_ro_compat_feature(
#define XFS_SB_FEAT_INCOMPAT_EXCHRANGE (1 << 6) /* exchangerange supported */
#define XFS_SB_FEAT_INCOMPAT_PARENT (1 << 7) /* parent pointers */
#define XFS_SB_FEAT_INCOMPAT_METADIR (1 << 8) /* metadata dir tree */
#define XFS_SB_FEAT_INCOMPAT_ZONED (1 << 9) /* zoned RT allocator */
#define XFS_SB_FEAT_INCOMPAT_ZONE_GAPS (1 << 10) /* RTGs have LBA gaps */
#define XFS_SB_FEAT_INCOMPAT_ALL \
(XFS_SB_FEAT_INCOMPAT_FTYPE | \
XFS_SB_FEAT_INCOMPAT_SPINODES | \
@ -404,7 +409,9 @@ xfs_sb_has_ro_compat_feature(
XFS_SB_FEAT_INCOMPAT_NREXT64 | \
XFS_SB_FEAT_INCOMPAT_EXCHRANGE | \
XFS_SB_FEAT_INCOMPAT_PARENT | \
XFS_SB_FEAT_INCOMPAT_METADIR)
XFS_SB_FEAT_INCOMPAT_METADIR | \
XFS_SB_FEAT_INCOMPAT_ZONED | \
XFS_SB_FEAT_INCOMPAT_ZONE_GAPS)
#define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL
static inline bool
@ -952,7 +959,12 @@ struct xfs_dinode {
__be64 di_changecount; /* number of attribute changes */
__be64 di_lsn; /* flush sequence */
__be64 di_flags2; /* more random flags */
__be32 di_cowextsize; /* basic cow extent size for file */
union {
/* basic cow extent size for (regular) file */
__be32 di_cowextsize;
/* used blocks in RTG for (zoned) rtrmap inode */
__be32 di_used_blocks;
};
__u8 di_pad2[12]; /* more padding for future expansion */
/* fields only written to during inode creation */


@ -189,7 +189,9 @@ struct xfs_fsop_geom {
uint32_t checked; /* o: checked fs & rt metadata */
__u32 rgextents; /* rt extents in a realtime group */
__u32 rgcount; /* number of realtime groups */
__u64 reserved[16]; /* reserved space */
__u64 rtstart; /* start of internal rt section */
__u64 rtreserved; /* RT (zoned) reserved blocks */
__u64 reserved[14]; /* reserved space */
};
#define XFS_FSOP_GEOM_SICK_COUNTERS (1 << 0) /* summary counters */
@ -247,6 +249,7 @@ typedef struct xfs_fsop_resblks {
#define XFS_FSOP_GEOM_FLAGS_EXCHANGE_RANGE (1 << 24) /* exchange range */
#define XFS_FSOP_GEOM_FLAGS_PARENT (1 << 25) /* linux parent pointers */
#define XFS_FSOP_GEOM_FLAGS_METADIR (1 << 26) /* metadata directories */
#define XFS_FSOP_GEOM_FLAGS_ZONED (1 << 27) /* zoned rt device */
/*
* Minimum and maximum sizes need for growth checks.
@ -1079,6 +1082,15 @@ struct xfs_rtgroup_geometry {
#define XFS_IOC_COMMIT_RANGE _IOW ('X', 131, struct xfs_commit_range)
/* XFS_IOC_GETFSUUID ---------- deprecated 140 */
/*
* Devices supported by a single XFS file system. Reported in fsmaps fmr_device
* when using internal RT devices.
*/
enum xfs_device {
XFS_DEV_DATA = 1,
XFS_DEV_LOG = 2,
XFS_DEV_RT = 3,
};
#ifndef HAVE_BBMACROS
/*

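The xfs_fs.h hunk above adds rtstart/rtreserved to struct xfs_fsop_geom and the XFS_FSOP_GEOM_FLAGS_ZONED flag. As an illustration only (not part of this commit), here is a minimal userspace sketch that probes them through the existing XFS_IOC_FSGEOMETRY ioctl, assuming the updated <xfs/xfs_fs.h> UAPI header is installed:

/* Illustrative sketch, not from this commit: report the new zoned geometry
 * fields for the filesystem containing the given path (default "."). */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <xfs/xfs_fs.h>

int main(int argc, char **argv)
{
	struct xfs_fsop_geom geo = { 0 };
	int fd = open(argc > 1 ? argv[1] : ".", O_RDONLY);

	if (fd < 0 || ioctl(fd, XFS_IOC_FSGEOMETRY, &geo) < 0) {
		perror("XFS_IOC_FSGEOMETRY");
		return 1;
	}
	if (geo.flags & XFS_FSOP_GEOM_FLAGS_ZONED)
		printf("zoned rt: rtstart=%llu rtreserved=%llu\n",
		       (unsigned long long)geo.rtstart,
		       (unsigned long long)geo.rtreserved);
	else
		printf("not a zoned XFS filesystem\n");
	close(fd);
	return 0;
}
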

@ -19,10 +19,23 @@ struct xfs_group {
#ifdef __KERNEL__
/* -- kernel only structures below this line -- */
/*
* Track freed but not yet committed extents.
*/
struct xfs_extent_busy_tree *xg_busy_extents;
union {
/*
* For perags and non-zoned RT groups:
* Track freed but not yet committed extents.
*/
struct xfs_extent_busy_tree *xg_busy_extents;
/*
* For zoned RT groups:
* List of groups that need a zone reset.
*
* The zonegc code forces a log flush of the rtrmap inode before
* resetting the write pointer, so there is no need for
* individual busy extent tracking.
*/
struct xfs_group *xg_next_reset;
};
/*
* Bitsets of per-ag metadata that have been checked and/or are sick.
@ -107,9 +120,15 @@ xfs_gbno_to_daddr(
xfs_agblock_t gbno)
{
struct xfs_mount *mp = xg->xg_mount;
uint32_t blocks = mp->m_groups[xg->xg_type].blocks;
struct xfs_groups *g = &mp->m_groups[xg->xg_type];
xfs_fsblock_t fsbno;
return XFS_FSB_TO_BB(mp, (xfs_fsblock_t)xg->xg_gno * blocks + gbno);
if (g->has_daddr_gaps)
fsbno = xfs_gbno_to_fsb(xg, gbno);
else
fsbno = (xfs_fsblock_t)xg->xg_gno * g->blocks + gbno;
return XFS_FSB_TO_BB(mp, g->start_fsb + fsbno);
}
static inline uint32_t


@ -364,7 +364,7 @@ xfs_ialloc_inode_init(
(j * M_IGEO(mp)->blocks_per_cluster));
error = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
mp->m_bsize * M_IGEO(mp)->blocks_per_cluster,
XBF_UNMAPPED, &fbuf);
0, &fbuf);
if (error)
return error;
@ -1927,7 +1927,7 @@ xfs_dialloc(
* that we can immediately allocate, but then we allow allocation on the
* second pass if we fail to find an AG with free inodes in it.
*/
if (percpu_counter_read_positive(&mp->m_fdblocks) <
if (xfs_estimate_freecounter(mp, XC_FREE_BLOCKS) <
mp->m_low_space[XFS_LOWSP_1_PCNT]) {
ok_alloc = false;
low_space = true;

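The hunk above swaps a direct read of the m_fdblocks percpu counter for the new free-counter helpers indexed by enum xfs_free_counter. Their implementation is not shown in this diff, so the following is only a plausible sketch of their shape, assuming each mp->m_free[ctr] embeds a struct percpu_counter named count (as the scrub hunks later in this series suggest):

/*
 * Plausible sketch only -- the real helpers live in fs/xfs/xfs_mount.h and
 * are not part of this excerpt.
 */
static inline uint64_t
xfs_estimate_freecounter(
	struct xfs_mount	*mp,
	enum xfs_free_counter	ctr)
{
	/* cheap approximate read, good enough for low-space heuristics */
	return percpu_counter_read_positive(&mp->m_free[ctr].count);
}

static inline uint64_t
xfs_sum_freecounter(
	struct xfs_mount	*mp,
	enum xfs_free_counter	ctr)
{
	/* exact but more expensive: folds in every per-cpu delta */
	return percpu_counter_sum_positive(&mp->m_free[ctr].count);
}
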

@ -137,7 +137,7 @@ xfs_imap_to_bp(
int error;
error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
imap->im_len, XBF_UNMAPPED, bpp, &xfs_inode_buf_ops);
imap->im_len, 0, bpp, &xfs_inode_buf_ops);
if (xfs_metadata_is_sick(error))
xfs_agno_mark_sick(mp, xfs_daddr_to_agno(mp, imap->im_blkno),
XFS_SICK_AG_INODES);
@ -252,7 +252,10 @@ xfs_inode_from_disk(
be64_to_cpu(from->di_changecount));
ip->i_crtime = xfs_inode_from_disk_ts(from, from->di_crtime);
ip->i_diflags2 = be64_to_cpu(from->di_flags2);
/* also covers the di_used_blocks union arm: */
ip->i_cowextsize = be32_to_cpu(from->di_cowextsize);
BUILD_BUG_ON(sizeof(from->di_cowextsize) !=
sizeof(from->di_used_blocks));
}
error = xfs_iformat_data_fork(ip, from);
@ -349,6 +352,7 @@ xfs_inode_to_disk(
to->di_changecount = cpu_to_be64(inode_peek_iversion(inode));
to->di_crtime = xfs_inode_to_disk_ts(ip, ip->i_crtime);
to->di_flags2 = cpu_to_be64(ip->i_diflags2);
/* also covers the di_used_blocks union arm: */
to->di_cowextsize = cpu_to_be32(ip->i_cowextsize);
to->di_ino = cpu_to_be64(ip->i_ino);
to->di_lsn = cpu_to_be64(lsn);
@ -752,11 +756,18 @@ xfs_dinode_verify(
!xfs_has_rtreflink(mp))
return __this_address;
/* COW extent size hint validation */
fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize),
mode, flags, flags2);
if (fa)
return fa;
if (xfs_has_zoned(mp) &&
dip->di_metatype == cpu_to_be16(XFS_METAFILE_RTRMAP)) {
if (be32_to_cpu(dip->di_used_blocks) > mp->m_sb.sb_rgextents)
return __this_address;
} else {
/* COW extent size hint validation */
fa = xfs_inode_validate_cowextsize(mp,
be32_to_cpu(dip->di_cowextsize),
mode, flags, flags2);
if (fa)
return fa;
}
/* bigtime iflag can only happen on bigtime filesystems */
if (xfs_dinode_has_bigtime(dip) &&


@ -322,6 +322,7 @@ xfs_inode_init(
if (xfs_has_v3inodes(mp)) {
inode_set_iversion(inode, 1);
/* also covers the di_used_blocks union arm: */
ip->i_cowextsize = 0;
times |= XFS_ICHGTIME_CREATE;
}


@ -475,7 +475,12 @@ struct xfs_log_dinode {
xfs_lsn_t di_lsn;
uint64_t di_flags2; /* more random flags */
uint32_t di_cowextsize; /* basic cow extent size for file */
union {
/* basic cow extent size for (regular) file */
uint32_t di_cowextsize;
/* used blocks in RTG for (zoned) rtrmap inode */
uint32_t di_used_blocks;
};
uint8_t di_pad2[12]; /* more padding for future expansion */
/* fields only written to during inode creation */


@ -21,6 +21,9 @@
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_alloc.h"
#include "xfs_rtgroup.h"
#include "xfs_rtrmap_btree.h"
#include "xfs_rtrefcount_btree.h"
static const struct {
enum xfs_metafile_type mtype;
@ -74,12 +77,11 @@ xfs_metafile_clear_iflag(
}
/*
* Is the amount of space that could be allocated towards a given metadata
* file at or beneath a certain threshold?
* Is the metafile reservation at or beneath a certain threshold?
*/
static inline bool
xfs_metafile_resv_can_cover(
struct xfs_inode *ip,
struct xfs_mount *mp,
int64_t rhs)
{
/*
@ -88,43 +90,38 @@ xfs_metafile_resv_can_cover(
* global free block count. Take care of the first case to avoid
* touching the per-cpu counter.
*/
if (ip->i_delayed_blks >= rhs)
if (mp->m_metafile_resv_avail >= rhs)
return true;
/*
* There aren't enough blocks left in the inode's reservation, but it
* isn't critical unless there also isn't enough free space.
*/
return __percpu_counter_compare(&ip->i_mount->m_fdblocks,
rhs - ip->i_delayed_blks, 2048) >= 0;
return xfs_compare_freecounter(mp, XC_FREE_BLOCKS,
rhs - mp->m_metafile_resv_avail, 2048) >= 0;
}
/*
* Is this metadata file critically low on blocks? For now we'll define that
* as the number of blocks we can get our hands on being less than 10% of what
* we reserved or less than some arbitrary number (maximum btree height).
* Is the metafile reservation critically low on blocks? For now we'll define
* that as the number of blocks we can get our hands on being less than 10% of
* what we reserved or less than some arbitrary number (maximum btree height).
*/
bool
xfs_metafile_resv_critical(
struct xfs_inode *ip)
struct xfs_mount *mp)
{
uint64_t asked_low_water;
ASSERT(xfs_has_metadir(mp));
if (!ip)
return false;
trace_xfs_metafile_resv_critical(mp, 0);
ASSERT(xfs_is_metadir_inode(ip));
trace_xfs_metafile_resv_critical(ip, 0);
if (!xfs_metafile_resv_can_cover(ip, ip->i_mount->m_rtbtree_maxlevels))
if (!xfs_metafile_resv_can_cover(mp, mp->m_rtbtree_maxlevels))
return true;
asked_low_water = div_u64(ip->i_meta_resv_asked, 10);
if (!xfs_metafile_resv_can_cover(ip, asked_low_water))
if (!xfs_metafile_resv_can_cover(mp,
div_u64(mp->m_metafile_resv_target, 10)))
return true;
return XFS_TEST_ERROR(false, ip->i_mount,
XFS_ERRTAG_METAFILE_RESV_CRITICAL);
return XFS_TEST_ERROR(false, mp, XFS_ERRTAG_METAFILE_RESV_CRITICAL);
}
/* Allocate a block from the metadata file's reservation. */
@ -133,22 +130,24 @@ xfs_metafile_resv_alloc_space(
struct xfs_inode *ip,
struct xfs_alloc_arg *args)
{
struct xfs_mount *mp = ip->i_mount;
int64_t len = args->len;
ASSERT(xfs_is_metadir_inode(ip));
ASSERT(args->resv == XFS_AG_RESV_METAFILE);
trace_xfs_metafile_resv_alloc_space(ip, args->len);
trace_xfs_metafile_resv_alloc_space(mp, args->len);
/*
* Allocate the blocks from the metadata inode's block reservation
* and update the ondisk sb counter.
*/
if (ip->i_delayed_blks > 0) {
mutex_lock(&mp->m_metafile_resv_lock);
if (mp->m_metafile_resv_avail > 0) {
int64_t from_resv;
from_resv = min_t(int64_t, len, ip->i_delayed_blks);
ip->i_delayed_blks -= from_resv;
from_resv = min_t(int64_t, len, mp->m_metafile_resv_avail);
mp->m_metafile_resv_avail -= from_resv;
xfs_mod_delalloc(ip, 0, -from_resv);
xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_RES_FDBLOCKS,
-from_resv);
@ -175,6 +174,9 @@ xfs_metafile_resv_alloc_space(
xfs_trans_mod_sb(args->tp, field, -len);
}
mp->m_metafile_resv_used += args->len;
mutex_unlock(&mp->m_metafile_resv_lock);
ip->i_nblocks += args->len;
xfs_trans_log_inode(args->tp, ip, XFS_ILOG_CORE);
}
@ -186,26 +188,33 @@ xfs_metafile_resv_free_space(
struct xfs_trans *tp,
xfs_filblks_t len)
{
struct xfs_mount *mp = ip->i_mount;
int64_t to_resv;
ASSERT(xfs_is_metadir_inode(ip));
trace_xfs_metafile_resv_free_space(ip, len);
trace_xfs_metafile_resv_free_space(mp, len);
ip->i_nblocks -= len;
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
mutex_lock(&mp->m_metafile_resv_lock);
mp->m_metafile_resv_used -= len;
/*
* Add the freed blocks back into the inode's delalloc reservation
* until it reaches the maximum size. Update the ondisk fdblocks only.
*/
to_resv = ip->i_meta_resv_asked - (ip->i_nblocks + ip->i_delayed_blks);
to_resv = mp->m_metafile_resv_target -
(mp->m_metafile_resv_used + mp->m_metafile_resv_avail);
if (to_resv > 0) {
to_resv = min_t(int64_t, to_resv, len);
ip->i_delayed_blks += to_resv;
mp->m_metafile_resv_avail += to_resv;
xfs_mod_delalloc(ip, 0, to_resv);
xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FDBLOCKS, to_resv);
len -= to_resv;
}
mutex_unlock(&mp->m_metafile_resv_lock);
/*
* Everything else goes back to the filesystem, so update the in-core
@ -215,61 +224,99 @@ xfs_metafile_resv_free_space(
xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, len);
}
/* Release a metadata file's space reservation. */
void
xfs_metafile_resv_free(
struct xfs_inode *ip)
static void
__xfs_metafile_resv_free(
struct xfs_mount *mp)
{
/* Non-btree metadata inodes don't need space reservations. */
if (!ip || !ip->i_meta_resv_asked)
return;
ASSERT(xfs_is_metadir_inode(ip));
trace_xfs_metafile_resv_free(ip, 0);
if (ip->i_delayed_blks) {
xfs_mod_delalloc(ip, 0, -ip->i_delayed_blks);
xfs_add_fdblocks(ip->i_mount, ip->i_delayed_blks);
ip->i_delayed_blks = 0;
if (mp->m_metafile_resv_avail) {
xfs_mod_sb_delalloc(mp, -(int64_t)mp->m_metafile_resv_avail);
xfs_add_fdblocks(mp, mp->m_metafile_resv_avail);
}
ip->i_meta_resv_asked = 0;
mp->m_metafile_resv_avail = 0;
mp->m_metafile_resv_used = 0;
mp->m_metafile_resv_target = 0;
}
/* Set up a metadata file's space reservation. */
/* Release unused metafile space reservation. */
void
xfs_metafile_resv_free(
struct xfs_mount *mp)
{
if (!xfs_has_metadir(mp))
return;
trace_xfs_metafile_resv_free(mp, 0);
mutex_lock(&mp->m_metafile_resv_lock);
__xfs_metafile_resv_free(mp);
mutex_unlock(&mp->m_metafile_resv_lock);
}
/* Set up a metafile space reservation. */
int
xfs_metafile_resv_init(
struct xfs_inode *ip,
xfs_filblks_t ask)
struct xfs_mount *mp)
{
struct xfs_rtgroup *rtg = NULL;
xfs_filblks_t used = 0, target = 0;
xfs_filblks_t hidden_space;
xfs_filblks_t used;
int error;
xfs_rfsblock_t dblocks_avail = mp->m_sb.sb_dblocks / 4;
int error = 0;
if (!ip || ip->i_meta_resv_asked > 0)
if (!xfs_has_metadir(mp))
return 0;
ASSERT(xfs_is_metadir_inode(ip));
/*
* Free any previous reservation to have a clean slate.
*/
mutex_lock(&mp->m_metafile_resv_lock);
__xfs_metafile_resv_free(mp);
/*
* Space taken by all other metadata btrees are accounted on-disk as
* Currently the only btree metafiles that require reservations are the
* rtrmap and the rtrefcount. Anything new will have to be added here
* as well.
*/
while ((rtg = xfs_rtgroup_next(mp, rtg))) {
if (xfs_has_rtrmapbt(mp)) {
used += rtg_rmap(rtg)->i_nblocks;
target += xfs_rtrmapbt_calc_reserves(mp);
}
if (xfs_has_rtreflink(mp)) {
used += rtg_refcount(rtg)->i_nblocks;
target += xfs_rtrefcountbt_calc_reserves(mp);
}
}
if (!target)
goto out_unlock;
/*
* Space taken by the per-AG metadata btrees are accounted on-disk as
* used space. We therefore only hide the space that is reserved but
* not used by the trees.
*/
used = ip->i_nblocks;
if (used > ask)
ask = used;
hidden_space = ask - used;
if (used > target)
target = used;
else if (target > dblocks_avail)
target = dblocks_avail;
hidden_space = target - used;
error = xfs_dec_fdblocks(ip->i_mount, hidden_space, true);
error = xfs_dec_fdblocks(mp, hidden_space, true);
if (error) {
trace_xfs_metafile_resv_init_error(ip, error, _RET_IP_);
return error;
trace_xfs_metafile_resv_init_error(mp, 0);
goto out_unlock;
}
xfs_mod_delalloc(ip, 0, hidden_space);
ip->i_delayed_blks = hidden_space;
ip->i_meta_resv_asked = ask;
xfs_mod_sb_delalloc(mp, hidden_space);
trace_xfs_metafile_resv_init(ip, ask);
return 0;
mp->m_metafile_resv_target = target;
mp->m_metafile_resv_used = used;
mp->m_metafile_resv_avail = hidden_space;
trace_xfs_metafile_resv_init(mp, target);
out_unlock:
mutex_unlock(&mp->m_metafile_resv_lock);
return error;
}


@ -26,13 +26,13 @@ void xfs_metafile_clear_iflag(struct xfs_trans *tp, struct xfs_inode *ip);
/* Space reservations for metadata inodes. */
struct xfs_alloc_arg;
bool xfs_metafile_resv_critical(struct xfs_inode *ip);
bool xfs_metafile_resv_critical(struct xfs_mount *mp);
void xfs_metafile_resv_alloc_space(struct xfs_inode *ip,
struct xfs_alloc_arg *args);
void xfs_metafile_resv_free_space(struct xfs_inode *ip, struct xfs_trans *tp,
xfs_filblks_t len);
void xfs_metafile_resv_free(struct xfs_inode *ip);
int xfs_metafile_resv_init(struct xfs_inode *ip, xfs_filblks_t ask);
void xfs_metafile_resv_free(struct xfs_mount *mp);
int xfs_metafile_resv_init(struct xfs_mount *mp);
/* Code specific to kernel/userspace; must be provided externally. */


@ -233,8 +233,8 @@ xfs_check_ondisk_structs(void)
16299260424LL);
/* superblock field checks we got from xfs/122 */
XFS_CHECK_STRUCT_SIZE(struct xfs_dsb, 288);
XFS_CHECK_STRUCT_SIZE(struct xfs_sb, 288);
XFS_CHECK_STRUCT_SIZE(struct xfs_dsb, 304);
XFS_CHECK_STRUCT_SIZE(struct xfs_sb, 304);
XFS_CHECK_SB_OFFSET(sb_magicnum, 0);
XFS_CHECK_SB_OFFSET(sb_blocksize, 4);
XFS_CHECK_SB_OFFSET(sb_dblocks, 8);
@ -295,6 +295,8 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_SB_OFFSET(sb_rgextents, 276);
XFS_CHECK_SB_OFFSET(sb_rgblklog, 280);
XFS_CHECK_SB_OFFSET(sb_pad, 281);
XFS_CHECK_SB_OFFSET(sb_rtstart, 288);
XFS_CHECK_SB_OFFSET(sb_rtreserved, 296);
}
#endif /* __XFS_ONDISK_H */


@ -1123,6 +1123,7 @@ xfs_rtfree_blocks(
xfs_extlen_t mod;
int error;
ASSERT(!xfs_has_zoned(mp));
ASSERT(rtlen <= XFS_MAX_BMBT_EXTLEN);
mod = xfs_blen_to_rtxoff(mp, rtlen);
@ -1174,6 +1175,9 @@ xfs_rtalloc_query_range(
end = min(end, rtg->rtg_extents - 1);
if (xfs_has_zoned(mp))
return -EINVAL;
/* Iterate the bitmap, looking for discrepancies. */
while (start <= end) {
struct xfs_rtalloc_rec rec;
@ -1268,6 +1272,8 @@ xfs_rtbitmap_blockcount_len(
struct xfs_mount *mp,
xfs_rtbxlen_t rtextents)
{
if (xfs_has_zoned(mp))
return 0;
return howmany_64(rtextents, xfs_rtbitmap_rtx_per_rbmblock(mp));
}
@ -1308,6 +1314,11 @@ xfs_rtsummary_blockcount(
xfs_rtbxlen_t rextents = xfs_rtbitmap_bitcount(mp);
unsigned long long rsumwords;
if (xfs_has_zoned(mp)) {
*rsumlevels = 0;
return 0;
}
*rsumlevels = xfs_compute_rextslog(rextents) + 1;
rsumwords = xfs_rtbitmap_blockcount_len(mp, rextents) * (*rsumlevels);
return howmany_64(rsumwords, mp->m_blockwsize);


@ -194,15 +194,17 @@ xfs_rtgroup_lock(
ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) ||
!(rtglock_flags & XFS_RTGLOCK_BITMAP));
if (rtglock_flags & XFS_RTGLOCK_BITMAP) {
/*
* Lock both realtime free space metadata inodes for a freespace
* update.
*/
xfs_ilock(rtg_bitmap(rtg), XFS_ILOCK_EXCL);
xfs_ilock(rtg_summary(rtg), XFS_ILOCK_EXCL);
} else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) {
xfs_ilock(rtg_bitmap(rtg), XFS_ILOCK_SHARED);
if (!xfs_has_zoned(rtg_mount(rtg))) {
if (rtglock_flags & XFS_RTGLOCK_BITMAP) {
/*
* Lock both realtime free space metadata inodes for a
* freespace update.
*/
xfs_ilock(rtg_bitmap(rtg), XFS_ILOCK_EXCL);
xfs_ilock(rtg_summary(rtg), XFS_ILOCK_EXCL);
} else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) {
xfs_ilock(rtg_bitmap(rtg), XFS_ILOCK_SHARED);
}
}
if ((rtglock_flags & XFS_RTGLOCK_RMAP) && rtg_rmap(rtg))
@ -228,11 +230,13 @@ xfs_rtgroup_unlock(
if ((rtglock_flags & XFS_RTGLOCK_RMAP) && rtg_rmap(rtg))
xfs_iunlock(rtg_rmap(rtg), XFS_ILOCK_EXCL);
if (rtglock_flags & XFS_RTGLOCK_BITMAP) {
xfs_iunlock(rtg_summary(rtg), XFS_ILOCK_EXCL);
xfs_iunlock(rtg_bitmap(rtg), XFS_ILOCK_EXCL);
} else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) {
xfs_iunlock(rtg_bitmap(rtg), XFS_ILOCK_SHARED);
if (!xfs_has_zoned(rtg_mount(rtg))) {
if (rtglock_flags & XFS_RTGLOCK_BITMAP) {
xfs_iunlock(rtg_summary(rtg), XFS_ILOCK_EXCL);
xfs_iunlock(rtg_bitmap(rtg), XFS_ILOCK_EXCL);
} else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) {
xfs_iunlock(rtg_bitmap(rtg), XFS_ILOCK_SHARED);
}
}
}
@ -249,7 +253,8 @@ xfs_rtgroup_trans_join(
ASSERT(!(rtglock_flags & ~XFS_RTGLOCK_ALL_FLAGS));
ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED));
if (rtglock_flags & XFS_RTGLOCK_BITMAP) {
if (!xfs_has_zoned(rtg_mount(rtg)) &&
(rtglock_flags & XFS_RTGLOCK_BITMAP)) {
xfs_trans_ijoin(tp, rtg_bitmap(rtg), XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, rtg_summary(rtg), XFS_ILOCK_EXCL);
}
@ -270,7 +275,7 @@ xfs_rtgroup_get_geometry(
/* Fill out form. */
memset(rgeo, 0, sizeof(*rgeo));
rgeo->rg_number = rtg_rgno(rtg);
rgeo->rg_length = rtg_group(rtg)->xg_block_count;
rgeo->rg_length = rtg_blocks(rtg);
xfs_rtgroup_geom_health(rtg, rgeo);
return 0;
}
@ -354,6 +359,7 @@ static const struct xfs_rtginode_ops xfs_rtginode_ops[XFS_RTGI_MAX] = {
.sick = XFS_SICK_RG_BITMAP,
.fmt_mask = (1U << XFS_DINODE_FMT_EXTENTS) |
(1U << XFS_DINODE_FMT_BTREE),
.enabled = xfs_has_nonzoned,
.create = xfs_rtbitmap_create,
},
[XFS_RTGI_SUMMARY] = {
@ -362,6 +368,7 @@ static const struct xfs_rtginode_ops xfs_rtginode_ops[XFS_RTGI_MAX] = {
.sick = XFS_SICK_RG_SUMMARY,
.fmt_mask = (1U << XFS_DINODE_FMT_EXTENTS) |
(1U << XFS_DINODE_FMT_BTREE),
.enabled = xfs_has_nonzoned,
.create = xfs_rtsummary_create,
},
[XFS_RTGI_RMAP] = {


@ -37,15 +37,33 @@ struct xfs_rtgroup {
xfs_rtxnum_t rtg_extents;
/*
* Cache of rt summary level per bitmap block with the invariant that
* rtg_rsum_cache[bbno] > the maximum i for which rsum[i][bbno] != 0,
* or 0 if rsum[i][bbno] == 0 for all i.
*
* For bitmap based RT devices this points to a cache of rt summary
* level per bitmap block with the invariant that rtg_rsum_cache[bbno]
* > the maximum i for which rsum[i][bbno] != 0, or 0 if
* rsum[i][bbno] == 0 for all i.
* Reads and writes are serialized by the rsumip inode lock.
*
* For zoned RT devices this points to the open zone structure for
* a group that is open for writers, or is NULL.
*/
uint8_t *rtg_rsum_cache;
union {
uint8_t *rtg_rsum_cache;
struct xfs_open_zone *rtg_open_zone;
};
};
/*
* For zoned RT devices this is set on groups that have no written blocks
* and can be picked by the allocator for opening.
*/
#define XFS_RTG_FREE XA_MARK_0
/*
* For zoned RT devices this is set on groups that are fully written and that
* have unused blocks. Used by the garbage collection to pick targets.
*/
#define XFS_RTG_RECLAIMABLE XA_MARK_1
static inline struct xfs_rtgroup *to_rtg(struct xfs_group *xg)
{
return container_of(xg, struct xfs_rtgroup, rtg_group);
@ -66,6 +84,11 @@ static inline xfs_rgnumber_t rtg_rgno(const struct xfs_rtgroup *rtg)
return rtg->rtg_group.xg_gno;
}
static inline xfs_rgblock_t rtg_blocks(const struct xfs_rtgroup *rtg)
{
return rtg->rtg_group.xg_block_count;
}
static inline struct xfs_inode *rtg_bitmap(const struct xfs_rtgroup *rtg)
{
return rtg->rtg_inodes[XFS_RTGI_BITMAP];
@ -222,10 +245,14 @@ xfs_rtb_to_daddr(
xfs_rtblock_t rtbno)
{
struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG];
xfs_rgnumber_t rgno = xfs_rtb_to_rgno(mp, rtbno);
uint64_t start_bno = (xfs_rtblock_t)rgno * g->blocks;
return XFS_FSB_TO_BB(mp, start_bno + (rtbno & g->blkmask));
if (xfs_has_rtgroups(mp) && !g->has_daddr_gaps) {
xfs_rgnumber_t rgno = xfs_rtb_to_rgno(mp, rtbno);
rtbno = (xfs_rtblock_t)rgno * g->blocks + (rtbno & g->blkmask);
}
return XFS_FSB_TO_BB(mp, g->start_fsb + rtbno);
}
static inline xfs_rtblock_t
@ -233,10 +260,11 @@ xfs_daddr_to_rtb(
struct xfs_mount *mp,
xfs_daddr_t daddr)
{
xfs_rfsblock_t bno = XFS_BB_TO_FSBT(mp, daddr);
struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG];
xfs_rfsblock_t bno;
if (xfs_has_rtgroups(mp)) {
struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG];
bno = XFS_BB_TO_FSBT(mp, daddr) - g->start_fsb;
if (xfs_has_rtgroups(mp) && !g->has_daddr_gaps) {
xfs_rgnumber_t rgno;
uint32_t rgbno;


@ -1033,3 +1033,22 @@ xfs_rtrmapbt_init_rtsb(
xfs_btree_del_cursor(cur, error);
return error;
}
/*
* Return the highest rgbno currently tracked by the rmap for this rtg.
*/
xfs_rgblock_t
xfs_rtrmap_highest_rgbno(
struct xfs_rtgroup *rtg)
{
struct xfs_btree_block *block = rtg_rmap(rtg)->i_df.if_broot;
union xfs_btree_key key = {};
struct xfs_btree_cur *cur;
if (block->bb_numrecs == 0)
return NULLRGBLOCK;
cur = xfs_rtrmapbt_init_cursor(NULL, rtg);
xfs_btree_get_keys(cur, block, &key);
xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
return be32_to_cpu(key.__rmap_bigkey[1].rm_startblock);
}


@ -207,4 +207,6 @@ struct xfs_btree_cur *xfs_rtrmapbt_mem_cursor(struct xfs_rtgroup *rtg,
int xfs_rtrmapbt_mem_init(struct xfs_mount *mp, struct xfbtree *xfbtree,
struct xfs_buftarg *btp, xfs_rgnumber_t rgno);
xfs_rgblock_t xfs_rtrmap_highest_rgbno(struct xfs_rtgroup *rtg);
#endif /* __XFS_RTRMAP_BTREE_H__ */


@ -185,6 +185,8 @@ xfs_sb_version_to_features(
features |= XFS_FEAT_PARENT;
if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR)
features |= XFS_FEAT_METADIR;
if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED)
features |= XFS_FEAT_ZONED;
return features;
}
@ -266,6 +268,9 @@ static uint64_t
xfs_expected_rbmblocks(
struct xfs_sb *sbp)
{
if (xfs_sb_is_v5(sbp) &&
(sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED))
return 0;
return howmany_64(xfs_extents_per_rbm(sbp),
NBBY * xfs_rtbmblock_size(sbp));
}
@ -275,9 +280,15 @@ bool
xfs_validate_rt_geometry(
struct xfs_sb *sbp)
{
if (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE ||
sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE)
return false;
if (xfs_sb_is_v5(sbp) &&
(sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED)) {
if (sbp->sb_rextsize != 1)
return false;
} else {
if (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE ||
sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE)
return false;
}
if (sbp->sb_rblocks == 0) {
if (sbp->sb_rextents != 0 || sbp->sb_rbmblocks != 0 ||
@ -435,6 +446,34 @@ xfs_validate_sb_rtgroups(
return 0;
}
static int
xfs_validate_sb_zoned(
struct xfs_mount *mp,
struct xfs_sb *sbp)
{
if (sbp->sb_frextents != 0) {
xfs_warn(mp,
"sb_frextents must be zero for zoned file systems.");
return -EINVAL;
}
if (sbp->sb_rtstart && sbp->sb_rtstart < sbp->sb_dblocks) {
xfs_warn(mp,
"sb_rtstart (%lld) overlaps sb_dblocks (%lld).",
sbp->sb_rtstart, sbp->sb_dblocks);
return -EINVAL;
}
if (sbp->sb_rtreserved && sbp->sb_rtreserved >= sbp->sb_rblocks) {
xfs_warn(mp,
"sb_rtreserved (%lld) larger than sb_rblocks (%lld).",
sbp->sb_rtreserved, sbp->sb_rblocks);
return -EINVAL;
}
return 0;
}
/* Check the validity of the SB. */
STATIC int
xfs_validate_sb_common(
@ -523,6 +562,11 @@ xfs_validate_sb_common(
if (error)
return error;
}
if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED) {
error = xfs_validate_sb_zoned(mp, sbp);
if (error)
return error;
}
} else if (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD |
XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) {
xfs_notice(mp,
@ -835,6 +879,14 @@ __xfs_sb_from_disk(
to->sb_rgcount = 1;
to->sb_rgextents = 0;
}
if (to->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED) {
to->sb_rtstart = be64_to_cpu(from->sb_rtstart);
to->sb_rtreserved = be64_to_cpu(from->sb_rtreserved);
} else {
to->sb_rtstart = 0;
to->sb_rtreserved = 0;
}
}
void
@ -1001,6 +1053,11 @@ xfs_sb_to_disk(
to->sb_rbmino = cpu_to_be64(0);
to->sb_rsumino = cpu_to_be64(0);
}
if (from->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED) {
to->sb_rtstart = cpu_to_be64(from->sb_rtstart);
to->sb_rtreserved = cpu_to_be64(from->sb_rtreserved);
}
}
/*
@ -1146,6 +1203,10 @@ xfs_sb_mount_rextsize(
rgs->blocks = sbp->sb_rgextents * sbp->sb_rextsize;
rgs->blklog = mp->m_sb.sb_rgblklog;
rgs->blkmask = xfs_mask32lo(mp->m_sb.sb_rgblklog);
rgs->start_fsb = mp->m_sb.sb_rtstart;
if (xfs_sb_has_incompat_feature(sbp,
XFS_SB_FEAT_INCOMPAT_ZONE_GAPS))
rgs->has_daddr_gaps = true;
} else {
rgs->blocks = 0;
rgs->blklog = 0;
@ -1265,8 +1326,7 @@ xfs_log_sb(
mp->m_sb.sb_ifree = min_t(uint64_t,
percpu_counter_sum_positive(&mp->m_ifree),
mp->m_sb.sb_icount);
mp->m_sb.sb_fdblocks =
percpu_counter_sum_positive(&mp->m_fdblocks);
mp->m_sb.sb_fdblocks = xfs_sum_freecounter(mp, XC_FREE_BLOCKS);
}
/*
@ -1275,9 +1335,10 @@ xfs_log_sb(
* we handle nearly-lockless reservations, so we must use the _positive
* variant here to avoid writing out nonsense frextents.
*/
if (xfs_has_rtgroups(mp))
if (xfs_has_rtgroups(mp) && !xfs_has_zoned(mp)) {
mp->m_sb.sb_frextents =
percpu_counter_sum_positive(&mp->m_frextents);
xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS);
}
xfs_sb_to_disk(bp->b_addr, &mp->m_sb);
xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
@ -1510,6 +1571,8 @@ xfs_fs_geometry(
geo->flags |= XFS_FSOP_GEOM_FLAGS_EXCHANGE_RANGE;
if (xfs_has_metadir(mp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_METADIR;
if (xfs_has_zoned(mp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_ZONED;
geo->rtsectsize = sbp->sb_blocksize;
geo->dirblocksize = xfs_dir2_dirblock_bytes(sbp);
@ -1530,6 +1593,10 @@ xfs_fs_geometry(
geo->rgcount = sbp->sb_rgcount;
geo->rgextents = sbp->sb_rgextents;
}
if (xfs_has_zoned(mp)) {
geo->rtstart = sbp->sb_rtstart;
geo->rtreserved = sbp->sb_rtreserved;
}
}
/* Read a secondary superblock. */


@ -233,6 +233,34 @@ enum xfs_group_type {
{ XG_TYPE_AG, "ag" }, \
{ XG_TYPE_RTG, "rtg" }
enum xfs_free_counter {
/*
* Number of free blocks on the data device.
*/
XC_FREE_BLOCKS,
/*
* Number of free RT extents on the RT device.
*/
XC_FREE_RTEXTENTS,
/*
* Number of RT extents available for use.
*
* This counter only exists for zoned RT devices and indicates the number
* of RT extents that can be directly used by writes. XC_FREE_RTEXTENTS
* also includes blocks that have been written previously and freed, but
* sit in a rtgroup that still needs a zone reset.
*/
XC_FREE_RTAVAILABLE,
XC_FREE_NR,
};
#define XFS_FREECOUNTER_STR \
{ XC_FREE_BLOCKS, "blocks" }, \
{ XC_FREE_RTEXTENTS, "rtextents" }, \
{ XC_FREE_RTAVAILABLE, "rtavailable" }
/*
* Type verifier functions
*/

fs/xfs/libxfs/xfs_zones.c (new file, 186 lines)

@ -0,0 +1,186 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2023-2025 Christoph Hellwig.
* Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
*/
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_rtgroup.h"
#include "xfs_zones.h"
static bool
xfs_zone_validate_empty(
struct blk_zone *zone,
struct xfs_rtgroup *rtg,
xfs_rgblock_t *write_pointer)
{
struct xfs_mount *mp = rtg_mount(rtg);
if (rtg_rmap(rtg)->i_used_blocks > 0) {
xfs_warn(mp, "empty zone %u has non-zero used counter (0x%x).",
rtg_rgno(rtg), rtg_rmap(rtg)->i_used_blocks);
return false;
}
*write_pointer = 0;
return true;
}
static bool
xfs_zone_validate_wp(
struct blk_zone *zone,
struct xfs_rtgroup *rtg,
xfs_rgblock_t *write_pointer)
{
struct xfs_mount *mp = rtg_mount(rtg);
xfs_rtblock_t wp_fsb = xfs_daddr_to_rtb(mp, zone->wp);
if (rtg_rmap(rtg)->i_used_blocks > rtg->rtg_extents) {
xfs_warn(mp, "zone %u has too large used counter (0x%x).",
rtg_rgno(rtg), rtg_rmap(rtg)->i_used_blocks);
return false;
}
if (xfs_rtb_to_rgno(mp, wp_fsb) != rtg_rgno(rtg)) {
xfs_warn(mp, "zone %u write pointer (0x%llx) outside of zone.",
rtg_rgno(rtg), wp_fsb);
return false;
}
*write_pointer = xfs_rtb_to_rgbno(mp, wp_fsb);
if (*write_pointer >= rtg->rtg_extents) {
xfs_warn(mp, "zone %u has invalid write pointer (0x%x).",
rtg_rgno(rtg), *write_pointer);
return false;
}
return true;
}
static bool
xfs_zone_validate_full(
struct blk_zone *zone,
struct xfs_rtgroup *rtg,
xfs_rgblock_t *write_pointer)
{
struct xfs_mount *mp = rtg_mount(rtg);
if (rtg_rmap(rtg)->i_used_blocks > rtg->rtg_extents) {
xfs_warn(mp, "zone %u has too large used counter (0x%x).",
rtg_rgno(rtg), rtg_rmap(rtg)->i_used_blocks);
return false;
}
*write_pointer = rtg->rtg_extents;
return true;
}
static bool
xfs_zone_validate_seq(
struct blk_zone *zone,
struct xfs_rtgroup *rtg,
xfs_rgblock_t *write_pointer)
{
struct xfs_mount *mp = rtg_mount(rtg);
switch (zone->cond) {
case BLK_ZONE_COND_EMPTY:
return xfs_zone_validate_empty(zone, rtg, write_pointer);
case BLK_ZONE_COND_IMP_OPEN:
case BLK_ZONE_COND_EXP_OPEN:
case BLK_ZONE_COND_CLOSED:
return xfs_zone_validate_wp(zone, rtg, write_pointer);
case BLK_ZONE_COND_FULL:
return xfs_zone_validate_full(zone, rtg, write_pointer);
case BLK_ZONE_COND_NOT_WP:
case BLK_ZONE_COND_OFFLINE:
case BLK_ZONE_COND_READONLY:
xfs_warn(mp, "zone %u has unsupported zone condition 0x%x.",
rtg_rgno(rtg), zone->cond);
return false;
default:
xfs_warn(mp, "zone %u has unknown zone condition 0x%x.",
rtg_rgno(rtg), zone->cond);
return false;
}
}
static bool
xfs_zone_validate_conv(
struct blk_zone *zone,
struct xfs_rtgroup *rtg)
{
struct xfs_mount *mp = rtg_mount(rtg);
switch (zone->cond) {
case BLK_ZONE_COND_NOT_WP:
return true;
default:
xfs_warn(mp,
"conventional zone %u has unsupported zone condition 0x%x.",
rtg_rgno(rtg), zone->cond);
return false;
}
}
bool
xfs_zone_validate(
struct blk_zone *zone,
struct xfs_rtgroup *rtg,
xfs_rgblock_t *write_pointer)
{
struct xfs_mount *mp = rtg_mount(rtg);
struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG];
uint32_t expected_size;
/*
* Check that the zone capacity matches the rtgroup size stored in the
* superblock. Note that all zones including the last one must have a
* uniform capacity.
*/
if (XFS_BB_TO_FSB(mp, zone->capacity) != g->blocks) {
xfs_warn(mp,
"zone %u capacity (0x%llx) does not match RT group size (0x%x).",
rtg_rgno(rtg), XFS_BB_TO_FSB(mp, zone->capacity),
g->blocks);
return false;
}
if (g->has_daddr_gaps) {
expected_size = 1 << g->blklog;
} else {
if (zone->len != zone->capacity) {
xfs_warn(mp,
"zone %u has capacity != size (0x%llx vs 0x%llx)",
rtg_rgno(rtg),
XFS_BB_TO_FSB(mp, zone->len),
XFS_BB_TO_FSB(mp, zone->capacity));
return false;
}
expected_size = g->blocks;
}
if (XFS_BB_TO_FSB(mp, zone->len) != expected_size) {
xfs_warn(mp,
"zone %u length (0x%llx) does not match geometry (0x%x).",
rtg_rgno(rtg), XFS_BB_TO_FSB(mp, zone->len),
expected_size);
}
switch (zone->type) {
case BLK_ZONE_TYPE_CONVENTIONAL:
return xfs_zone_validate_conv(zone, rtg);
case BLK_ZONE_TYPE_SEQWRITE_REQ:
return xfs_zone_validate_seq(zone, rtg, write_pointer);
default:
xfs_warn(mp, "zone %u has unsupported type 0x%x.",
rtg_rgno(rtg), zone->type);
return false;
}
}

fs/xfs/libxfs/xfs_zones.h (new file, 35 lines)

@ -0,0 +1,35 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LIBXFS_ZONES_H
#define _LIBXFS_ZONES_H
struct xfs_rtgroup;
/*
* In order to guarantee forward progress for GC we need to reserve at least
* two zones: one that will be used for moving data into and one spare zone
* making sure that we have enough space to relocate a nearly-full zone.
* To allow for slightly sloppy accounting for when we need to reserve the
* second zone, we actually reserve three as that is easier than doing fully
* accurate bookkeeping.
*/
#define XFS_GC_ZONES 3U
/*
* In addition we need two zones for user writes, one open zone for writing
* and one to still have available blocks without resetting the open zone
* when data in the open zone has been freed.
*/
#define XFS_RESERVED_ZONES (XFS_GC_ZONES + 1)
#define XFS_MIN_ZONES (XFS_RESERVED_ZONES + 1)
/*
* Always keep one zone out of the general open zone pool to allow for GC to
* happen while other writers are waiting for free space.
*/
#define XFS_OPEN_GC_ZONES 1U
#define XFS_MIN_OPEN_ZONES (XFS_OPEN_GC_ZONES + 1U)
bool xfs_zone_validate(struct blk_zone *zone, struct xfs_rtgroup *rtg,
xfs_rgblock_t *write_pointer);
#endif /* _LIBXFS_ZONES_H */

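The reservation reasoning in the comments above works out to a five-zone minimum with at least two open zones. As a standalone sanity check (reference arithmetic only, restating the macro values; not code from this commit):

/* Reference arithmetic only, not part of this commit. */
#include <assert.h>

#define XFS_GC_ZONES		3U
#define XFS_RESERVED_ZONES	(XFS_GC_ZONES + 1)
#define XFS_MIN_ZONES		(XFS_RESERVED_ZONES + 1)
#define XFS_OPEN_GC_ZONES	1U
#define XFS_MIN_OPEN_ZONES	(XFS_OPEN_GC_ZONES + 1U)

int main(void)
{
	/* 3 GC zones + 1 open data zone + 1 spare data zone = 5 minimum */
	static_assert(XFS_MIN_ZONES == 5, "minimum zone count");
	/* one open zone held back for GC plus at least one for user writes */
	static_assert(XFS_MIN_OPEN_ZONES == 2, "minimum open zone count");
	return 0;
}
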

@ -69,6 +69,8 @@ STATIC size_t
xchk_superblock_ondisk_size(
struct xfs_mount *mp)
{
if (xfs_has_zoned(mp))
return offsetofend(struct xfs_dsb, sb_rtreserved);
if (xfs_has_metadir(mp))
return offsetofend(struct xfs_dsb, sb_pad);
if (xfs_has_metauuid(mp))


@ -1038,8 +1038,8 @@ xchk_bmap(
switch (whichfork) {
case XFS_COW_FORK:
/* No CoW forks on non-reflink filesystems. */
if (!xfs_has_reflink(mp)) {
/* No CoW forks on filesystems that don't support out of place writes. */
if (!xfs_has_reflink(mp) && !xfs_has_zoned(mp)) {
xchk_ino_set_corrupt(sc, sc->ip->i_ino);
return 0;
}


@ -350,7 +350,7 @@ retry:
* The global incore space reservation is taken from the incore
* counters, so leave that out of the computation.
*/
fsc->fdblocks -= mp->m_resblks_avail;
fsc->fdblocks -= mp->m_free[XC_FREE_BLOCKS].res_avail;
/*
* Delayed allocation reservations are taken out of the incore counters
@ -413,7 +413,13 @@ xchk_fscount_count_frextents(
fsc->frextents = 0;
fsc->frextents_delayed = 0;
if (!xfs_has_realtime(mp))
/*
* Don't bother verifying and repairing the fs counters for zoned file
* systems as they don't track an on-disk frextents count, and the
* in-memory percpu counter also includes reservations.
*/
if (!xfs_has_realtime(mp) || xfs_has_zoned(mp))
return 0;
while ((rtg = xfs_rtgroup_next(mp, rtg))) {
@ -513,8 +519,8 @@ xchk_fscounters(
/* Snapshot the percpu counters. */
icount = percpu_counter_sum(&mp->m_icount);
ifree = percpu_counter_sum(&mp->m_ifree);
fdblocks = percpu_counter_sum(&mp->m_fdblocks);
frextents = percpu_counter_sum(&mp->m_frextents);
fdblocks = xfs_sum_freecounter_raw(mp, XC_FREE_BLOCKS);
frextents = xfs_sum_freecounter_raw(mp, XC_FREE_RTEXTENTS);
/* No negative values, please! */
if (icount < 0 || ifree < 0)
@ -589,15 +595,17 @@ xchk_fscounters(
try_again = true;
}
if (!xchk_fscount_within_range(sc, fdblocks, &mp->m_fdblocks,
fsc->fdblocks)) {
if (!xchk_fscount_within_range(sc, fdblocks,
&mp->m_free[XC_FREE_BLOCKS].count, fsc->fdblocks)) {
if (fsc->frozen)
xchk_set_corrupt(sc);
else
try_again = true;
}
if (!xchk_fscount_within_range(sc, frextents, &mp->m_frextents,
if (!xfs_has_zoned(mp) &&
!xchk_fscount_within_range(sc, frextents,
&mp->m_free[XC_FREE_RTEXTENTS].count,
fsc->frextents - fsc->frextents_delayed)) {
if (fsc->frozen)
xchk_set_corrupt(sc);


@ -64,7 +64,7 @@ xrep_fscounters(
percpu_counter_set(&mp->m_icount, fsc->icount);
percpu_counter_set(&mp->m_ifree, fsc->ifree);
percpu_counter_set(&mp->m_fdblocks, fsc->fdblocks);
xfs_set_freecounter(mp, XC_FREE_BLOCKS, fsc->fdblocks);
/*
* Online repair is only supported on v5 file systems, which require
@ -74,10 +74,12 @@ xrep_fscounters(
* track of the delalloc reservations separately, as they are
* subtracted from m_frextents, but not included in sb_frextents.
*/
percpu_counter_set(&mp->m_frextents,
fsc->frextents - fsc->frextents_delayed);
if (!xfs_has_rtgroups(mp))
mp->m_sb.sb_frextents = fsc->frextents;
if (!xfs_has_zoned(mp)) {
xfs_set_freecounter(mp, XC_FREE_RTEXTENTS,
fsc->frextents - fsc->frextents_delayed);
if (!xfs_has_rtgroups(mp))
mp->m_sb.sb_frextents = fsc->frextents;
}
return 0;
}


@ -273,6 +273,13 @@ xchk_inode_cowextsize(
xfs_failaddr_t fa;
uint32_t value = be32_to_cpu(dip->di_cowextsize);
/*
* The used block counter for rtrmap is checked and repaired elsewhere.
*/
if (xfs_has_zoned(sc->mp) &&
dip->di_metatype == cpu_to_be16(XFS_METAFILE_RTRMAP))
return;
fa = xfs_inode_validate_cowextsize(sc->mp, value, mode, flags, flags2);
if (fa)
xchk_ino_set_corrupt(sc, ino);


@ -710,7 +710,9 @@ xrep_dinode_extsize_hints(
XFS_DIFLAG_EXTSZINHERIT);
}
if (dip->di_version < 3)
if (dip->di_version < 3 ||
(xfs_has_zoned(sc->mp) &&
dip->di_metatype == cpu_to_be16(XFS_METAFILE_RTRMAP)))
return;
fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize),
@ -1558,8 +1560,7 @@ xrep_dinode_core(
/* Read the inode cluster buffer. */
error = xfs_trans_read_buf(sc->mp, sc->tp, sc->mp->m_ddev_targp,
ri->imap.im_blkno, ri->imap.im_len, XBF_UNMAPPED, &bp,
NULL);
ri->imap.im_blkno, ri->imap.im_len, 0, &bp, NULL);
if (error)
return error;


@ -62,7 +62,7 @@ xrep_newbt_estimate_slack(
free = sc->sa.pag->pagf_freeblks;
sz = xfs_ag_block_count(sc->mp, pag_agno(sc->sa.pag));
} else {
free = percpu_counter_sum(&sc->mp->m_fdblocks);
free = xfs_sum_freecounter_raw(sc->mp, XC_FREE_BLOCKS);
sz = sc->mp->m_sb.sb_dblocks;
}


@ -935,10 +935,13 @@ xrep_reap_metadir_fsblocks(
if (error)
return error;
if (xreap_dirty(&rs))
return xrep_defer_finish(sc);
if (xreap_dirty(&rs)) {
error = xrep_defer_finish(sc);
if (error)
return error;
}
return 0;
return xrep_reset_metafile_resv(sc);
}
/*


@ -43,6 +43,7 @@
#include "xfs_rtalloc.h"
#include "xfs_metafile.h"
#include "xfs_rtrefcount_btree.h"
#include "xfs_zone_alloc.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@ -1050,7 +1051,13 @@ xrep_require_rtext_inuse(
xfs_rtxnum_t startrtx;
xfs_rtxnum_t endrtx;
bool is_free = false;
int error;
int error = 0;
if (xfs_has_zoned(mp)) {
if (!xfs_zone_rgbno_is_valid(sc->sr.rtg, rgbno + len - 1))
return -EFSCORRUPTED;
return 0;
}
startrtx = xfs_rgbno_to_rtx(mp, rgbno);
endrtx = xfs_rgbno_to_rtx(mp, rgbno + len - 1);
@ -1386,11 +1393,12 @@ int
xrep_reset_metafile_resv(
struct xfs_scrub *sc)
{
struct xfs_inode *ip = sc->ip;
struct xfs_mount *mp = sc->mp;
int64_t delta;
int error;
delta = ip->i_nblocks + ip->i_delayed_blks - ip->i_meta_resv_asked;
delta = mp->m_metafile_resv_used + mp->m_metafile_resv_avail -
mp->m_metafile_resv_target;
if (delta == 0)
return 0;
@ -1401,11 +1409,11 @@ xrep_reset_metafile_resv(
if (delta > 0) {
int64_t give_back;
give_back = min_t(uint64_t, delta, ip->i_delayed_blks);
give_back = min_t(uint64_t, delta, mp->m_metafile_resv_avail);
if (give_back > 0) {
xfs_mod_delalloc(ip, 0, -give_back);
xfs_add_fdblocks(ip->i_mount, give_back);
ip->i_delayed_blks -= give_back;
xfs_mod_sb_delalloc(mp, -give_back);
xfs_add_fdblocks(mp, give_back);
mp->m_metafile_resv_avail -= give_back;
}
return 0;
@ -1413,24 +1421,23 @@ xrep_reset_metafile_resv(
/*
* Not enough reservation; try to take some blocks from the filesystem
* to the metadata inode. @delta is negative here, so invert the sign.
* to the metabtree reservation.
*/
delta = -delta;
error = xfs_dec_fdblocks(sc->mp, delta, true);
delta = -delta; /* delta is negative here, so invert the sign. */
error = xfs_dec_fdblocks(mp, delta, true);
while (error == -ENOSPC) {
delta--;
if (delta == 0) {
xfs_warn(sc->mp,
"Insufficient free space to reset space reservation for inode 0x%llx after repair.",
ip->i_ino);
"Insufficient free space to reset metabtree reservation after repair.");
return 0;
}
error = xfs_dec_fdblocks(sc->mp, delta, true);
error = xfs_dec_fdblocks(mp, delta, true);
}
if (error)
return error;
xfs_mod_delalloc(ip, 0, delta);
ip->i_delayed_blks += delta;
xfs_mod_sb_delalloc(mp, delta);
mp->m_metafile_resv_avail += delta;
return 0;
}


@ -21,6 +21,7 @@
#include "xfs_rmap.h"
#include "xfs_rtrmap_btree.h"
#include "xfs_exchmaps.h"
#include "xfs_zone_alloc.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/repair.h"
@ -272,7 +273,6 @@ xchk_xref_is_used_rt_space(
xfs_extlen_t len)
{
struct xfs_rtgroup *rtg = sc->sr.rtg;
struct xfs_inode *rbmip = rtg_bitmap(rtg);
xfs_rtxnum_t startext;
xfs_rtxnum_t endext;
bool is_free;
@ -281,6 +281,13 @@ xchk_xref_is_used_rt_space(
if (xchk_skip_xref(sc->sm))
return;
if (xfs_has_zoned(sc->mp)) {
if (!xfs_zone_rgbno_is_valid(rtg,
xfs_rtb_to_rgbno(sc->mp, rtbno) + len - 1))
xchk_ino_xref_set_corrupt(sc, rtg_rmap(rtg)->i_ino);
return;
}
startext = xfs_rtb_to_rtx(sc->mp, rtbno);
endext = xfs_rtb_to_rtx(sc->mp, rtbno + len - 1);
error = xfs_rtalloc_extent_is_free(rtg, sc->tp, startext,
@ -288,5 +295,5 @@ xchk_xref_is_used_rt_space(
if (!xchk_should_check_xref(sc, &error, NULL))
return;
if (is_free)
xchk_ino_xref_set_corrupt(sc, rbmip->i_ino);
xchk_ino_xref_set_corrupt(sc, rtg_bitmap(rtg)->i_ino);
}

View file

@ -697,32 +697,6 @@ err_cur:
return error;
}
/*
* Now that we've logged the roots of the new btrees, invalidate all of the
* old blocks and free them.
*/
STATIC int
xrep_rtrefc_remove_old_tree(
struct xrep_rtrefc *rr)
{
int error;
/*
* Free all the extents that were allocated to the former rtrefcountbt
* and aren't cross-linked with something else.
*/
error = xrep_reap_metadir_fsblocks(rr->sc,
&rr->old_rtrefcountbt_blocks);
if (error)
return error;
/*
* Ensure the proper reservation for the rtrefcount inode so that we
* don't fail to expand the btree.
*/
return xrep_reset_metafile_resv(rr->sc);
}
/* Rebuild the rt refcount btree. */
int
xrep_rtrefcountbt(
@ -769,8 +743,12 @@ xrep_rtrefcountbt(
if (error)
goto out_bitmap;
/* Kill the old tree. */
error = xrep_rtrefc_remove_old_tree(rr);
/*
* Free all the extents that were allocated to the former rtrefcountbt
* and aren't cross-linked with something else.
*/
error = xrep_reap_metadir_fsblocks(rr->sc,
&rr->old_rtrefcountbt_blocks);
if (error)
goto out_bitmap;

View file

@ -810,28 +810,6 @@ err_cur:
/* Reaping the old btree. */
/* Reap the old rtrmapbt blocks. */
STATIC int
xrep_rtrmap_remove_old_tree(
struct xrep_rtrmap *rr)
{
int error;
/*
* Free all the extents that were allocated to the former rtrmapbt and
* aren't cross-linked with something else.
*/
error = xrep_reap_metadir_fsblocks(rr->sc, &rr->old_rtrmapbt_blocks);
if (error)
return error;
/*
* Ensure the proper reservation for the rtrmap inode so that we don't
* fail to expand the new btree.
*/
return xrep_reset_metafile_resv(rr->sc);
}
static inline bool
xrep_rtrmapbt_want_live_update(
struct xchk_iscan *iscan,
@ -995,8 +973,11 @@ xrep_rtrmapbt(
if (error)
goto out_records;
/* Kill the old tree. */
error = xrep_rtrmap_remove_old_tree(rr);
/*
* Free all the extents that were allocated to the former rtrmapbt and
* aren't cross-linked with something else.
*/
error = xrep_reap_metadir_fsblocks(rr->sc, &rr->old_rtrmapbt_blocks);
if (error)
goto out_records;

View file

@ -399,12 +399,14 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
},
[XFS_SCRUB_TYPE_RTBITMAP] = { /* realtime bitmap */
.type = ST_RTGROUP,
.has = xfs_has_nonzoned,
.setup = xchk_setup_rtbitmap,
.scrub = xchk_rtbitmap,
.repair = xrep_rtbitmap,
},
[XFS_SCRUB_TYPE_RTSUM] = { /* realtime summary */
.type = ST_RTGROUP,
.has = xfs_has_nonzoned,
.setup = xchk_setup_rtsummary,
.scrub = xchk_rtsummary,
.repair = xrep_rtsummary,

View file

@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2000-2005 Silicon Graphics, Inc.
* Copyright (c) 2016-2018 Christoph Hellwig.
* Copyright (c) 2016-2025 Christoph Hellwig.
* All Rights Reserved.
*/
#include "xfs.h"
@ -20,6 +20,8 @@
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_icache.h"
#include "xfs_zone_alloc.h"
#include "xfs_rtgroup.h"
struct xfs_writepage_ctx {
struct iomap_writepage_ctx ctx;
@ -77,6 +79,26 @@ xfs_setfilesize(
return xfs_trans_commit(tp);
}
static void
xfs_ioend_put_open_zones(
struct iomap_ioend *ioend)
{
struct iomap_ioend *tmp;
/*
* Put the open zone for all ioends merged into this one (if any).
*/
list_for_each_entry(tmp, &ioend->io_list, io_list)
xfs_open_zone_put(tmp->io_private);
/*
* The main ioend might not have an open zone if the submission failed
* before xfs_zone_alloc_and_submit got called.
*/
if (ioend->io_private)
xfs_open_zone_put(ioend->io_private);
}
/*
* IO write completion.
*/
@ -86,6 +108,7 @@ xfs_end_ioend(
{
struct xfs_inode *ip = XFS_I(ioend->io_inode);
struct xfs_mount *mp = ip->i_mount;
bool is_zoned = xfs_is_zoned_inode(ip);
xfs_off_t offset = ioend->io_offset;
size_t size = ioend->io_size;
unsigned int nofs_flag;
@ -116,9 +139,10 @@ xfs_end_ioend(
error = blk_status_to_errno(ioend->io_bio.bi_status);
if (unlikely(error)) {
if (ioend->io_flags & IOMAP_IOEND_SHARED) {
ASSERT(!is_zoned);
xfs_reflink_cancel_cow_range(ip, offset, size, true);
xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, offset,
offset + size);
offset + size, NULL);
}
goto done;
}
@ -126,14 +150,21 @@ xfs_end_ioend(
/*
* Success: commit the COW or unwritten blocks if needed.
*/
if (ioend->io_flags & IOMAP_IOEND_SHARED)
if (is_zoned)
error = xfs_zoned_end_io(ip, offset, size, ioend->io_sector,
ioend->io_private, NULLFSBLOCK);
else if (ioend->io_flags & IOMAP_IOEND_SHARED)
error = xfs_reflink_end_cow(ip, offset, size);
else if (ioend->io_flags & IOMAP_IOEND_UNWRITTEN)
error = xfs_iomap_write_unwritten(ip, offset, size, false);
if (!error && xfs_ioend_is_append(ioend))
if (!error &&
!(ioend->io_flags & IOMAP_IOEND_DIRECT) &&
xfs_ioend_is_append(ioend))
error = xfs_setfilesize(ip, offset, size);
done:
if (is_zoned)
xfs_ioend_put_open_zones(ioend);
iomap_finish_ioends(ioend, error);
memalloc_nofs_restore(nofs_flag);
}
@ -176,17 +207,27 @@ xfs_end_io(
}
}
STATIC void
void
xfs_end_bio(
struct bio *bio)
{
struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);
struct xfs_inode *ip = XFS_I(ioend->io_inode);
struct xfs_mount *mp = ip->i_mount;
unsigned long flags;
/*
* For zone appends, record the actually written block number and set the
* boundary flag if needed.
*/
if (IS_ENABLED(CONFIG_XFS_RT) && bio_is_zone_append(bio)) {
ioend->io_sector = bio->bi_iter.bi_sector;
xfs_mark_rtg_boundary(ioend);
}
spin_lock_irqsave(&ip->i_ioend_lock, flags);
if (list_empty(&ip->i_ioend_list))
WARN_ON_ONCE(!queue_work(ip->i_mount->m_unwritten_workqueue,
WARN_ON_ONCE(!queue_work(mp->m_unwritten_workqueue,
&ip->i_ioend_work));
list_add_tail(&ioend->io_list, &ip->i_ioend_list);
spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
@ -463,7 +504,7 @@ xfs_discard_folio(
* folio itself and not the start offset that is passed in.
*/
xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, pos,
folio_pos(folio) + folio_size(folio));
folio_pos(folio) + folio_size(folio), NULL);
}
static const struct iomap_writeback_ops xfs_writeback_ops = {
@ -472,15 +513,125 @@ static const struct iomap_writeback_ops xfs_writeback_ops = {
.discard_folio = xfs_discard_folio,
};
struct xfs_zoned_writepage_ctx {
struct iomap_writepage_ctx ctx;
struct xfs_open_zone *open_zone;
};
static inline struct xfs_zoned_writepage_ctx *
XFS_ZWPC(struct iomap_writepage_ctx *ctx)
{
return container_of(ctx, struct xfs_zoned_writepage_ctx, ctx);
}
static int
xfs_zoned_map_blocks(
struct iomap_writepage_ctx *wpc,
struct inode *inode,
loff_t offset,
unsigned int len)
{
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + len);
xfs_filblks_t count_fsb;
struct xfs_bmbt_irec imap, del;
struct xfs_iext_cursor icur;
if (xfs_is_shutdown(mp))
return -EIO;
XFS_ERRORTAG_DELAY(mp, XFS_ERRTAG_WB_DELAY_MS);
/*
* All dirty data must be covered by delalloc extents. But truncate can
* remove delalloc extents underneath us or reduce their size.
* Returning a hole tells iomap to not write back any data from this
* range, which is the right thing to do in that case.
*
* Otherwise just tell iomap to treat ranges previously covered by a
* delalloc extent as mapped. The actual block allocation will be done
* just before submitting the bio.
*
* This implies we never map outside folios that are locked or marked
* as under writeback, and thus there is no need to check the fork sequence
* count here.
*/
xfs_ilock(ip, XFS_ILOCK_EXCL);
if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap))
imap.br_startoff = end_fsb; /* fake a hole past EOF */
if (imap.br_startoff > offset_fsb) {
imap.br_blockcount = imap.br_startoff - offset_fsb;
imap.br_startoff = offset_fsb;
imap.br_startblock = HOLESTARTBLOCK;
imap.br_state = XFS_EXT_NORM;
xfs_iunlock(ip, XFS_ILOCK_EXCL);
xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0, 0);
return 0;
}
end_fsb = min(end_fsb, imap.br_startoff + imap.br_blockcount);
count_fsb = end_fsb - offset_fsb;
del = imap;
xfs_trim_extent(&del, offset_fsb, count_fsb);
xfs_bmap_del_extent_delay(ip, XFS_COW_FORK, &icur, &imap, &del,
XFS_BMAPI_REMAP);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
wpc->iomap.type = IOMAP_MAPPED;
wpc->iomap.flags = IOMAP_F_DIRTY;
wpc->iomap.bdev = mp->m_rtdev_targp->bt_bdev;
wpc->iomap.offset = offset;
wpc->iomap.length = XFS_FSB_TO_B(mp, count_fsb);
wpc->iomap.flags = IOMAP_F_ANON_WRITE;
trace_xfs_zoned_map_blocks(ip, offset, wpc->iomap.length);
return 0;
}
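
The mapping helper above never allocates; it either reports the delalloc range as mapped or synthesizes a hole. A minimal user-space sketch of that interval logic follows, assuming a simplified extent record; the names are illustrative, not the kernel types.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Simplified extent record, loosely modelled on struct xfs_bmbt_irec. */
struct irec {
	uint64_t startoff;	/* file offset, in fs blocks */
	uint64_t blockcount;
};

/*
 * If no delalloc extent covers offset_fsb, report a hole from offset_fsb up
 * to the start of the next extent (or to end_fsb when there is none), which
 * tells the writeback code not to write anything for that range.
 */
static bool map_or_hole(const struct irec *found, uint64_t offset_fsb,
		uint64_t end_fsb, struct irec *out)
{
	if (!found || found->startoff >= end_fsb) {
		out->startoff = offset_fsb;
		out->blockcount = end_fsb - offset_fsb;
		return false;		/* hole: nothing to write back */
	}
	if (found->startoff > offset_fsb) {
		out->startoff = offset_fsb;
		out->blockcount = found->startoff - offset_fsb;
		return false;		/* leading hole before the extent */
	}
	*out = *found;
	return true;			/* mapped (delalloc) range */
}

int main(void)
{
	struct irec got = { .startoff = 8, .blockcount = 4 }, out;
	bool mapped = map_or_hole(&got, 4, 16, &out);

	printf("mapped=%d startoff=%llu count=%llu\n", mapped,
			(unsigned long long)out.startoff,
			(unsigned long long)out.blockcount);
	return 0;
}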
static int
xfs_zoned_submit_ioend(
struct iomap_writepage_ctx *wpc,
int status)
{
wpc->ioend->io_bio.bi_end_io = xfs_end_bio;
if (status)
return status;
xfs_zone_alloc_and_submit(wpc->ioend, &XFS_ZWPC(wpc)->open_zone);
return 0;
}
static const struct iomap_writeback_ops xfs_zoned_writeback_ops = {
.map_blocks = xfs_zoned_map_blocks,
.submit_ioend = xfs_zoned_submit_ioend,
.discard_folio = xfs_discard_folio,
};
STATIC int
xfs_vm_writepages(
struct address_space *mapping,
struct writeback_control *wbc)
{
struct xfs_writepage_ctx wpc = { };
struct xfs_inode *ip = XFS_I(mapping->host);
xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops);
xfs_iflags_clear(ip, XFS_ITRUNCATED);
if (xfs_is_zoned_inode(ip)) {
struct xfs_zoned_writepage_ctx xc = { };
int error;
error = iomap_writepages(mapping, wbc, &xc.ctx,
&xfs_zoned_writeback_ops);
if (xc.open_zone)
xfs_open_zone_put(xc.open_zone);
return error;
} else {
struct xfs_writepage_ctx wpc = { };
return iomap_writepages(mapping, wbc, &wpc.ctx,
&xfs_writeback_ops);
}
}
STATIC int

View file

@ -9,6 +9,7 @@
extern const struct address_space_operations xfs_address_space_operations;
extern const struct address_space_operations xfs_dax_aops;
int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size);
void xfs_end_bio(struct bio *bio);
#endif /* __XFS_AOPS_H__ */

View file

@ -30,6 +30,7 @@
#include "xfs_reflink.h"
#include "xfs_rtbitmap.h"
#include "xfs_rtgroup.h"
#include "xfs_zone_alloc.h"
/* Kernel only BMAP related definitions and functions */
@ -436,7 +437,8 @@ xfs_bmap_punch_delalloc_range(
struct xfs_inode *ip,
int whichfork,
xfs_off_t start_byte,
xfs_off_t end_byte)
xfs_off_t end_byte,
struct xfs_zone_alloc_ctx *ac)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
@ -467,7 +469,21 @@ xfs_bmap_punch_delalloc_range(
continue;
}
xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, &del);
if (xfs_is_zoned_inode(ip) && ac) {
/*
* In a zoned buffered write context we need to return
* the punched delalloc allocations to the allocation
* context. This allows reusing them in the following
* iomap iterations.
*/
xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got,
&del, XFS_BMAPI_REMAP);
ac->reserved_blocks += del.br_blockcount;
} else {
xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got,
&del, 0);
}
if (!xfs_iext_get_extent(ifp, &icur, &got))
break;
}
@ -582,7 +598,7 @@ xfs_free_eofblocks(
if (ip->i_delayed_blks) {
xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK,
round_up(XFS_ISIZE(ip), mp->m_sb.sb_blocksize),
LLONG_MAX);
LLONG_MAX, NULL);
}
xfs_inode_clear_eofblocks_tag(ip);
return 0;
@ -825,7 +841,8 @@ int
xfs_free_file_space(
struct xfs_inode *ip,
xfs_off_t offset,
xfs_off_t len)
xfs_off_t len,
struct xfs_zone_alloc_ctx *ac)
{
struct xfs_mount *mp = ip->i_mount;
xfs_fileoff_t startoffset_fsb;
@ -880,7 +897,7 @@ xfs_free_file_space(
return 0;
if (offset + len > XFS_ISIZE(ip))
len = XFS_ISIZE(ip) - offset;
error = xfs_zero_range(ip, offset, len, NULL);
error = xfs_zero_range(ip, offset, len, ac, NULL);
if (error)
return error;
@ -968,7 +985,8 @@ int
xfs_collapse_file_space(
struct xfs_inode *ip,
xfs_off_t offset,
xfs_off_t len)
xfs_off_t len,
struct xfs_zone_alloc_ctx *ac)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp;
@ -981,7 +999,7 @@ xfs_collapse_file_space(
trace_xfs_collapse_file_space(ip);
error = xfs_free_file_space(ip, offset, len);
error = xfs_free_file_space(ip, offset, len, ac);
if (error)
return error;

View file

@ -15,6 +15,7 @@ struct xfs_inode;
struct xfs_mount;
struct xfs_trans;
struct xfs_bmalloca;
struct xfs_zone_alloc_ctx;
#ifdef CONFIG_XFS_RT
int xfs_bmap_rtalloc(struct xfs_bmalloca *ap);
@ -31,7 +32,8 @@ xfs_bmap_rtalloc(struct xfs_bmalloca *ap)
#endif /* CONFIG_XFS_RT */
void xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, int whichfork,
xfs_off_t start_byte, xfs_off_t end_byte);
xfs_off_t start_byte, xfs_off_t end_byte,
struct xfs_zone_alloc_ctx *ac);
struct kgetbmap {
__s64 bmv_offset; /* file offset of segment in blocks */
@ -54,13 +56,13 @@ int xfs_bmap_last_extent(struct xfs_trans *tp, struct xfs_inode *ip,
/* preallocation and hole punch interface */
int xfs_alloc_file_space(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t len);
xfs_off_t len);
int xfs_free_file_space(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t len);
xfs_off_t len, struct xfs_zone_alloc_ctx *ac);
int xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset,
xfs_off_t len);
xfs_off_t len, struct xfs_zone_alloc_ctx *ac);
int xfs_insert_file_space(struct xfs_inode *, xfs_off_t offset,
xfs_off_t len);
xfs_off_t len);
/* EOF block manipulation functions */
bool xfs_can_free_eofblocks(struct xfs_inode *ip);

View file

@ -55,27 +55,6 @@ static inline bool xfs_buf_is_uncached(struct xfs_buf *bp)
return bp->b_rhash_key == XFS_BUF_DADDR_NULL;
}
static inline int
xfs_buf_is_vmapped(
struct xfs_buf *bp)
{
/*
* Return true if the buffer is vmapped.
*
* b_addr is null if the buffer is not mapped, but the code is clever
* enough to know it doesn't have to map a single page, so the check has
* to be both for b_addr and bp->b_page_count > 1.
*/
return bp->b_addr && bp->b_page_count > 1;
}
static inline int
xfs_buf_vmap_len(
struct xfs_buf *bp)
{
return (bp->b_page_count * PAGE_SIZE);
}
/*
* When we mark a buffer stale, we remove the buffer from the LRU and clear the
* b_lru_ref count so that the buffer is freed immediately when the buffer
@ -109,38 +88,168 @@ xfs_buf_stale(
spin_unlock(&bp->b_lock);
}
static int
xfs_buf_get_maps(
struct xfs_buf *bp,
int map_count)
static void
xfs_buf_free_callback(
struct callback_head *cb)
{
ASSERT(bp->b_maps == NULL);
bp->b_map_count = map_count;
struct xfs_buf *bp = container_of(cb, struct xfs_buf, b_rcu);
if (map_count == 1) {
bp->b_maps = &bp->__b_map;
return 0;
}
bp->b_maps = kzalloc(map_count * sizeof(struct xfs_buf_map),
GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
if (!bp->b_maps)
return -ENOMEM;
return 0;
if (bp->b_maps != &bp->__b_map)
kfree(bp->b_maps);
kmem_cache_free(xfs_buf_cache, bp);
}
static void
xfs_buf_free_maps(
struct xfs_buf *bp)
xfs_buf_free(
struct xfs_buf *bp)
{
if (bp->b_maps != &bp->__b_map) {
kfree(bp->b_maps);
bp->b_maps = NULL;
}
unsigned int size = BBTOB(bp->b_length);
trace_xfs_buf_free(bp, _RET_IP_);
ASSERT(list_empty(&bp->b_lru));
if (!xfs_buftarg_is_mem(bp->b_target) && size >= PAGE_SIZE)
mm_account_reclaimed_pages(howmany(size, PAGE_SHIFT));
if (is_vmalloc_addr(bp->b_addr))
vfree(bp->b_addr);
else if (bp->b_flags & _XBF_KMEM)
kfree(bp->b_addr);
else
folio_put(virt_to_folio(bp->b_addr));
call_rcu(&bp->b_rcu, xfs_buf_free_callback);
}
static int
_xfs_buf_alloc(
xfs_buf_alloc_kmem(
struct xfs_buf *bp,
size_t size,
gfp_t gfp_mask)
{
ASSERT(is_power_of_2(size));
ASSERT(size < PAGE_SIZE);
bp->b_addr = kmalloc(size, gfp_mask | __GFP_NOFAIL);
if (!bp->b_addr)
return -ENOMEM;
/*
* Slab guarantees that we get back naturally aligned allocations for
* power of two sizes. Keep this check as the canary in the coal mine
* if anything changes in slab.
*/
if (WARN_ON_ONCE(!IS_ALIGNED((unsigned long)bp->b_addr, size))) {
kfree(bp->b_addr);
bp->b_addr = NULL;
return -ENOMEM;
}
bp->b_flags |= _XBF_KMEM;
trace_xfs_buf_backing_kmem(bp, _RET_IP_);
return 0;
}
/*
* Allocate backing memory for a buffer.
*
* For tmpfs-backed buffers used by in-memory btrees this directly maps the
* tmpfs page cache folios.
*
* For real file system buffers there are three different kinds of backing memory:
*
* The first type backs the buffer by a kmalloc allocation. This is done for
* less than PAGE_SIZE allocations to avoid wasting memory.
*
* The second type is a single folio buffer - this may be a high order folio or
* just a single page sized folio, but either way they get treated the same way
* by the rest of the code - the buffer memory spans a single contiguous memory
* region that we don't have to map and unmap to access the data directly.
*
* The third type of buffer is the vmalloc()d buffer. This provides the buffer
* with the required contiguous memory region but backed by discontiguous
* physical pages.
*/
static int
xfs_buf_alloc_backing_mem(
struct xfs_buf *bp,
xfs_buf_flags_t flags)
{
size_t size = BBTOB(bp->b_length);
gfp_t gfp_mask = GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOWARN;
struct folio *folio;
if (xfs_buftarg_is_mem(bp->b_target))
return xmbuf_map_backing_mem(bp);
/* Assure zeroed buffer for non-read cases. */
if (!(flags & XBF_READ))
gfp_mask |= __GFP_ZERO;
if (flags & XBF_READ_AHEAD)
gfp_mask |= __GFP_NORETRY;
/*
* For buffers smaller than PAGE_SIZE use a kmalloc allocation if that
* is properly aligned. The slab allocator now guarantees an aligned
* allocation for all power of two sizes, which matches most of the
* smaller than PAGE_SIZE buffers used by XFS.
*/
if (size < PAGE_SIZE && is_power_of_2(size))
return xfs_buf_alloc_kmem(bp, size, gfp_mask);
/*
* Don't bother with the retry loop for single PAGE allocations: vmalloc
* won't do any better.
*/
if (size <= PAGE_SIZE)
gfp_mask |= __GFP_NOFAIL;
/*
* Optimistically attempt a single high order folio allocation for
* larger than PAGE_SIZE buffers.
*
* Allocating a high order folio makes the assumption that buffers are a
* power-of-2 size, matching the power-of-2 folio sizes available.
*
* The exception here is user xattr data buffers, which can be arbitrarily
* sized up to 64kB plus structure metadata; skip straight to the vmalloc
* path for them instead of wasting memory here.
*/
if (size > PAGE_SIZE) {
if (!is_power_of_2(size))
goto fallback;
gfp_mask &= ~__GFP_DIRECT_RECLAIM;
gfp_mask |= __GFP_NORETRY;
}
folio = folio_alloc(gfp_mask, get_order(size));
if (!folio) {
if (size <= PAGE_SIZE)
return -ENOMEM;
trace_xfs_buf_backing_fallback(bp, _RET_IP_);
goto fallback;
}
bp->b_addr = folio_address(folio);
trace_xfs_buf_backing_folio(bp, _RET_IP_);
return 0;
fallback:
for (;;) {
bp->b_addr = __vmalloc(size, gfp_mask);
if (bp->b_addr)
break;
if (flags & XBF_READ_AHEAD)
return -ENOMEM;
XFS_STATS_INC(bp->b_mount, xb_page_retries);
memalloc_retry_wait(gfp_mask);
}
trace_xfs_buf_backing_vmalloc(bp, _RET_IP_);
return 0;
}
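
The comment above describes a purely size-driven choice of backing memory. A rough user-space sketch of that dispatch follows; the page size, helper names and return strings are illustrative assumptions, and error handling is reduced to a single "folio allocation failed" flag.

#include <stdbool.h>
#include <stdio.h>

#define SKETCH_PAGE_SIZE 4096u	/* assumed page size for the example */

static bool is_pow2(unsigned int v)
{
	return v && !(v & (v - 1));
}

/*
 * Size-based backing selection, mirroring the new allocation policy: small
 * power-of-two buffers come from kmalloc, power-of-two buffers of at least
 * a page try a single (possibly high order) folio, and anything else - or a
 * failed optimistic folio allocation - ends up in vmalloc.
 */
static const char *backing_for(unsigned int size, bool folio_alloc_fails)
{
	if (size < SKETCH_PAGE_SIZE && is_pow2(size))
		return "kmalloc";
	if (size > SKETCH_PAGE_SIZE && !is_pow2(size))
		return "vmalloc";		/* e.g. oddly sized xattr buffers */
	if (folio_alloc_fails && size > SKETCH_PAGE_SIZE)
		return "vmalloc";		/* optimistic folio attempt failed */
	return "single folio";
}

int main(void)
{
	printf("%u -> %s\n", 512u, backing_for(512, false));
	printf("%u -> %s\n", 65536u, backing_for(65536, false));
	printf("%u -> %s\n", 68000u, backing_for(68000, false));
	return 0;
}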
static int
xfs_buf_alloc(
struct xfs_buftarg *target,
struct xfs_buf_map *map,
int nmaps,
@ -159,7 +268,7 @@ _xfs_buf_alloc(
* We don't want certain flags to appear in b_flags unless they are
* specifically set by later operations on the buffer.
*/
flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD);
flags &= ~(XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD);
/*
* A new buffer is held and locked by the owner. This ensures that the
@ -179,15 +288,14 @@ _xfs_buf_alloc(
bp->b_target = target;
bp->b_mount = target->bt_mount;
bp->b_flags = flags;
error = xfs_buf_get_maps(bp, nmaps);
if (error) {
kmem_cache_free(xfs_buf_cache, bp);
return error;
}
bp->b_rhash_key = map[0].bm_bn;
bp->b_length = 0;
bp->b_map_count = nmaps;
if (nmaps == 1)
bp->b_maps = &bp->__b_map;
else
bp->b_maps = kcalloc(nmaps, sizeof(struct xfs_buf_map),
GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
for (i = 0; i < nmaps; i++) {
bp->b_maps[i].bm_bn = map[i].bm_bn;
bp->b_maps[i].bm_len = map[i].bm_len;
@ -200,198 +308,16 @@ _xfs_buf_alloc(
XFS_STATS_INC(bp->b_mount, xb_create);
trace_xfs_buf_init(bp, _RET_IP_);
error = xfs_buf_alloc_backing_mem(bp, flags);
if (error) {
xfs_buf_free(bp);
return error;
}
*bpp = bp;
return 0;
}
static void
xfs_buf_free_pages(
struct xfs_buf *bp)
{
uint i;
ASSERT(bp->b_flags & _XBF_PAGES);
if (xfs_buf_is_vmapped(bp))
vm_unmap_ram(bp->b_addr, bp->b_page_count);
for (i = 0; i < bp->b_page_count; i++) {
if (bp->b_pages[i])
__free_page(bp->b_pages[i]);
}
mm_account_reclaimed_pages(bp->b_page_count);
if (bp->b_pages != bp->b_page_array)
kfree(bp->b_pages);
bp->b_pages = NULL;
bp->b_flags &= ~_XBF_PAGES;
}
static void
xfs_buf_free_callback(
struct callback_head *cb)
{
struct xfs_buf *bp = container_of(cb, struct xfs_buf, b_rcu);
xfs_buf_free_maps(bp);
kmem_cache_free(xfs_buf_cache, bp);
}
static void
xfs_buf_free(
struct xfs_buf *bp)
{
trace_xfs_buf_free(bp, _RET_IP_);
ASSERT(list_empty(&bp->b_lru));
if (xfs_buftarg_is_mem(bp->b_target))
xmbuf_unmap_page(bp);
else if (bp->b_flags & _XBF_PAGES)
xfs_buf_free_pages(bp);
else if (bp->b_flags & _XBF_KMEM)
kfree(bp->b_addr);
call_rcu(&bp->b_rcu, xfs_buf_free_callback);
}
static int
xfs_buf_alloc_kmem(
struct xfs_buf *bp,
xfs_buf_flags_t flags)
{
gfp_t gfp_mask = GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL;
size_t size = BBTOB(bp->b_length);
/* Assure zeroed buffer for non-read cases. */
if (!(flags & XBF_READ))
gfp_mask |= __GFP_ZERO;
bp->b_addr = kmalloc(size, gfp_mask);
if (!bp->b_addr)
return -ENOMEM;
if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) !=
((unsigned long)bp->b_addr & PAGE_MASK)) {
/* b_addr spans two pages - use alloc_page instead */
kfree(bp->b_addr);
bp->b_addr = NULL;
return -ENOMEM;
}
bp->b_offset = offset_in_page(bp->b_addr);
bp->b_pages = bp->b_page_array;
bp->b_pages[0] = kmem_to_page(bp->b_addr);
bp->b_page_count = 1;
bp->b_flags |= _XBF_KMEM;
return 0;
}
static int
xfs_buf_alloc_pages(
struct xfs_buf *bp,
xfs_buf_flags_t flags)
{
gfp_t gfp_mask = GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOWARN;
long filled = 0;
if (flags & XBF_READ_AHEAD)
gfp_mask |= __GFP_NORETRY;
/* Make sure that we have a page list */
bp->b_page_count = DIV_ROUND_UP(BBTOB(bp->b_length), PAGE_SIZE);
if (bp->b_page_count <= XB_PAGES) {
bp->b_pages = bp->b_page_array;
} else {
bp->b_pages = kzalloc(sizeof(struct page *) * bp->b_page_count,
gfp_mask);
if (!bp->b_pages)
return -ENOMEM;
}
bp->b_flags |= _XBF_PAGES;
/* Assure zeroed buffer for non-read cases. */
if (!(flags & XBF_READ))
gfp_mask |= __GFP_ZERO;
/*
* Bulk filling of pages can take multiple calls. Not filling the entire
* array is not an allocation failure, so don't back off if we get at
* least one extra page.
*/
for (;;) {
long last = filled;
filled = alloc_pages_bulk(gfp_mask, bp->b_page_count,
bp->b_pages);
if (filled == bp->b_page_count) {
XFS_STATS_INC(bp->b_mount, xb_page_found);
break;
}
if (filled != last)
continue;
if (flags & XBF_READ_AHEAD) {
xfs_buf_free_pages(bp);
return -ENOMEM;
}
XFS_STATS_INC(bp->b_mount, xb_page_retries);
memalloc_retry_wait(gfp_mask);
}
return 0;
}
/*
* Map buffer into kernel address-space if necessary.
*/
STATIC int
_xfs_buf_map_pages(
struct xfs_buf *bp,
xfs_buf_flags_t flags)
{
ASSERT(bp->b_flags & _XBF_PAGES);
if (bp->b_page_count == 1) {
/* A single page buffer is always mappable */
bp->b_addr = page_address(bp->b_pages[0]);
} else if (flags & XBF_UNMAPPED) {
bp->b_addr = NULL;
} else {
int retried = 0;
unsigned nofs_flag;
/*
* vm_map_ram() will allocate auxiliary structures (e.g.
* pagetables) with GFP_KERNEL, yet we often under a scoped nofs
* context here. Mixing GFP_KERNEL with GFP_NOFS allocations
* from the same call site that can be run from both above and
* below memory reclaim causes lockdep false positives. Hence we
* always need to force this allocation to nofs context because
* we can't pass __GFP_NOLOCKDEP down to auxillary structures to
* prevent false positive lockdep reports.
*
* XXX(dgc): I think dquot reclaim is the only place we can get
* to this function from memory reclaim context now. If we fix
* that like we've fixed inode reclaim to avoid writeback from
* reclaim, this nofs wrapping can go away.
*/
nofs_flag = memalloc_nofs_save();
do {
bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
-1);
if (bp->b_addr)
break;
vm_unmap_aliases();
} while (retried++ <= 1);
memalloc_nofs_restore(nofs_flag);
if (!bp->b_addr)
return -ENOMEM;
}
return 0;
}
/*
* Finding and Reading Buffers
*/
@ -507,7 +433,7 @@ xfs_buf_find_lock(
return -ENOENT;
}
ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
bp->b_flags &= _XBF_KMEM;
bp->b_ops = NULL;
}
return 0;
@ -575,25 +501,10 @@ xfs_buf_find_insert(
struct xfs_buf *bp;
int error;
error = _xfs_buf_alloc(btp, map, nmaps, flags, &new_bp);
error = xfs_buf_alloc(btp, map, nmaps, flags, &new_bp);
if (error)
goto out_drop_pag;
if (xfs_buftarg_is_mem(new_bp->b_target)) {
error = xmbuf_map_page(new_bp);
} else if (BBTOB(new_bp->b_length) >= PAGE_SIZE ||
xfs_buf_alloc_kmem(new_bp, flags) < 0) {
/*
* For buffers that fit entirely within a single page, first
* attempt to allocate the memory from the heap to minimise
* memory usage. If we can't get heap memory for these small
* buffers, we fall back to using the page allocator.
*/
error = xfs_buf_alloc_pages(new_bp, flags);
}
if (error)
goto out_free_buf;
/* The new buffer keeps the perag reference until it is freed. */
new_bp->b_pag = pag;
@ -704,18 +615,6 @@ xfs_buf_get_map(
xfs_perag_put(pag);
}
/* We do not hold a perag reference anymore. */
if (!bp->b_addr) {
error = _xfs_buf_map_pages(bp, flags);
if (unlikely(error)) {
xfs_warn_ratelimited(btp->bt_mount,
"%s: failed to map %u pages", __func__,
bp->b_page_count);
xfs_buf_relse(bp);
return error;
}
}
/*
* Clear b_error if this is a lookup from a caller that doesn't expect
* valid data to be found in the buffer.
@ -903,7 +802,6 @@ xfs_buf_read_uncached(
struct xfs_buftarg *target,
xfs_daddr_t daddr,
size_t numblks,
xfs_buf_flags_t flags,
struct xfs_buf **bpp,
const struct xfs_buf_ops *ops)
{
@ -912,7 +810,7 @@ xfs_buf_read_uncached(
*bpp = NULL;
error = xfs_buf_get_uncached(target, numblks, flags, &bp);
error = xfs_buf_get_uncached(target, numblks, &bp);
if (error)
return error;
@ -938,42 +836,14 @@ int
xfs_buf_get_uncached(
struct xfs_buftarg *target,
size_t numblks,
xfs_buf_flags_t flags,
struct xfs_buf **bpp)
{
int error;
struct xfs_buf *bp;
DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks);
/* there are currently no valid flags for xfs_buf_get_uncached */
ASSERT(flags == 0);
*bpp = NULL;
error = _xfs_buf_alloc(target, &map, 1, flags, &bp);
if (error)
return error;
if (xfs_buftarg_is_mem(bp->b_target))
error = xmbuf_map_page(bp);
else
error = xfs_buf_alloc_pages(bp, flags);
if (error)
goto fail_free_buf;
error = _xfs_buf_map_pages(bp, 0);
if (unlikely(error)) {
xfs_warn(target->bt_mount,
"%s: failed to map pages", __func__);
goto fail_free_buf;
}
trace_xfs_buf_get_uncached(bp, _RET_IP_);
*bpp = bp;
return 0;
fail_free_buf:
xfs_buf_free(bp);
error = xfs_buf_alloc(target, &map, 1, 0, bpp);
if (!error)
trace_xfs_buf_get_uncached(*bpp, _RET_IP_);
return error;
}
@ -1299,9 +1169,9 @@ __xfs_buf_ioend(
trace_xfs_buf_iodone(bp, _RET_IP_);
if (bp->b_flags & XBF_READ) {
if (!bp->b_error && xfs_buf_is_vmapped(bp))
if (!bp->b_error && is_vmalloc_addr(bp->b_addr))
invalidate_kernel_vmap_range(bp->b_addr,
xfs_buf_vmap_len(bp));
roundup(BBTOB(bp->b_length), PAGE_SIZE));
if (!bp->b_error && bp->b_ops)
bp->b_ops->verify_read(bp);
if (!bp->b_error)
@ -1462,29 +1332,48 @@ static void
xfs_buf_submit_bio(
struct xfs_buf *bp)
{
unsigned int size = BBTOB(bp->b_length);
unsigned int map = 0, p;
unsigned int map = 0;
struct blk_plug plug;
struct bio *bio;
bio = bio_alloc(bp->b_target->bt_bdev, bp->b_page_count,
xfs_buf_bio_op(bp), GFP_NOIO);
if (is_vmalloc_addr(bp->b_addr)) {
unsigned int size = BBTOB(bp->b_length);
unsigned int alloc_size = roundup(size, PAGE_SIZE);
void *data = bp->b_addr;
bio = bio_alloc(bp->b_target->bt_bdev, alloc_size >> PAGE_SHIFT,
xfs_buf_bio_op(bp), GFP_NOIO);
do {
unsigned int len = min(size, PAGE_SIZE);
ASSERT(offset_in_page(data) == 0);
__bio_add_page(bio, vmalloc_to_page(data), len, 0);
data += len;
size -= len;
} while (size);
flush_kernel_vmap_range(bp->b_addr, alloc_size);
} else {
/*
* Single folio or slab allocation. Must be contiguous and thus
* only a single bvec is needed.
*
* This uses the page based bio add helper for now as that is
* the lowest common denominator between folios and slab
* allocations. To be replaced with a better block layer
* helper soon (hopefully).
*/
bio = bio_alloc(bp->b_target->bt_bdev, 1, xfs_buf_bio_op(bp),
GFP_NOIO);
__bio_add_page(bio, virt_to_page(bp->b_addr),
BBTOB(bp->b_length),
offset_in_page(bp->b_addr));
}
bio->bi_private = bp;
bio->bi_end_io = xfs_buf_bio_end_io;
if (bp->b_flags & _XBF_KMEM) {
__bio_add_page(bio, virt_to_page(bp->b_addr), size,
bp->b_offset);
} else {
for (p = 0; p < bp->b_page_count; p++)
__bio_add_page(bio, bp->b_pages[p], PAGE_SIZE, 0);
bio->bi_iter.bi_size = size; /* limit to the actual size used */
if (xfs_buf_is_vmapped(bp))
flush_kernel_vmap_range(bp->b_addr,
xfs_buf_vmap_len(bp));
}
/*
* If there is more than one map segment, split out a new bio for each
* map except for the last one. The last map is handled by the
@ -1611,47 +1500,6 @@ xfs_buf_submit(
xfs_buf_submit_bio(bp);
}
void *
xfs_buf_offset(
struct xfs_buf *bp,
size_t offset)
{
struct page *page;
if (bp->b_addr)
return bp->b_addr + offset;
page = bp->b_pages[offset >> PAGE_SHIFT];
return page_address(page) + (offset & (PAGE_SIZE-1));
}
void
xfs_buf_zero(
struct xfs_buf *bp,
size_t boff,
size_t bsize)
{
size_t bend;
bend = boff + bsize;
while (boff < bend) {
struct page *page;
int page_index, page_offset, csize;
page_index = (boff + bp->b_offset) >> PAGE_SHIFT;
page_offset = (boff + bp->b_offset) & ~PAGE_MASK;
page = bp->b_pages[page_index];
csize = min_t(size_t, PAGE_SIZE - page_offset,
BBTOB(bp->b_length) - boff);
ASSERT((csize + page_offset) <= PAGE_SIZE);
memset(page_address(page) + page_offset, 0, csize);
boff += csize;
}
}
/*
* Log a message about and stale a buffer that a caller has decided is corrupt.
*

View file

@ -36,7 +36,6 @@ struct xfs_buf;
#define _XBF_LOGRECOVERY (1u << 18)/* log recovery buffer */
/* flags used only internally */
#define _XBF_PAGES (1u << 20)/* backed by refcounted pages */
#define _XBF_KMEM (1u << 21)/* backed by heap memory */
#define _XBF_DELWRI_Q (1u << 22)/* buffer on a delwri queue */
@ -48,7 +47,6 @@ struct xfs_buf;
#define XBF_LIVESCAN (1u << 28)
#define XBF_INCORE (1u << 29)/* lookup only, return if found in cache */
#define XBF_TRYLOCK (1u << 30)/* lock requested, but do not wait */
#define XBF_UNMAPPED (1u << 31)/* do not map the buffer */
typedef unsigned int xfs_buf_flags_t;
@ -62,14 +60,12 @@ typedef unsigned int xfs_buf_flags_t;
{ XBF_STALE, "STALE" }, \
{ XBF_WRITE_FAIL, "WRITE_FAIL" }, \
{ _XBF_LOGRECOVERY, "LOG_RECOVERY" }, \
{ _XBF_PAGES, "PAGES" }, \
{ _XBF_KMEM, "KMEM" }, \
{ _XBF_DELWRI_Q, "DELWRI_Q" }, \
/* The following interface flags should never be set */ \
{ XBF_LIVESCAN, "LIVESCAN" }, \
{ XBF_INCORE, "INCORE" }, \
{ XBF_TRYLOCK, "TRYLOCK" }, \
{ XBF_UNMAPPED, "UNMAPPED" }
{ XBF_TRYLOCK, "TRYLOCK" }
/*
* Internal state flags.
@ -124,8 +120,6 @@ struct xfs_buftarg {
struct xfs_buf_cache bt_cache[];
};
#define XB_PAGES 2
struct xfs_buf_map {
xfs_daddr_t bm_bn; /* block number for I/O */
int bm_len; /* size of I/O */
@ -187,15 +181,10 @@ struct xfs_buf {
struct xfs_buf_log_item *b_log_item;
struct list_head b_li_list; /* Log items list head */
struct xfs_trans *b_transp;
struct page **b_pages; /* array of page pointers */
struct page *b_page_array[XB_PAGES]; /* inline pages */
struct xfs_buf_map *b_maps; /* compound buffer map */
struct xfs_buf_map __b_map; /* inline compound buffer map */
int b_map_count;
atomic_t b_pin_count; /* pin count */
unsigned int b_page_count; /* size of page array */
unsigned int b_offset; /* page offset of b_addr,
only for _XBF_KMEM buffers */
int b_error; /* error code on I/O */
void (*b_iodone)(struct xfs_buf *bp);
@ -284,9 +273,9 @@ xfs_buf_readahead(
}
int xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks,
xfs_buf_flags_t flags, struct xfs_buf **bpp);
struct xfs_buf **bpp);
int xfs_buf_read_uncached(struct xfs_buftarg *target, xfs_daddr_t daddr,
size_t numblks, xfs_buf_flags_t flags, struct xfs_buf **bpp,
size_t numblks, struct xfs_buf **bpp,
const struct xfs_buf_ops *ops);
int _xfs_buf_read(struct xfs_buf *bp);
void xfs_buf_hold(struct xfs_buf *bp);
@ -315,12 +304,20 @@ extern void __xfs_buf_ioerror(struct xfs_buf *bp, int error,
#define xfs_buf_ioerror(bp, err) __xfs_buf_ioerror((bp), (err), __this_address)
extern void xfs_buf_ioerror_alert(struct xfs_buf *bp, xfs_failaddr_t fa);
void xfs_buf_ioend_fail(struct xfs_buf *);
void xfs_buf_zero(struct xfs_buf *bp, size_t boff, size_t bsize);
void __xfs_buf_mark_corrupt(struct xfs_buf *bp, xfs_failaddr_t fa);
#define xfs_buf_mark_corrupt(bp) __xfs_buf_mark_corrupt((bp), __this_address)
/* Buffer Utility Routines */
extern void *xfs_buf_offset(struct xfs_buf *, size_t);
static inline void *xfs_buf_offset(struct xfs_buf *bp, size_t offset)
{
return bp->b_addr + offset;
}
static inline void xfs_buf_zero(struct xfs_buf *bp, size_t boff, size_t bsize)
{
memset(bp->b_addr + boff, 0, bsize);
}
extern void xfs_buf_stale(struct xfs_buf *bp);
/* Delayed Write Buffer Routines */

View file

@ -57,24 +57,6 @@ xfs_buf_log_format_size(
(blfp->blf_map_size * sizeof(blfp->blf_data_map[0]));
}
static inline bool
xfs_buf_item_straddle(
struct xfs_buf *bp,
uint offset,
int first_bit,
int nbits)
{
void *first, *last;
first = xfs_buf_offset(bp, offset + (first_bit << XFS_BLF_SHIFT));
last = xfs_buf_offset(bp,
offset + ((first_bit + nbits) << XFS_BLF_SHIFT));
if (last - first != nbits * XFS_BLF_CHUNK)
return true;
return false;
}
/*
* Return the number of log iovecs and space needed to log the given buf log
* item segment.
@ -91,11 +73,8 @@ xfs_buf_item_size_segment(
int *nvecs,
int *nbytes)
{
struct xfs_buf *bp = bip->bli_buf;
int first_bit;
int nbits;
int next_bit;
int last_bit;
first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0);
if (first_bit == -1)
@ -108,15 +87,6 @@ xfs_buf_item_size_segment(
nbits = xfs_contig_bits(blfp->blf_data_map,
blfp->blf_map_size, first_bit);
ASSERT(nbits > 0);
/*
* Straddling a page is rare because we don't log contiguous
* chunks of unmapped buffers anywhere.
*/
if (nbits > 1 &&
xfs_buf_item_straddle(bp, offset, first_bit, nbits))
goto slow_scan;
(*nvecs)++;
*nbytes += nbits * XFS_BLF_CHUNK;
@ -131,40 +101,6 @@ xfs_buf_item_size_segment(
} while (first_bit != -1);
return;
slow_scan:
/* Count the first bit we jumped out of the above loop from */
(*nvecs)++;
*nbytes += XFS_BLF_CHUNK;
last_bit = first_bit;
while (last_bit != -1) {
/*
* This takes the bit number to start looking from and
* returns the next set bit from there. It returns -1
* if there are no more bits set or the start bit is
* beyond the end of the bitmap.
*/
next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
last_bit + 1);
/*
* If we run out of bits, leave the loop,
* else if we find a new set of bits bump the number of vecs,
* else keep scanning the current set of bits.
*/
if (next_bit == -1) {
break;
} else if (next_bit != last_bit + 1 ||
xfs_buf_item_straddle(bp, offset, first_bit, nbits)) {
last_bit = next_bit;
first_bit = next_bit;
(*nvecs)++;
nbits = 1;
} else {
last_bit++;
nbits++;
}
*nbytes += XFS_BLF_CHUNK;
}
}
/*
@ -277,8 +213,6 @@ xfs_buf_item_format_segment(
struct xfs_buf *bp = bip->bli_buf;
uint base_size;
int first_bit;
int last_bit;
int next_bit;
uint nbits;
/* copy the flags across from the base format item */
@ -323,15 +257,6 @@ xfs_buf_item_format_segment(
nbits = xfs_contig_bits(blfp->blf_data_map,
blfp->blf_map_size, first_bit);
ASSERT(nbits > 0);
/*
* Straddling a page is rare because we don't log contiguous
* chunks of unmapped buffers anywhere.
*/
if (nbits > 1 &&
xfs_buf_item_straddle(bp, offset, first_bit, nbits))
goto slow_scan;
xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
first_bit, nbits);
blfp->blf_size++;
@ -347,45 +272,6 @@ xfs_buf_item_format_segment(
} while (first_bit != -1);
return;
slow_scan:
ASSERT(bp->b_addr == NULL);
last_bit = first_bit;
nbits = 1;
for (;;) {
/*
* This takes the bit number to start looking from and
* returns the next set bit from there. It returns -1
* if there are no more bits set or the start bit is
* beyond the end of the bitmap.
*/
next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
(uint)last_bit + 1);
/*
* If we run out of bits fill in the last iovec and get out of
* the loop. Else if we start a new set of bits then fill in
* the iovec for the series we were looking at and start
* counting the bits in the new one. Else we're still in the
* same set of bits so just keep counting and scanning.
*/
if (next_bit == -1) {
xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
first_bit, nbits);
blfp->blf_size++;
break;
} else if (next_bit != last_bit + 1 ||
xfs_buf_item_straddle(bp, offset, first_bit, nbits)) {
xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
first_bit, nbits);
blfp->blf_size++;
first_bit = next_bit;
last_bit = next_bit;
nbits = 1;
} else {
last_bit++;
nbits++;
}
}
}
/*

View file

@ -1006,7 +1006,6 @@ xlog_recover_buf_commit_pass2(
struct xfs_mount *mp = log->l_mp;
struct xfs_buf *bp;
int error;
uint buf_flags;
xfs_lsn_t lsn;
/*
@ -1025,13 +1024,8 @@ xlog_recover_buf_commit_pass2(
}
trace_xfs_log_recover_buf_recover(log, buf_f);
buf_flags = 0;
if (buf_f->blf_flags & XFS_BLF_INODE_BUF)
buf_flags |= XBF_UNMAPPED;
error = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
buf_flags, &bp, NULL);
0, &bp, NULL);
if (error)
return error;

View file

@ -74,7 +74,7 @@ xmbuf_alloc(
/*
* We don't want to bother with kmapping data during repair, so don't
* allow highmem pages to back this mapping.
* allow highmem folios to back this mapping.
*/
mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL);
@ -127,14 +127,13 @@ xmbuf_free(
kfree(btp);
}
/* Directly map a shmem page into the buffer cache. */
/* Directly map a shmem folio into the buffer cache. */
int
xmbuf_map_page(
xmbuf_map_backing_mem(
struct xfs_buf *bp)
{
struct inode *inode = file_inode(bp->b_target->bt_file);
struct folio *folio = NULL;
struct page *page;
loff_t pos = BBTOB(xfs_buf_daddr(bp));
int error;
@ -159,39 +158,17 @@ xmbuf_map_page(
return -EIO;
}
page = folio_file_page(folio, pos >> PAGE_SHIFT);
/*
* Mark the page dirty so that it won't be reclaimed once we drop the
* (potentially last) reference in xmbuf_unmap_page.
* Mark the folio dirty so that it won't be reclaimed once we drop the
* (potentially last) reference in xfs_buf_free.
*/
set_page_dirty(page);
unlock_page(page);
folio_set_dirty(folio);
folio_unlock(folio);
bp->b_addr = page_address(page);
bp->b_pages = bp->b_page_array;
bp->b_pages[0] = page;
bp->b_page_count = 1;
bp->b_addr = folio_address(folio);
return 0;
}
/* Unmap a shmem page that was mapped into the buffer cache. */
void
xmbuf_unmap_page(
struct xfs_buf *bp)
{
struct page *page = bp->b_pages[0];
ASSERT(xfs_buftarg_is_mem(bp->b_target));
put_page(page);
bp->b_addr = NULL;
bp->b_pages[0] = NULL;
bp->b_pages = NULL;
bp->b_page_count = 0;
}
/* Is this a valid daddr within the buftarg? */
bool
xmbuf_verify_daddr(
@ -205,7 +182,7 @@ xmbuf_verify_daddr(
return daddr < (inode->i_sb->s_maxbytes >> BBSHIFT);
}
/* Discard the page backing this buffer. */
/* Discard the folio backing this buffer. */
static void
xmbuf_stale(
struct xfs_buf *bp)
@ -220,7 +197,7 @@ xmbuf_stale(
}
/*
* Finalize a buffer -- discard the backing page if it's stale, or run the
* Finalize a buffer -- discard the backing folio if it's stale, or run the
* write verifier to detect problems.
*/
int

View file

@ -19,16 +19,14 @@ int xmbuf_alloc(struct xfs_mount *mp, const char *descr,
struct xfs_buftarg **btpp);
void xmbuf_free(struct xfs_buftarg *btp);
int xmbuf_map_page(struct xfs_buf *bp);
void xmbuf_unmap_page(struct xfs_buf *bp);
bool xmbuf_verify_daddr(struct xfs_buftarg *btp, xfs_daddr_t daddr);
void xmbuf_trans_bdetach(struct xfs_trans *tp, struct xfs_buf *bp);
int xmbuf_finalize(struct xfs_buf *bp);
#else
# define xfs_buftarg_is_mem(...) (false)
# define xmbuf_map_page(...) (-ENOMEM)
# define xmbuf_unmap_page(...) ((void)0)
# define xmbuf_verify_daddr(...) (false)
#endif /* CONFIG_XFS_MEMORY_BUFS */
int xmbuf_map_backing_mem(struct xfs_buf *bp);
#endif /* __XFS_BUF_MEM_H__ */

View file

@ -844,7 +844,8 @@ xfs_ioc_trim(
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
if (mp->m_rtdev_targp &&
if (mp->m_rtdev_targp && !xfs_has_zoned(mp) &&
bdev_max_discard_sectors(mp->m_rtdev_targp->bt_bdev))
rt_bdev = mp->m_rtdev_targp->bt_bdev;
if (!bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev) && !rt_bdev)

View file

@ -671,7 +671,7 @@ xfs_extent_busy_wait_all(
while ((pag = xfs_perag_next(mp, pag)))
xfs_extent_busy_wait_group(pag_group(pag));
if (xfs_has_rtgroups(mp))
if (xfs_has_rtgroups(mp) && !xfs_has_zoned(mp))
while ((rtg = xfs_rtgroup_next(mp, rtg)))
xfs_extent_busy_wait_group(rtg_group(rtg));
}

View file

@ -29,6 +29,7 @@
#include "xfs_inode.h"
#include "xfs_rtbitmap.h"
#include "xfs_rtgroup.h"
#include "xfs_zone_alloc.h"
struct kmem_cache *xfs_efi_cache;
struct kmem_cache *xfs_efd_cache;
@ -767,21 +768,35 @@ xfs_rtextent_free_finish_item(
trace_xfs_extent_free_deferred(mp, xefi);
if (!(xefi->xefi_flags & XFS_EFI_CANCELLED)) {
if (*rtgp != to_rtg(xefi->xefi_group)) {
*rtgp = to_rtg(xefi->xefi_group);
xfs_rtgroup_lock(*rtgp, XFS_RTGLOCK_BITMAP);
xfs_rtgroup_trans_join(tp, *rtgp,
XFS_RTGLOCK_BITMAP);
}
error = xfs_rtfree_blocks(tp, *rtgp,
xefi->xefi_startblock, xefi->xefi_blockcount);
if (xefi->xefi_flags & XFS_EFI_CANCELLED)
goto done;
if (*rtgp != to_rtg(xefi->xefi_group)) {
unsigned int lock_flags;
if (xfs_has_zoned(mp))
lock_flags = XFS_RTGLOCK_RMAP;
else
lock_flags = XFS_RTGLOCK_BITMAP;
*rtgp = to_rtg(xefi->xefi_group);
xfs_rtgroup_lock(*rtgp, lock_flags);
xfs_rtgroup_trans_join(tp, *rtgp, lock_flags);
}
if (xfs_has_zoned(mp)) {
error = xfs_zone_free_blocks(tp, *rtgp, xefi->xefi_startblock,
xefi->xefi_blockcount);
} else {
error = xfs_rtfree_blocks(tp, *rtgp, xefi->xefi_startblock,
xefi->xefi_blockcount);
}
if (error == -EAGAIN) {
xfs_efd_from_efi(efdp);
return error;
}
done:
xfs_efd_add_extent(efdp, xefi);
xfs_extent_free_cancel_item(item);
return error;

View file

@ -25,6 +25,8 @@
#include "xfs_iomap.h"
#include "xfs_reflink.h"
#include "xfs_file.h"
#include "xfs_aops.h"
#include "xfs_zone_alloc.h"
#include <linux/dax.h>
#include <linux/falloc.h>
@ -150,7 +152,7 @@ xfs_file_fsync(
* ensure newly written file data make it to disk before logging the new
* inode size in case of an extending write.
*/
if (XFS_IS_REALTIME_INODE(ip))
if (XFS_IS_REALTIME_INODE(ip) && mp->m_rtdev_targp != mp->m_ddev_targp)
error = blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev);
else if (mp->m_logdev_targp != mp->m_ddev_targp)
error = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
@ -360,7 +362,8 @@ xfs_file_write_zero_eof(
struct iov_iter *from,
unsigned int *iolock,
size_t count,
bool *drained_dio)
bool *drained_dio,
struct xfs_zone_alloc_ctx *ac)
{
struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host);
loff_t isize;
@ -414,7 +417,7 @@ xfs_file_write_zero_eof(
trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL);
error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, ac, NULL);
xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
return error;
@ -431,7 +434,8 @@ STATIC ssize_t
xfs_file_write_checks(
struct kiocb *iocb,
struct iov_iter *from,
unsigned int *iolock)
unsigned int *iolock,
struct xfs_zone_alloc_ctx *ac)
{
struct inode *inode = iocb->ki_filp->f_mapping->host;
size_t count = iov_iter_count(from);
@ -481,7 +485,7 @@ restart:
*/
if (iocb->ki_pos > i_size_read(inode)) {
error = xfs_file_write_zero_eof(iocb, from, iolock, count,
&drained_dio);
&drained_dio, ac);
if (error == 1)
goto restart;
if (error)
@ -491,6 +495,48 @@ restart:
return kiocb_modified(iocb);
}
static ssize_t
xfs_zoned_write_space_reserve(
struct xfs_inode *ip,
struct kiocb *iocb,
struct iov_iter *from,
unsigned int flags,
struct xfs_zone_alloc_ctx *ac)
{
loff_t count = iov_iter_count(from);
int error;
if (iocb->ki_flags & IOCB_NOWAIT)
flags |= XFS_ZR_NOWAIT;
/*
* Check the rlimit and LFS boundary first so that we don't over-reserve
* by possibly a lot.
*
* The generic write path will redo this check later, and it might have
* changed by then. If it got expanded we'll stick to our earlier
* smaller limit, and if it is decreased the new smaller limit will be
* used and our extra space reservation will be returned after finishing
* the write.
*/
error = generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, &count);
if (error)
return error;
/*
* Sloppily round up count to file system blocks.
*
* This will often reserve an extra block, but that avoids having to look
* at the start offset, which isn't stable for O_APPEND until taking the
* iolock. Also we need to reserve a block each for zeroing the old
* EOF block and the new start block if they are unaligned.
*
* Any remaining block will be returned after the write.
*/
return xfs_zoned_space_reserve(ip,
XFS_B_TO_FSB(ip->i_mount, count) + 1 + 2, flags, ac);
}
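
The reservation above is deliberately pessimistic: round the byte count up to blocks, add one slop block for the not-yet-stable start offset, and two more for zeroing an unaligned old EOF block and new start block. A worked user-space sketch of that arithmetic, assuming 4k filesystem blocks (the shift and helper name are illustrative):

#include <stdint.h>
#include <stdio.h>

#define FSB_SHIFT 12	/* assumed 4k filesystem blocks */

/* Round a byte count up to filesystem blocks (XFS_B_TO_FSB equivalent). */
static uint64_t b_to_fsb(uint64_t bytes)
{
	return (bytes + (1u << FSB_SHIFT) - 1) >> FSB_SHIFT;
}

/* Data blocks, plus one slop block, plus two blocks for unaligned zeroing. */
static uint64_t zoned_write_reservation(uint64_t count)
{
	return b_to_fsb(count) + 1 + 2;
}

int main(void)
{
	/* A 10000 byte write rounds to 3 blocks on 4k blocks, so reserve 6. */
	printf("%llu\n", (unsigned long long)zoned_write_reservation(10000));
	return 0;
}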
static int
xfs_dio_write_end_io(
struct kiocb *iocb,
@ -503,6 +549,9 @@ xfs_dio_write_end_io(
loff_t offset = iocb->ki_pos;
unsigned int nofs_flag;
ASSERT(!xfs_is_zoned_inode(ip) ||
!(flags & (IOMAP_DIO_UNWRITTEN | IOMAP_DIO_COW)));
trace_xfs_end_io_direct_write(ip, offset, size);
if (xfs_is_shutdown(ip->i_mount))
@ -582,14 +631,51 @@ static const struct iomap_dio_ops xfs_dio_write_ops = {
.end_io = xfs_dio_write_end_io,
};
static void
xfs_dio_zoned_submit_io(
const struct iomap_iter *iter,
struct bio *bio,
loff_t file_offset)
{
struct xfs_mount *mp = XFS_I(iter->inode)->i_mount;
struct xfs_zone_alloc_ctx *ac = iter->private;
xfs_filblks_t count_fsb;
struct iomap_ioend *ioend;
count_fsb = XFS_B_TO_FSB(mp, bio->bi_iter.bi_size);
if (count_fsb > ac->reserved_blocks) {
xfs_err(mp,
"allocation (%lld) larger than reservation (%lld).",
count_fsb, ac->reserved_blocks);
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
bio_io_error(bio);
return;
}
ac->reserved_blocks -= count_fsb;
bio->bi_end_io = xfs_end_bio;
ioend = iomap_init_ioend(iter->inode, bio, file_offset,
IOMAP_IOEND_DIRECT);
xfs_zone_alloc_and_submit(ioend, &ac->open_zone);
}
static const struct iomap_dio_ops xfs_dio_zoned_write_ops = {
.bio_set = &iomap_ioend_bioset,
.submit_io = xfs_dio_zoned_submit_io,
.end_io = xfs_dio_write_end_io,
};
/*
* Handle block aligned direct I/O writes
* Handle block aligned direct I/O writes.
*/
static noinline ssize_t
xfs_file_dio_write_aligned(
struct xfs_inode *ip,
struct kiocb *iocb,
struct iov_iter *from)
struct iov_iter *from,
const struct iomap_ops *ops,
const struct iomap_dio_ops *dops,
struct xfs_zone_alloc_ctx *ac)
{
unsigned int iolock = XFS_IOLOCK_SHARED;
ssize_t ret;
@ -597,7 +683,7 @@ xfs_file_dio_write_aligned(
ret = xfs_ilock_iocb_for_write(iocb, &iolock);
if (ret)
return ret;
ret = xfs_file_write_checks(iocb, from, &iolock);
ret = xfs_file_write_checks(iocb, from, &iolock, ac);
if (ret)
goto out_unlock;
@ -611,11 +697,31 @@ xfs_file_dio_write_aligned(
iolock = XFS_IOLOCK_SHARED;
}
trace_xfs_file_direct_write(iocb, from);
ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
&xfs_dio_write_ops, 0, NULL, 0);
ret = iomap_dio_rw(iocb, from, ops, dops, 0, ac, 0);
out_unlock:
if (iolock)
xfs_iunlock(ip, iolock);
xfs_iunlock(ip, iolock);
return ret;
}
/*
* Handle block aligned direct I/O writes to zoned devices.
*/
static noinline ssize_t
xfs_file_dio_write_zoned(
struct xfs_inode *ip,
struct kiocb *iocb,
struct iov_iter *from)
{
struct xfs_zone_alloc_ctx ac = { };
ssize_t ret;
ret = xfs_zoned_write_space_reserve(ip, iocb, from, 0, &ac);
if (ret < 0)
return ret;
ret = xfs_file_dio_write_aligned(ip, iocb, from,
&xfs_zoned_direct_write_iomap_ops,
&xfs_dio_zoned_write_ops, &ac);
xfs_zoned_space_unreserve(ip, &ac);
return ret;
}
@ -675,7 +781,7 @@ retry_exclusive:
goto out_unlock;
}
ret = xfs_file_write_checks(iocb, from, &iolock);
ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
if (ret)
goto out_unlock;
@ -721,9 +827,21 @@ xfs_file_dio_write(
/* direct I/O must be aligned to device logical sector size */
if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
return -EINVAL;
if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask)
/*
* For always COW inodes we also must check the alignment of each
* individual iovec segment, as they could end up with different
* I/Os due to the way bio_iov_iter_get_pages works, and we'd
* then overwrite an already written block.
*/
if (((iocb->ki_pos | count) & ip->i_mount->m_blockmask) ||
(xfs_is_always_cow_inode(ip) &&
(iov_iter_alignment(from) & ip->i_mount->m_blockmask)))
return xfs_file_dio_write_unaligned(ip, iocb, from);
return xfs_file_dio_write_aligned(ip, iocb, from);
if (xfs_is_zoned_inode(ip))
return xfs_file_dio_write_zoned(ip, iocb, from);
return xfs_file_dio_write_aligned(ip, iocb, from,
&xfs_direct_write_iomap_ops, &xfs_dio_write_ops, NULL);
}
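
The dispatch above sends a write down the unaligned path either when the file range is not block aligned, or, for always-COW inodes, when the user buffer itself is not block aligned and could be split into sub-block bios. A small sketch of those two mask checks, with an assumed 4k block mask standing in for m_blockmask:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define BLOCKMASK 4095u	/* assumed 4k blocks, i.e. m_blockmask */

/*
 * Take the unaligned path when the position/length is not block aligned, or,
 * for always-COW (zoned) inodes, when the memory segments are not block
 * aligned and the resulting bios could rewrite an already written block.
 */
static bool needs_unaligned_path(uint64_t pos, uint64_t count,
		uint64_t iov_alignment, bool always_cow)
{
	if ((pos | count) & BLOCKMASK)
		return true;
	if (always_cow && (iov_alignment & BLOCKMASK))
		return true;
	return false;
}

int main(void)
{
	/* Block-aligned file range, but a 512-byte aligned buffer on a COW inode. */
	printf("%d\n", needs_unaligned_path(8192, 4096, 512, true));	/* 1 */
	printf("%d\n", needs_unaligned_path(8192, 4096, 512, false));	/* 0 */
	return 0;
}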
static noinline ssize_t
@ -740,7 +858,7 @@ xfs_file_dax_write(
ret = xfs_ilock_iocb(iocb, iolock);
if (ret)
return ret;
ret = xfs_file_write_checks(iocb, from, &iolock);
ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
if (ret)
goto out;
@ -784,7 +902,7 @@ write_retry:
if (ret)
return ret;
ret = xfs_file_write_checks(iocb, from, &iolock);
ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
if (ret)
goto out;
@ -831,6 +949,67 @@ out:
return ret;
}
STATIC ssize_t
xfs_file_buffered_write_zoned(
struct kiocb *iocb,
struct iov_iter *from)
{
struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host);
struct xfs_mount *mp = ip->i_mount;
unsigned int iolock = XFS_IOLOCK_EXCL;
bool cleared_space = false;
struct xfs_zone_alloc_ctx ac = { };
ssize_t ret;
ret = xfs_zoned_write_space_reserve(ip, iocb, from, XFS_ZR_GREEDY, &ac);
if (ret < 0)
return ret;
ret = xfs_ilock_iocb(iocb, iolock);
if (ret)
goto out_unreserve;
ret = xfs_file_write_checks(iocb, from, &iolock, &ac);
if (ret)
goto out_unlock;
/*
* Truncate the iter to the length that we were actually able to
* allocate blocks for. This needs to happen after
* xfs_file_write_checks, because that assigns ki_pos for O_APPEND
* writes.
*/
iov_iter_truncate(from,
XFS_FSB_TO_B(mp, ac.reserved_blocks) -
(iocb->ki_pos & mp->m_blockmask));
if (!iov_iter_count(from))
goto out_unlock;
retry:
trace_xfs_file_buffered_write(iocb, from);
ret = iomap_file_buffered_write(iocb, from,
&xfs_buffered_write_iomap_ops, &ac);
if (ret == -ENOSPC && !cleared_space) {
/*
* Kick off writeback to convert delalloc space and release the
* usually too pessimistic indirect block reservations.
*/
xfs_flush_inodes(mp);
cleared_space = true;
goto retry;
}
out_unlock:
xfs_iunlock(ip, iolock);
out_unreserve:
xfs_zoned_space_unreserve(ip, &ac);
if (ret > 0) {
XFS_STATS_ADD(mp, xs_write_bytes, ret);
ret = generic_write_sync(iocb, ret);
}
return ret;
}
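
The iov_iter truncation in the buffered zoned write path limits the write to the space actually reserved, measured from the block containing ki_pos. A short sketch of that byte calculation, assuming a 4k block size (constant and helper name are illustrative):

#include <stdint.h>
#include <stdio.h>

#define FSB_SIZE 4096u	/* assumed filesystem block size */

/*
 * With 'reserved_blocks' worth of space, a buffered zoned write can cover at
 * most that many blocks starting from the block containing ki_pos, so the
 * iterator is truncated to this many bytes.
 */
static uint64_t writable_bytes(uint64_t ki_pos, uint64_t reserved_blocks)
{
	return reserved_blocks * FSB_SIZE - (ki_pos & (FSB_SIZE - 1));
}

int main(void)
{
	/* 3 reserved blocks, write starting 1000 bytes into a block: 11288 bytes. */
	printf("%llu\n", (unsigned long long)writable_bytes(4096 + 1000, 3));
	return 0;
}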
STATIC ssize_t
xfs_file_write_iter(
struct kiocb *iocb,
@ -878,6 +1057,8 @@ xfs_file_write_iter(
return ret;
}
if (xfs_is_zoned_inode(ip))
return xfs_file_buffered_write_zoned(iocb, from);
return xfs_file_buffered_write(iocb, from);
}
@ -932,7 +1113,8 @@ static int
xfs_falloc_collapse_range(
struct file *file,
loff_t offset,
loff_t len)
loff_t len,
struct xfs_zone_alloc_ctx *ac)
{
struct inode *inode = file_inode(file);
loff_t new_size = i_size_read(inode) - len;
@ -948,7 +1130,7 @@ xfs_falloc_collapse_range(
if (offset + len >= i_size_read(inode))
return -EINVAL;
error = xfs_collapse_file_space(XFS_I(inode), offset, len);
error = xfs_collapse_file_space(XFS_I(inode), offset, len, ac);
if (error)
return error;
return xfs_falloc_setsize(file, new_size);
@ -1004,7 +1186,8 @@ xfs_falloc_zero_range(
struct file *file,
int mode,
loff_t offset,
loff_t len)
loff_t len,
struct xfs_zone_alloc_ctx *ac)
{
struct inode *inode = file_inode(file);
unsigned int blksize = i_blocksize(inode);
@ -1017,7 +1200,7 @@ xfs_falloc_zero_range(
if (error)
return error;
error = xfs_free_file_space(XFS_I(inode), offset, len);
error = xfs_free_file_space(XFS_I(inode), offset, len, ac);
if (error)
return error;
@ -1088,22 +1271,18 @@ xfs_falloc_allocate_range(
FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE)
STATIC long
xfs_file_fallocate(
__xfs_file_fallocate(
struct file *file,
int mode,
loff_t offset,
loff_t len)
loff_t len,
struct xfs_zone_alloc_ctx *ac)
{
struct inode *inode = file_inode(file);
struct xfs_inode *ip = XFS_I(inode);
long error;
uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
if (!S_ISREG(inode->i_mode))
return -EINVAL;
if (mode & ~XFS_FALLOC_FL_SUPPORTED)
return -EOPNOTSUPP;
xfs_ilock(ip, iolock);
error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
if (error)
@ -1124,16 +1303,16 @@ xfs_file_fallocate(
switch (mode & FALLOC_FL_MODE_MASK) {
case FALLOC_FL_PUNCH_HOLE:
error = xfs_free_file_space(ip, offset, len);
error = xfs_free_file_space(ip, offset, len, ac);
break;
case FALLOC_FL_COLLAPSE_RANGE:
error = xfs_falloc_collapse_range(file, offset, len);
error = xfs_falloc_collapse_range(file, offset, len, ac);
break;
case FALLOC_FL_INSERT_RANGE:
error = xfs_falloc_insert_range(file, offset, len);
break;
case FALLOC_FL_ZERO_RANGE:
error = xfs_falloc_zero_range(file, mode, offset, len);
error = xfs_falloc_zero_range(file, mode, offset, len, ac);
break;
case FALLOC_FL_UNSHARE_RANGE:
error = xfs_falloc_unshare_range(file, mode, offset, len);
@ -1154,6 +1333,54 @@ out_unlock:
return error;
}
static long
xfs_file_zoned_fallocate(
struct file *file,
int mode,
loff_t offset,
loff_t len)
{
struct xfs_zone_alloc_ctx ac = { };
struct xfs_inode *ip = XFS_I(file_inode(file));
int error;
error = xfs_zoned_space_reserve(ip, 2, XFS_ZR_RESERVED, &ac);
if (error)
return error;
error = __xfs_file_fallocate(file, mode, offset, len, &ac);
xfs_zoned_space_unreserve(ip, &ac);
return error;
}
static long
xfs_file_fallocate(
struct file *file,
int mode,
loff_t offset,
loff_t len)
{
struct inode *inode = file_inode(file);
if (!S_ISREG(inode->i_mode))
return -EINVAL;
if (mode & ~XFS_FALLOC_FL_SUPPORTED)
return -EOPNOTSUPP;
/*
* For zoned file systems, zeroing the first and last block of a hole
* punch requires allocating a new block to rewrite the remaining data
* and new zeroes out of place. Get a reservation for those before
* taking the iolock. Dip into the reserved pool because we are
* expected to be able to punch a hole even on a completely full
* file system.
*/
if (xfs_is_zoned_inode(XFS_I(inode)) &&
(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE |
FALLOC_FL_COLLAPSE_RANGE)))
return xfs_file_zoned_fallocate(file, mode, offset, len);
return __xfs_file_fallocate(file, mode, offset, len, NULL);
}
STATIC int
xfs_file_fadvise(
struct file *file,
@ -1347,15 +1574,22 @@ xfs_file_release(
* blocks. This avoids open/read/close workloads from removing EOF
* blocks that other writers depend upon to reduce fragmentation.
*
* Inodes on the zoned RT device never have preallocations, so skip
* taking the locks below.
*/
if (!inode->i_nlink ||
!(file->f_mode & FMODE_WRITE) ||
(ip->i_diflags & XFS_DIFLAG_APPEND) ||
xfs_is_zoned_inode(ip))
return 0;
/*
* If we can't get the iolock just skip truncating the blocks past EOF
* because we could deadlock with the mmap_lock otherwise. We'll get
* another chance to drop them once the last reference to the inode is
* dropped, so we'll never leak blocks permanently.
*/
if (inode->i_nlink &&
(file->f_mode & FMODE_WRITE) &&
!(ip->i_diflags & XFS_DIFLAG_APPEND) &&
!xfs_iflags_test(ip, XFS_EOFBLOCKS_RELEASED) &&
if (!xfs_iflags_test(ip, XFS_EOFBLOCKS_RELEASED) &&
xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
if (xfs_can_free_eofblocks(ip) &&
!xfs_iflags_test_and_set(ip, XFS_EOFBLOCKS_RELEASED))
@ -1469,9 +1703,10 @@ xfs_dax_read_fault(
* i_lock (XFS - extent map serialisation)
*/
static vm_fault_t
xfs_write_fault(
__xfs_write_fault(
struct vm_fault *vmf,
unsigned int order)
unsigned int order,
struct xfs_zone_alloc_ctx *ac)
{
struct inode *inode = file_inode(vmf->vma->vm_file);
struct xfs_inode *ip = XFS_I(inode);
@ -1499,13 +1734,49 @@ xfs_write_fault(
ret = xfs_dax_fault_locked(vmf, order, true);
else
ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops,
NULL);
ac);
xfs_iunlock(ip, lock_mode);
sb_end_pagefault(inode->i_sb);
return ret;
}
static vm_fault_t
xfs_write_fault_zoned(
struct vm_fault *vmf,
unsigned int order)
{
struct xfs_inode *ip = XFS_I(file_inode(vmf->vma->vm_file));
unsigned int len = folio_size(page_folio(vmf->page));
struct xfs_zone_alloc_ctx ac = { };
int error;
vm_fault_t ret;
/*
* This could over-allocate as it doesn't check for truncation.
*
* But as the overallocation is limited to less than a folio and will be
* released instantly, that's just fine.
*/
error = xfs_zoned_space_reserve(ip, XFS_B_TO_FSB(ip->i_mount, len), 0,
&ac);
if (error < 0)
return vmf_fs_error(error);
ret = __xfs_write_fault(vmf, order, &ac);
xfs_zoned_space_unreserve(ip, &ac);
return ret;
}
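The reservation above sizes itself from the folio that faulted, converted from bytes to filesystem blocks. As a rough, hedged illustration of why any over-allocation is bounded by less than a folio, here is a minimal user-space sketch of a round-up byte-to-block conversion in the spirit of XFS_B_TO_FSB; the block and folio sizes are made-up example values and this is a model, not the kernel macro.

#include <stdio.h>
#include <stdint.h>

/* Round a byte count up to whole filesystem blocks (model of XFS_B_TO_FSB). */
static uint64_t bytes_to_fsb(uint64_t bytes, unsigned int blocklog)
{
	uint64_t blocksize = 1ULL << blocklog;

	return (bytes + blocksize - 1) >> blocklog;
}

int main(void)
{
	unsigned int blocklog = 12;			/* assume 4k blocks */
	uint64_t folio_bytes = 2 * 1024 * 1024;		/* assume a 2M large folio */

	/*
	 * The fault path reserves blocks for the whole folio even if part of
	 * it sits past EOF; the excess is less than one folio worth of blocks
	 * and is released again right after the fault completes.
	 */
	printf("reserve %llu blocks for a %llu byte folio\n",
	       (unsigned long long)bytes_to_fsb(folio_bytes, blocklog),
	       (unsigned long long)folio_bytes);
	return 0;
}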
static vm_fault_t
xfs_write_fault(
struct vm_fault *vmf,
unsigned int order)
{
if (xfs_is_zoned_inode(XFS_I(file_inode(vmf->vma->vm_file))))
return xfs_write_fault_zoned(vmf, order);
return __xfs_write_fault(vmf, order, NULL);
}
static inline bool
xfs_is_write_fault(
struct vm_fault *vmf)

View file

@ -879,17 +879,39 @@ xfs_getfsmap_rtdev_rmapbt(
struct xfs_mount *mp = tp->t_mountp;
struct xfs_rtgroup *rtg = NULL;
struct xfs_btree_cur *bt_cur = NULL;
xfs_daddr_t rtstart_daddr;
xfs_rtblock_t start_rtb;
xfs_rtblock_t end_rtb;
xfs_rgnumber_t start_rg, end_rg;
uint64_t eofs;
int error = 0;
eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks);
eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart + mp->m_sb.sb_rblocks);
if (keys[0].fmr_physical >= eofs)
return 0;
start_rtb = xfs_daddr_to_rtb(mp, keys[0].fmr_physical);
end_rtb = xfs_daddr_to_rtb(mp, min(eofs - 1, keys[1].fmr_physical));
rtstart_daddr = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart);
if (keys[0].fmr_physical < rtstart_daddr) {
struct xfs_fsmap_irec frec = {
.owner = XFS_RMAP_OWN_FS,
.len_daddr = rtstart_daddr,
};
/* Adjust the low key if we are continuing from where we left off. */
if (keys[0].fmr_length > 0) {
info->low_daddr = keys[0].fmr_physical + keys[0].fmr_length;
return 0;
}
/* Fabricate an rmap entry for space occupied by the data dev */
error = xfs_getfsmap_helper(tp, info, &frec);
if (error)
return error;
}
start_rtb = xfs_daddr_to_rtb(mp, rtstart_daddr + keys[0].fmr_physical);
end_rtb = xfs_daddr_to_rtb(mp, rtstart_daddr +
min(eofs - 1, keys[1].fmr_physical));
info->missing_owner = XFS_FMR_OWN_FREE;
@ -1004,22 +1026,40 @@ xfs_getfsmap_rtdev_rmapbt(
}
#endif /* CONFIG_XFS_RT */
static uint32_t
xfs_getfsmap_device(
struct xfs_mount *mp,
enum xfs_device dev)
{
if (mp->m_sb.sb_rtstart)
return dev;
switch (dev) {
case XFS_DEV_DATA:
return new_encode_dev(mp->m_ddev_targp->bt_dev);
case XFS_DEV_LOG:
return new_encode_dev(mp->m_logdev_targp->bt_dev);
case XFS_DEV_RT:
if (!mp->m_rtdev_targp)
break;
return new_encode_dev(mp->m_rtdev_targp->bt_dev);
}
return -1;
}
/* Do we recognize the device? */
STATIC bool
xfs_getfsmap_is_valid_device(
struct xfs_mount *mp,
struct xfs_fsmap *fm)
{
if (fm->fmr_device == 0 || fm->fmr_device == UINT_MAX ||
fm->fmr_device == new_encode_dev(mp->m_ddev_targp->bt_dev))
return true;
if (mp->m_logdev_targp &&
fm->fmr_device == new_encode_dev(mp->m_logdev_targp->bt_dev))
return true;
if (mp->m_rtdev_targp &&
fm->fmr_device == new_encode_dev(mp->m_rtdev_targp->bt_dev))
return true;
return false;
return fm->fmr_device == 0 ||
fm->fmr_device == UINT_MAX ||
fm->fmr_device == xfs_getfsmap_device(mp, XFS_DEV_DATA) ||
fm->fmr_device == xfs_getfsmap_device(mp, XFS_DEV_LOG) ||
(mp->m_rtdev_targp &&
fm->fmr_device == xfs_getfsmap_device(mp, XFS_DEV_RT));
}
/* Ensure that the low key is less than the high key. */
@ -1126,7 +1166,7 @@ xfs_getfsmap(
/* Set up our device handlers. */
memset(handlers, 0, sizeof(handlers));
handlers[0].nr_sectors = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
handlers[0].dev = new_encode_dev(mp->m_ddev_targp->bt_dev);
handlers[0].dev = xfs_getfsmap_device(mp, XFS_DEV_DATA);
if (use_rmap)
handlers[0].fn = xfs_getfsmap_datadev_rmapbt;
else
@ -1134,13 +1174,17 @@ xfs_getfsmap(
if (mp->m_logdev_targp != mp->m_ddev_targp) {
handlers[1].nr_sectors = XFS_FSB_TO_BB(mp,
mp->m_sb.sb_logblocks);
handlers[1].dev = new_encode_dev(mp->m_logdev_targp->bt_dev);
handlers[1].dev = xfs_getfsmap_device(mp, XFS_DEV_LOG);
handlers[1].fn = xfs_getfsmap_logdev;
}
#ifdef CONFIG_XFS_RT
if (mp->m_rtdev_targp) {
/*
* For zoned file systems there is no rtbitmap, so only support fsmap
* if the caller is privileged enough to use the full rmap version.
*/
if (mp->m_rtdev_targp && (use_rmap || !xfs_has_zoned(mp))) {
handlers[2].nr_sectors = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks);
handlers[2].dev = new_encode_dev(mp->m_rtdev_targp->bt_dev);
handlers[2].dev = xfs_getfsmap_device(mp, XFS_DEV_RT);
if (use_rmap)
handlers[2].fn = xfs_getfsmap_rtdev_rmapbt;
else
@ -1230,7 +1274,13 @@ xfs_getfsmap(
if (tp)
xfs_trans_cancel(tp);
head->fmh_oflags = FMH_OF_DEV_T;
/*
* For an internal RT device we need to report different synthetic devices
* for a single physical device, and thus can't report the actual dev_t.
*/
if (!mp->m_sb.sb_rtstart)
head->fmh_oflags = FMH_OF_DEV_T;
return error;
}

View file

@ -24,6 +24,7 @@
#include "xfs_rtalloc.h"
#include "xfs_rtrmap_btree.h"
#include "xfs_rtrefcount_btree.h"
#include "xfs_metafile.h"
/*
* Write new AG headers to disk. Non-transactional, but need to be
@ -110,7 +111,7 @@ xfs_growfs_data_private(
if (nb > mp->m_sb.sb_dblocks) {
error = xfs_buf_read_uncached(mp->m_ddev_targp,
XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL);
XFS_FSS_TO_BB(mp, 1), &bp, NULL);
if (error)
return error;
xfs_buf_relse(bp);
@ -300,24 +301,30 @@ xfs_growfs_data(
struct xfs_mount *mp,
struct xfs_growfs_data *in)
{
int error = 0;
int error;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
if (!mutex_trylock(&mp->m_growlock))
return -EWOULDBLOCK;
/* we can't grow the data section when an internal RT section exists */
if (in->newblocks != mp->m_sb.sb_dblocks && mp->m_sb.sb_rtstart) {
error = -EINVAL;
goto out_unlock;
}
/* update imaxpct separately to the physical grow of the filesystem */
if (in->imaxpct != mp->m_sb.sb_imax_pct) {
error = xfs_growfs_imaxpct(mp, in->imaxpct);
if (error)
goto out_error;
goto out_unlock;
}
if (in->newblocks != mp->m_sb.sb_dblocks) {
error = xfs_growfs_data_private(mp, in);
if (error)
goto out_error;
goto out_unlock;
}
/* Post growfs calculations needed to reflect new state in operations */
@ -331,13 +338,12 @@ xfs_growfs_data(
/* Update secondary superblocks now the physical grow has completed */
error = xfs_update_secondary_sbs(mp);
out_error:
/*
* Increment the generation unconditionally, the error could be from
* updating the secondary superblocks, in which case the new size
* is live already.
* Increment the generation unconditionally, after trying to update the
* secondary superblocks, as the new size is live already at this point.
*/
mp->m_generation++;
out_unlock:
mutex_unlock(&mp->m_growlock);
return error;
}
@ -366,6 +372,7 @@ xfs_growfs_log(
int
xfs_reserve_blocks(
struct xfs_mount *mp,
enum xfs_free_counter ctr,
uint64_t request)
{
int64_t lcounter, delta;
@ -373,6 +380,8 @@ xfs_reserve_blocks(
int64_t free;
int error = 0;
ASSERT(ctr < XC_FREE_NR);
/*
* With per-cpu counters, this becomes an interesting problem. We need
* to work out if we are freeing or allocating blocks first, then we can
@ -391,16 +400,16 @@ xfs_reserve_blocks(
* counters directly since we shouldn't have any problems unreserving
* space.
*/
if (mp->m_resblks > request) {
lcounter = mp->m_resblks_avail - request;
if (mp->m_free[ctr].res_total > request) {
lcounter = mp->m_free[ctr].res_avail - request;
if (lcounter > 0) { /* release unused blocks */
fdblks_delta = lcounter;
mp->m_resblks_avail -= lcounter;
mp->m_free[ctr].res_avail -= lcounter;
}
mp->m_resblks = request;
mp->m_free[ctr].res_total = request;
if (fdblks_delta) {
spin_unlock(&mp->m_sb_lock);
xfs_add_fdblocks(mp, fdblks_delta);
xfs_add_freecounter(mp, ctr, fdblks_delta);
spin_lock(&mp->m_sb_lock);
}
@ -409,7 +418,7 @@ xfs_reserve_blocks(
/*
* If the request is larger than the current reservation, reserve the
* blocks before we update the reserve counters. Sample m_fdblocks and
* blocks before we update the reserve counters. Sample m_free and
* perform a partial reservation if the request exceeds free space.
*
* The code below estimates how many blocks it can request from
@ -419,10 +428,10 @@ xfs_reserve_blocks(
* space to fill it because mod_fdblocks will refill an undersized
* reserve when it can.
*/
free = percpu_counter_sum(&mp->m_fdblocks) -
xfs_fdblocks_unavailable(mp);
delta = request - mp->m_resblks;
mp->m_resblks = request;
free = xfs_sum_freecounter_raw(mp, ctr) -
xfs_freecounter_unavailable(mp, ctr);
delta = request - mp->m_free[ctr].res_total;
mp->m_free[ctr].res_total = request;
if (delta > 0 && free > 0) {
/*
* We'll either succeed in getting space from the free block
@ -436,9 +445,9 @@ xfs_reserve_blocks(
*/
fdblks_delta = min(free, delta);
spin_unlock(&mp->m_sb_lock);
error = xfs_dec_fdblocks(mp, fdblks_delta, 0);
error = xfs_dec_freecounter(mp, ctr, fdblks_delta, 0);
if (!error)
xfs_add_fdblocks(mp, fdblks_delta);
xfs_add_freecounter(mp, ctr, fdblks_delta);
spin_lock(&mp->m_sb_lock);
}
out:
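For reference, a small user-space model of the reserve-pool resizing logic in xfs_reserve_blocks above: shrinking the target hands unused reserved blocks back to the free counter, while growing it pulls in at most the space that is currently free. All names here are invented for the sketch, and the locking, per-cpu batching and partial-refill retry of the real code are omitted.

#include <stdio.h>
#include <stdint.h>

struct pool {
	uint64_t free;		/* models the free-space counter */
	uint64_t res_total;	/* models m_free[ctr].res_total */
	uint64_t res_avail;	/* models m_free[ctr].res_avail */
};

/* Resize the reserve pool to @request blocks. */
static void reserve_blocks(struct pool *p, uint64_t request)
{
	if (p->res_total > request) {
		/* Shrinking: hand unused reserved blocks back to free space. */
		if (p->res_avail > request) {
			p->free += p->res_avail - request;
			p->res_avail = request;
		}
		p->res_total = request;
		return;
	}

	/* Growing: take at most what is actually free right now. */
	uint64_t delta = request - p->res_total;
	uint64_t take = delta < p->free ? delta : p->free;

	p->res_total = request;
	p->free -= take;
	p->res_avail += take;
}

int main(void)
{
	struct pool p = { .free = 1000, .res_total = 100, .res_avail = 100 };

	reserve_blocks(&p, 500);	/* grow: pulls 400 blocks from free */
	reserve_blocks(&p, 50);		/* shrink: returns 450 blocks to free */
	printf("free=%llu total=%llu avail=%llu\n",
	       (unsigned long long)p.free,
	       (unsigned long long)p.res_total,
	       (unsigned long long)p.res_avail);
	return 0;
}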
@ -558,15 +567,13 @@ xfs_fs_reserve_ag_blocks(
return error;
}
if (xfs_has_realtime(mp)) {
err2 = xfs_rt_resv_init(mp);
if (err2 && err2 != -ENOSPC) {
xfs_warn(mp,
"Error %d reserving realtime metadata reserve pool.", err2);
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
}
err2 = xfs_metafile_resv_init(mp);
if (err2 && err2 != -ENOSPC) {
xfs_warn(mp,
"Error %d reserving realtime metadata reserve pool.", err2);
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
if (err2 && !error)
if (!error)
error = err2;
}
@ -582,9 +589,7 @@ xfs_fs_unreserve_ag_blocks(
{
struct xfs_perag *pag = NULL;
if (xfs_has_realtime(mp))
xfs_rt_resv_free(mp);
xfs_metafile_resv_free(mp);
while ((pag = xfs_perag_next(mp, pag)))
xfs_ag_resv_free(pag);
}

View file

@ -8,7 +8,8 @@
int xfs_growfs_data(struct xfs_mount *mp, struct xfs_growfs_data *in);
int xfs_growfs_log(struct xfs_mount *mp, struct xfs_growfs_log *in);
int xfs_reserve_blocks(struct xfs_mount *mp, uint64_t request);
int xfs_reserve_blocks(struct xfs_mount *mp, enum xfs_free_counter cnt,
uint64_t request);
int xfs_fs_goingdown(struct xfs_mount *mp, uint32_t inflags);
int xfs_fs_reserve_ag_blocks(struct xfs_mount *mp);

View file

@ -2073,10 +2073,10 @@ xfs_inodegc_want_queue_rt_file(
{
struct xfs_mount *mp = ip->i_mount;
if (!XFS_IS_REALTIME_INODE(ip))
if (!XFS_IS_REALTIME_INODE(ip) || xfs_has_zoned(mp))
return false;
if (__percpu_counter_compare(&mp->m_frextents,
if (xfs_compare_freecounter(mp, XC_FREE_RTEXTENTS,
mp->m_low_rtexts[XFS_LOWSP_5_PCNT],
XFS_FDBLOCKS_BATCH) < 0)
return true;
@ -2104,7 +2104,7 @@ xfs_inodegc_want_queue_work(
if (items > mp->m_ino_geo.inodes_per_cluster)
return true;
if (__percpu_counter_compare(&mp->m_fdblocks,
if (xfs_compare_freecounter(mp, XC_FREE_BLOCKS,
mp->m_low_space[XFS_LOWSP_5_PCNT],
XFS_FDBLOCKS_BATCH) < 0)
return true;

View file

@ -1721,8 +1721,7 @@ xfs_ifree_cluster(
* to mark all the active inodes on the buffer stale.
*/
error = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
mp->m_bsize * igeo->blocks_per_cluster,
XBF_UNMAPPED, &bp);
mp->m_bsize * igeo->blocks_per_cluster, 0, &bp);
if (error)
return error;
@ -3074,5 +3073,6 @@ bool
xfs_is_always_cow_inode(
const struct xfs_inode *ip)
{
return ip->i_mount->m_always_cow && xfs_has_reflink(ip->i_mount);
return xfs_is_zoned_inode(ip) ||
(ip->i_mount->m_always_cow && xfs_has_reflink(ip->i_mount));
}

View file

@ -25,19 +25,9 @@ struct xfs_dquot;
typedef struct xfs_inode {
/* Inode linking and identification information. */
struct xfs_mount *i_mount; /* fs mount struct ptr */
union {
struct {
struct xfs_dquot *i_udquot; /* user dquot */
struct xfs_dquot *i_gdquot; /* group dquot */
struct xfs_dquot *i_pdquot; /* project dquot */
};
/*
* Space that has been set aside to accommodate expansions of a
* metadata btree rooted in this file.
*/
uint64_t i_meta_resv_asked;
};
struct xfs_dquot *i_udquot; /* user dquot */
struct xfs_dquot *i_gdquot; /* group dquot */
struct xfs_dquot *i_pdquot; /* project dquot */
/* Inode location stuff */
xfs_ino_t i_ino; /* inode number (agno/agino)*/
@ -69,8 +59,13 @@ typedef struct xfs_inode {
xfs_rfsblock_t i_nblocks; /* # of direct & btree blocks */
prid_t i_projid; /* owner's project id */
xfs_extlen_t i_extsize; /* basic/minimum extent size */
/* cowextsize is only used for v3 inodes, flushiter for v1/2 */
/*
* i_used_blocks is used for zoned rtrmap inodes,
* i_cowextsize is used for other v3 inodes,
* i_flushiter for v1/2 inodes
*/
union {
uint32_t i_used_blocks; /* used blocks in RTG */
xfs_extlen_t i_cowextsize; /* basic cow extent size */
uint16_t i_flushiter; /* incremented on flush */
};
@ -309,6 +304,11 @@ static inline bool xfs_is_internal_inode(const struct xfs_inode *ip)
xfs_is_quota_inode(&mp->m_sb, ip->i_ino);
}
static inline bool xfs_is_zoned_inode(const struct xfs_inode *ip)
{
return xfs_has_zoned(ip->i_mount) && XFS_IS_REALTIME_INODE(ip);
}
bool xfs_is_always_cow_inode(const struct xfs_inode *ip);
static inline bool xfs_is_cow_inode(const struct xfs_inode *ip)

View file

@ -596,6 +596,7 @@ xfs_inode_to_log_dinode(
to->di_changecount = inode_peek_iversion(inode);
to->di_crtime = xfs_inode_to_log_dinode_ts(ip, ip->i_crtime);
to->di_flags2 = ip->i_diflags2;
/* also covers the di_used_blocks union arm: */
to->di_cowextsize = ip->i_cowextsize;
to->di_ino = ip->i_ino;
to->di_lsn = lsn;

View file

@ -203,6 +203,7 @@ xfs_log_dinode_to_disk(
to->di_crtime = xfs_log_dinode_to_disk_ts(from,
from->di_crtime);
to->di_flags2 = cpu_to_be64(from->di_flags2);
/* also covers the di_used_blocks union arm: */
to->di_cowextsize = cpu_to_be32(from->di_cowextsize);
to->di_ino = cpu_to_be64(from->di_ino);
to->di_lsn = cpu_to_be64(lsn);

View file

@ -1131,15 +1131,15 @@ xfs_ioctl_getset_resblocks(
error = mnt_want_write_file(filp);
if (error)
return error;
error = xfs_reserve_blocks(mp, fsop.resblks);
error = xfs_reserve_blocks(mp, XC_FREE_BLOCKS, fsop.resblks);
mnt_drop_write_file(filp);
if (error)
return error;
}
spin_lock(&mp->m_sb_lock);
fsop.resblks = mp->m_resblks;
fsop.resblks_avail = mp->m_resblks_avail;
fsop.resblks = mp->m_free[XC_FREE_BLOCKS].res_total;
fsop.resblks_avail = mp->m_free[XC_FREE_BLOCKS].res_avail;
spin_unlock(&mp->m_sb_lock);
if (copy_to_user(arg, &fsop, sizeof(fsop)))
@ -1155,9 +1155,9 @@ xfs_ioctl_fs_counts(
struct xfs_fsop_counts out = {
.allocino = percpu_counter_read_positive(&mp->m_icount),
.freeino = percpu_counter_read_positive(&mp->m_ifree),
.freedata = percpu_counter_read_positive(&mp->m_fdblocks) -
xfs_fdblocks_unavailable(mp),
.freertx = percpu_counter_read_positive(&mp->m_frextents),
.freedata = xfs_estimate_freecounter(mp, XC_FREE_BLOCKS) -
xfs_freecounter_unavailable(mp, XC_FREE_BLOCKS),
.freertx = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS),
};
if (copy_to_user(uarg, &out, sizeof(out)))

View file

@ -30,6 +30,8 @@
#include "xfs_reflink.h"
#include "xfs_health.h"
#include "xfs_rtbitmap.h"
#include "xfs_icache.h"
#include "xfs_zone_alloc.h"
#define XFS_ALLOC_ALIGN(mp, off) \
(((off) >> mp->m_allocsize_log) << mp->m_allocsize_log)
@ -431,13 +433,14 @@ xfs_quota_calc_throttle(
static int64_t
xfs_iomap_freesp(
struct percpu_counter *counter,
struct xfs_mount *mp,
unsigned int idx,
uint64_t low_space[XFS_LOWSP_MAX],
int *shift)
{
int64_t freesp;
freesp = percpu_counter_read_positive(counter);
freesp = xfs_estimate_freecounter(mp, idx);
if (freesp < low_space[XFS_LOWSP_5_PCNT]) {
*shift = 2;
if (freesp < low_space[XFS_LOWSP_4_PCNT])
@ -536,10 +539,10 @@ xfs_iomap_prealloc_size(
if (unlikely(XFS_IS_REALTIME_INODE(ip)))
freesp = xfs_rtbxlen_to_blen(mp,
xfs_iomap_freesp(&mp->m_frextents,
xfs_iomap_freesp(mp, XC_FREE_RTEXTENTS,
mp->m_low_rtexts, &shift));
else
freesp = xfs_iomap_freesp(&mp->m_fdblocks, mp->m_low_space,
freesp = xfs_iomap_freesp(mp, XC_FREE_BLOCKS, mp->m_low_space,
&shift);
/*
@ -966,6 +969,59 @@ const struct iomap_ops xfs_direct_write_iomap_ops = {
.iomap_begin = xfs_direct_write_iomap_begin,
};
#ifdef CONFIG_XFS_RT
/*
* This is really simple. The space has already been reserved before taking the
* IOLOCK; the actual block allocation is done just before submitting the bio
* and only recorded in the extent map on I/O completion.
*/
static int
xfs_zoned_direct_write_iomap_begin(
struct inode *inode,
loff_t offset,
loff_t length,
unsigned flags,
struct iomap *iomap,
struct iomap *srcmap)
{
struct xfs_inode *ip = XFS_I(inode);
int error;
ASSERT(!(flags & IOMAP_OVERWRITE_ONLY));
/*
* IOMAP_NOWAIT needs to be pushed down into the allocator so that only
* writes into a single zone can be supported.
*/
if (flags & IOMAP_NOWAIT)
return -EAGAIN;
/*
* Ensure the extent list is in memory so that we don't have to read it
* from the I/O completion handler.
*/
if (xfs_need_iread_extents(&ip->i_df)) {
xfs_ilock(ip, XFS_ILOCK_EXCL);
error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
if (error)
return error;
}
iomap->type = IOMAP_MAPPED;
iomap->flags = IOMAP_F_DIRTY;
iomap->bdev = ip->i_mount->m_rtdev_targp->bt_bdev;
iomap->offset = offset;
iomap->length = length;
iomap->flags |= IOMAP_F_ANON_WRITE;
return 0;
}
const struct iomap_ops xfs_zoned_direct_write_iomap_ops = {
.iomap_begin = xfs_zoned_direct_write_iomap_begin,
};
#endif /* CONFIG_XFS_RT */
static int
xfs_dax_write_iomap_end(
struct inode *inode,
@ -991,6 +1047,455 @@ const struct iomap_ops xfs_dax_write_iomap_ops = {
.iomap_end = xfs_dax_write_iomap_end,
};
/*
* Convert a hole to a delayed allocation.
*/
static void
xfs_bmap_add_extent_hole_delay(
struct xfs_inode *ip, /* incore inode pointer */
int whichfork,
struct xfs_iext_cursor *icur,
struct xfs_bmbt_irec *new) /* new data to add to file extents */
{
struct xfs_ifork *ifp; /* inode fork pointer */
xfs_bmbt_irec_t left; /* left neighbor extent entry */
xfs_filblks_t newlen = 0; /* new indirect size */
xfs_filblks_t oldlen = 0; /* old indirect size */
xfs_bmbt_irec_t right; /* right neighbor extent entry */
uint32_t state = xfs_bmap_fork_to_state(whichfork);
xfs_filblks_t temp; /* temp for indirect calculations */
ifp = xfs_ifork_ptr(ip, whichfork);
ASSERT(isnullstartblock(new->br_startblock));
/*
* Check and set flags if this segment has a left neighbor
*/
if (xfs_iext_peek_prev_extent(ifp, icur, &left)) {
state |= BMAP_LEFT_VALID;
if (isnullstartblock(left.br_startblock))
state |= BMAP_LEFT_DELAY;
}
/*
* Check and set flags if the current (right) segment exists.
* If it doesn't exist, we're converting the hole at end-of-file.
*/
if (xfs_iext_get_extent(ifp, icur, &right)) {
state |= BMAP_RIGHT_VALID;
if (isnullstartblock(right.br_startblock))
state |= BMAP_RIGHT_DELAY;
}
/*
* Set contiguity flags on the left and right neighbors.
* Don't let extents get too large, even if the pieces are contiguous.
*/
if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) &&
left.br_startoff + left.br_blockcount == new->br_startoff &&
left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
state |= BMAP_LEFT_CONTIG;
if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) &&
new->br_startoff + new->br_blockcount == right.br_startoff &&
new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
(!(state & BMAP_LEFT_CONTIG) ||
(left.br_blockcount + new->br_blockcount +
right.br_blockcount <= XFS_MAX_BMBT_EXTLEN)))
state |= BMAP_RIGHT_CONTIG;
/*
* Switch out based on the contiguity flags.
*/
switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) {
case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
/*
* New allocation is contiguous with delayed allocations
* on the left and on the right.
* Merge all three into a single extent record.
*/
temp = left.br_blockcount + new->br_blockcount +
right.br_blockcount;
oldlen = startblockval(left.br_startblock) +
startblockval(new->br_startblock) +
startblockval(right.br_startblock);
newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
oldlen);
left.br_startblock = nullstartblock(newlen);
left.br_blockcount = temp;
xfs_iext_remove(ip, icur, state);
xfs_iext_prev(ifp, icur);
xfs_iext_update_extent(ip, state, icur, &left);
break;
case BMAP_LEFT_CONTIG:
/*
* New allocation is contiguous with a delayed allocation
* on the left.
* Merge the new allocation with the left neighbor.
*/
temp = left.br_blockcount + new->br_blockcount;
oldlen = startblockval(left.br_startblock) +
startblockval(new->br_startblock);
newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
oldlen);
left.br_blockcount = temp;
left.br_startblock = nullstartblock(newlen);
xfs_iext_prev(ifp, icur);
xfs_iext_update_extent(ip, state, icur, &left);
break;
case BMAP_RIGHT_CONTIG:
/*
* New allocation is contiguous with a delayed allocation
* on the right.
* Merge the new allocation with the right neighbor.
*/
temp = new->br_blockcount + right.br_blockcount;
oldlen = startblockval(new->br_startblock) +
startblockval(right.br_startblock);
newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
oldlen);
right.br_startoff = new->br_startoff;
right.br_startblock = nullstartblock(newlen);
right.br_blockcount = temp;
xfs_iext_update_extent(ip, state, icur, &right);
break;
case 0:
/*
* New allocation is not contiguous with another
* delayed allocation.
* Insert a new entry.
*/
oldlen = newlen = 0;
xfs_iext_insert(ip, icur, new, state);
break;
}
if (oldlen != newlen) {
ASSERT(oldlen > newlen);
xfs_add_fdblocks(ip->i_mount, oldlen - newlen);
/*
* Nothing to do for disk quota accounting here.
*/
xfs_mod_delalloc(ip, 0, (int64_t)newlen - oldlen);
}
}
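To make the four merge cases above easier to follow, here is a small user-space sketch that classifies a new delayed extent against its neighbours using the same left/right contiguity checks; the extent type, helper names and length cap are illustrative only, and the worst-case indirect-length rebalancing of the real function is left out.

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

#define MAX_EXTLEN	((1U << 21) - 1)	/* stand-in for XFS_MAX_BMBT_EXTLEN */

struct ext {
	uint64_t start;		/* file offset in blocks */
	uint64_t count;		/* length in blocks */
	bool	 delay;		/* delayed allocation? */
	bool	 valid;		/* neighbour exists at all? */
};

/* Decide how a new delalloc extent merges with its neighbours. */
static const char *classify(const struct ext *l, const struct ext *n,
			    const struct ext *r)
{
	bool lcontig = l->valid && l->delay &&
		l->start + l->count == n->start &&
		l->count + n->count <= MAX_EXTLEN;
	bool rcontig = r->valid && r->delay &&
		n->start + n->count == r->start &&
		n->count + r->count <= MAX_EXTLEN &&
		(!lcontig || l->count + n->count + r->count <= MAX_EXTLEN);

	if (lcontig && rcontig)
		return "merge left + new + right into one record";
	if (lcontig)
		return "extend the left neighbour";
	if (rcontig)
		return "extend the right neighbour";
	return "insert a new record";
}

int main(void)
{
	struct ext left  = { .start = 0,  .count = 8, .delay = true, .valid = true };
	struct ext new   = { .start = 8,  .count = 4, .delay = true, .valid = true };
	struct ext right = { .start = 12, .count = 8, .delay = true, .valid = true };

	printf("%s\n", classify(&left, &new, &right));
	return 0;
}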
/*
* Add a delayed allocation extent to an inode. Blocks are reserved from the
* global pool and the extent inserted into the inode in-core extent tree.
*
* On entry, got refers to the first extent beyond the offset of the extent to
* allocate or eof is specified if no such extent exists. On return, got refers
* to the extent record that was inserted to the inode fork.
*
* Note that the allocated extent may have been merged with contiguous extents
* during insertion into the inode fork. Thus, got does not reflect the current
* state of the inode fork on return. If necessary, the caller can use lastx to
* look up the updated record in the inode fork.
*/
static int
xfs_bmapi_reserve_delalloc(
struct xfs_inode *ip,
int whichfork,
xfs_fileoff_t off,
xfs_filblks_t len,
xfs_filblks_t prealloc,
struct xfs_bmbt_irec *got,
struct xfs_iext_cursor *icur,
int eof)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
xfs_extlen_t alen;
xfs_extlen_t indlen;
uint64_t fdblocks;
int error;
xfs_fileoff_t aoff;
bool use_cowextszhint =
whichfork == XFS_COW_FORK && !prealloc;
retry:
/*
* Cap the alloc length. Keep track of prealloc so we know whether to
* tag the inode before we return.
*/
aoff = off;
alen = XFS_FILBLKS_MIN(len + prealloc, XFS_MAX_BMBT_EXTLEN);
if (!eof)
alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff);
if (prealloc && alen >= len)
prealloc = alen - len;
/*
* If we're targeting the COW fork but aren't creating a speculative
* posteof preallocation, try to expand the reservation to align with
* the COW extent size hint if there's sufficient free space.
*
* Unlike the data fork, the CoW cancellation functions will free all
* the reservations at inactivation, so we don't require that every
* delalloc reservation have a dirty pagecache.
*/
if (use_cowextszhint) {
struct xfs_bmbt_irec prev;
xfs_extlen_t extsz = xfs_get_cowextsz_hint(ip);
if (!xfs_iext_peek_prev_extent(ifp, icur, &prev))
prev.br_startoff = NULLFILEOFF;
error = xfs_bmap_extsize_align(mp, got, &prev, extsz, 0, eof,
1, 0, &aoff, &alen);
ASSERT(!error);
}
/*
* Make a transaction-less quota reservation for delayed allocation
* blocks. This number gets adjusted later. We return if we haven't
* allocated blocks already inside this loop.
*/
error = xfs_quota_reserve_blkres(ip, alen);
if (error)
goto out;
/*
* Split changing sb for alen and indlen since they could be coming
* from different places.
*/
indlen = (xfs_extlen_t)xfs_bmap_worst_indlen(ip, alen);
ASSERT(indlen > 0);
fdblocks = indlen;
if (XFS_IS_REALTIME_INODE(ip)) {
ASSERT(!xfs_is_zoned_inode(ip));
error = xfs_dec_frextents(mp, xfs_blen_to_rtbxlen(mp, alen));
if (error)
goto out_unreserve_quota;
} else {
fdblocks += alen;
}
error = xfs_dec_fdblocks(mp, fdblocks, false);
if (error)
goto out_unreserve_frextents;
ip->i_delayed_blks += alen;
xfs_mod_delalloc(ip, alen, indlen);
got->br_startoff = aoff;
got->br_startblock = nullstartblock(indlen);
got->br_blockcount = alen;
got->br_state = XFS_EXT_NORM;
xfs_bmap_add_extent_hole_delay(ip, whichfork, icur, got);
/*
* Tag the inode if blocks were preallocated. Note that COW fork
* preallocation can occur at the start or end of the extent, even when
* prealloc == 0, so we must also check the aligned offset and length.
*/
if (whichfork == XFS_DATA_FORK && prealloc)
xfs_inode_set_eofblocks_tag(ip);
if (whichfork == XFS_COW_FORK && (prealloc || aoff < off || alen > len))
xfs_inode_set_cowblocks_tag(ip);
return 0;
out_unreserve_frextents:
if (XFS_IS_REALTIME_INODE(ip))
xfs_add_frextents(mp, xfs_blen_to_rtbxlen(mp, alen));
out_unreserve_quota:
if (XFS_IS_QUOTA_ON(mp))
xfs_quota_unreserve_blkres(ip, alen);
out:
if (error == -ENOSPC || error == -EDQUOT) {
trace_xfs_delalloc_enospc(ip, off, len);
if (prealloc || use_cowextszhint) {
/* retry without any preallocation */
use_cowextszhint = false;
prealloc = 0;
goto retry;
}
}
return error;
}
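The error handling above reserves quota, then rt extents or data blocks, and releases whatever was taken in reverse order on failure. Below is a minimal user-space sketch of that goto-unwind pattern, with invented counters and a simplified rule: it always reserves all three, whereas the real code takes either rt extents or data blocks depending on the inode.

#include <stdio.h>
#include <stdint.h>
#include <errno.h>

static uint64_t quota = 100, rtextents = 50, fdblocks = 200;

static int take(uint64_t *ctr, uint64_t n)
{
	if (*ctr < n)
		return -ENOSPC;
	*ctr -= n;
	return 0;
}

/*
 * Reserve quota, rt extents and data blocks in order; on failure, undo
 * everything taken so far in reverse order, mirroring the goto-unwind
 * pattern used above.
 */
static int reserve(uint64_t qblocks, uint64_t rtx, uint64_t dblocks)
{
	int error;

	error = take(&quota, qblocks);
	if (error)
		return error;

	error = take(&rtextents, rtx);
	if (error)
		goto out_unreserve_quota;

	error = take(&fdblocks, dblocks);
	if (error)
		goto out_unreserve_rtextents;

	return 0;

out_unreserve_rtextents:
	rtextents += rtx;
out_unreserve_quota:
	quota += qblocks;
	return error;
}

int main(void)
{
	printf("reserve -> %d\n", reserve(10, 5, 500));	/* fails, fully unwound */
	printf("quota=%llu rtx=%llu blocks=%llu\n",
	       (unsigned long long)quota, (unsigned long long)rtextents,
	       (unsigned long long)fdblocks);
	return 0;
}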
static int
xfs_zoned_buffered_write_iomap_begin(
struct inode *inode,
loff_t offset,
loff_t count,
unsigned flags,
struct iomap *iomap,
struct iomap *srcmap)
{
struct iomap_iter *iter =
container_of(iomap, struct iomap_iter, iomap);
struct xfs_zone_alloc_ctx *ac = iter->private;
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, count);
u16 iomap_flags = IOMAP_F_SHARED;
unsigned int lockmode = XFS_ILOCK_EXCL;
xfs_filblks_t count_fsb;
xfs_extlen_t indlen;
struct xfs_bmbt_irec got;
struct xfs_iext_cursor icur;
int error = 0;
ASSERT(!xfs_get_extsz_hint(ip));
ASSERT(!(flags & IOMAP_UNSHARE));
ASSERT(ac);
if (xfs_is_shutdown(mp))
return -EIO;
error = xfs_qm_dqattach(ip);
if (error)
return error;
error = xfs_ilock_for_iomap(ip, flags, &lockmode);
if (error)
return error;
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) ||
XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
xfs_bmap_mark_sick(ip, XFS_DATA_FORK);
error = -EFSCORRUPTED;
goto out_unlock;
}
XFS_STATS_INC(mp, xs_blk_mapw);
error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
if (error)
goto out_unlock;
/*
* For zeroing operations check if there is any data to zero first.
*
* For regular writes we always need to allocate new blocks, but need to
* provide the source mapping when the range is unaligned to support
* read-modify-write of the whole block in the page cache.
*
* In either case we need to limit the reported range to the boundaries
* of the source map in the data fork.
*/
if (!IS_ALIGNED(offset, mp->m_sb.sb_blocksize) ||
!IS_ALIGNED(offset + count, mp->m_sb.sb_blocksize) ||
(flags & IOMAP_ZERO)) {
struct xfs_bmbt_irec smap;
struct xfs_iext_cursor scur;
if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &scur,
&smap))
smap.br_startoff = end_fsb; /* fake hole until EOF */
if (smap.br_startoff > offset_fsb) {
/*
* We never need to allocate blocks for zeroing a hole.
*/
if (flags & IOMAP_ZERO) {
xfs_hole_to_iomap(ip, iomap, offset_fsb,
smap.br_startoff);
goto out_unlock;
}
end_fsb = min(end_fsb, smap.br_startoff);
} else {
end_fsb = min(end_fsb,
smap.br_startoff + smap.br_blockcount);
xfs_trim_extent(&smap, offset_fsb,
end_fsb - offset_fsb);
error = xfs_bmbt_to_iomap(ip, srcmap, &smap, flags, 0,
xfs_iomap_inode_sequence(ip, 0));
if (error)
goto out_unlock;
}
}
if (!ip->i_cowfp)
xfs_ifork_init_cow(ip);
if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got))
got.br_startoff = end_fsb;
if (got.br_startoff <= offset_fsb) {
trace_xfs_reflink_cow_found(ip, &got);
goto done;
}
/*
* Cap the maximum length to keep the chunks of work done here somewhat
* symmetric with the work writeback does.
*/
end_fsb = min(end_fsb, got.br_startoff);
count_fsb = min3(end_fsb - offset_fsb, XFS_MAX_BMBT_EXTLEN,
XFS_B_TO_FSB(mp, 1024 * PAGE_SIZE));
/*
* The block reservation is supposed to cover all blocks that the
* operation could possibly write, but there is a nasty corner case
* where blocks could be stolen from underneath us:
*
* 1) while this thread iterates over a larger buffered write,
* 2) another thread is causing a write fault that calls into
* ->page_mkwrite in the range this thread writes to, using up the
* delalloc reservation created by a previous call to this function.
* 3) another thread does direct I/O on the range that the write fault
* happened on, which causes writeback of the dirty data.
* 4) this then sets the stale flag, which cuts the current iomap
* iteration short, causing the new call to ->iomap_begin that gets
* us here again, but now without a sufficient reservation.
*
* This is a very unusual I/O pattern, and nothing but generic/095 is
* known to hit it. There's not really much we can do here, so turn this
* into a short write.
*/
if (count_fsb > ac->reserved_blocks) {
xfs_warn_ratelimited(mp,
"Short write on ino 0x%llx comm %.20s due to three-way race with write fault and direct I/O",
ip->i_ino, current->comm);
count_fsb = ac->reserved_blocks;
if (!count_fsb) {
error = -EIO;
goto out_unlock;
}
}
error = xfs_quota_reserve_blkres(ip, count_fsb);
if (error)
goto out_unlock;
indlen = xfs_bmap_worst_indlen(ip, count_fsb);
error = xfs_dec_fdblocks(mp, indlen, false);
if (error)
goto out_unlock;
ip->i_delayed_blks += count_fsb;
xfs_mod_delalloc(ip, count_fsb, indlen);
got.br_startoff = offset_fsb;
got.br_startblock = nullstartblock(indlen);
got.br_blockcount = count_fsb;
got.br_state = XFS_EXT_NORM;
xfs_bmap_add_extent_hole_delay(ip, XFS_COW_FORK, &icur, &got);
ac->reserved_blocks -= count_fsb;
iomap_flags |= IOMAP_F_NEW;
trace_xfs_iomap_alloc(ip, offset, XFS_FSB_TO_B(mp, count_fsb),
XFS_COW_FORK, &got);
done:
error = xfs_bmbt_to_iomap(ip, iomap, &got, flags, iomap_flags,
xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED));
out_unlock:
xfs_iunlock(ip, lockmode);
return error;
}
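As a small, hedged illustration of how the per-operation reservation context limits each iteration above, this user-space sketch caps the mapped length to the blocks still reserved and consumes them; the structure stands in for struct xfs_zone_alloc_ctx and the -EIO/short-write distinction is simplified away.

#include <stdio.h>
#include <stdint.h>

struct alloc_ctx {
	uint64_t reserved_blocks;	/* models ac->reserved_blocks */
};

/*
 * Map @want blocks for one iomap iteration, never exceeding what the
 * caller reserved up front.  A return of 0 means the reservation is
 * exhausted (the real code turns that into an error).
 */
static uint64_t map_one_iteration(struct alloc_ctx *ac, uint64_t want)
{
	uint64_t count = want < ac->reserved_blocks ? want : ac->reserved_blocks;

	ac->reserved_blocks -= count;
	return count;
}

int main(void)
{
	struct alloc_ctx ac = { .reserved_blocks = 16 };

	printf("mapped %llu\n", (unsigned long long)map_one_iteration(&ac, 10));
	printf("mapped %llu\n", (unsigned long long)map_one_iteration(&ac, 10));
	printf("mapped %llu\n", (unsigned long long)map_one_iteration(&ac, 10));
	return 0;
}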
static int
xfs_buffered_write_iomap_begin(
struct inode *inode,
@ -1017,6 +1522,10 @@ xfs_buffered_write_iomap_begin(
if (xfs_is_shutdown(mp))
return -EIO;
if (xfs_is_zoned_inode(ip))
return xfs_zoned_buffered_write_iomap_begin(inode, offset,
count, flags, iomap, srcmap);
/* we can't use delayed allocations when using extent size hints */
if (xfs_get_extsz_hint(ip))
return xfs_direct_write_iomap_begin(inode, offset, count,
@ -1249,10 +1758,13 @@ xfs_buffered_write_delalloc_punch(
loff_t length,
struct iomap *iomap)
{
struct iomap_iter *iter =
container_of(iomap, struct iomap_iter, iomap);
xfs_bmap_punch_delalloc_range(XFS_I(inode),
(iomap->flags & IOMAP_F_SHARED) ?
XFS_COW_FORK : XFS_DATA_FORK,
offset, offset + length);
offset, offset + length, iter->private);
}
static int
@ -1489,6 +2001,7 @@ xfs_zero_range(
struct xfs_inode *ip,
loff_t pos,
loff_t len,
struct xfs_zone_alloc_ctx *ac,
bool *did_zero)
{
struct inode *inode = VFS_I(ip);
@ -1499,13 +2012,14 @@ xfs_zero_range(
return dax_zero_range(inode, pos, len, did_zero,
&xfs_dax_write_iomap_ops);
return iomap_zero_range(inode, pos, len, did_zero,
&xfs_buffered_write_iomap_ops, NULL);
&xfs_buffered_write_iomap_ops, ac);
}
int
xfs_truncate_page(
struct xfs_inode *ip,
loff_t pos,
struct xfs_zone_alloc_ctx *ac,
bool *did_zero)
{
struct inode *inode = VFS_I(ip);
@ -1514,5 +2028,5 @@ xfs_truncate_page(
return dax_truncate_page(inode, pos, did_zero,
&xfs_dax_write_iomap_ops);
return iomap_truncate_page(inode, pos, did_zero,
&xfs_buffered_write_iomap_ops, NULL);
&xfs_buffered_write_iomap_ops, ac);
}

View file

@ -10,6 +10,7 @@
struct xfs_inode;
struct xfs_bmbt_irec;
struct xfs_zone_alloc_ctx;
int xfs_iomap_write_direct(struct xfs_inode *ip, xfs_fileoff_t offset_fsb,
xfs_fileoff_t count_fsb, unsigned int flags,
@ -24,8 +25,9 @@ int xfs_bmbt_to_iomap(struct xfs_inode *ip, struct iomap *iomap,
u16 iomap_flags, u64 sequence_cookie);
int xfs_zero_range(struct xfs_inode *ip, loff_t pos, loff_t len,
bool *did_zero);
int xfs_truncate_page(struct xfs_inode *ip, loff_t pos, bool *did_zero);
struct xfs_zone_alloc_ctx *ac, bool *did_zero);
int xfs_truncate_page(struct xfs_inode *ip, loff_t pos,
struct xfs_zone_alloc_ctx *ac, bool *did_zero);
static inline xfs_filblks_t
xfs_aligned_fsb_count(
@ -49,6 +51,7 @@ xfs_aligned_fsb_count(
extern const struct iomap_ops xfs_buffered_write_iomap_ops;
extern const struct iomap_ops xfs_direct_write_iomap_ops;
extern const struct iomap_ops xfs_zoned_direct_write_iomap_ops;
extern const struct iomap_ops xfs_read_iomap_ops;
extern const struct iomap_ops xfs_seek_iomap_ops;
extern const struct iomap_ops xfs_xattr_iomap_ops;

View file

@ -29,6 +29,7 @@
#include "xfs_xattr.h"
#include "xfs_file.h"
#include "xfs_bmap.h"
#include "xfs_zone_alloc.h"
#include <linux/posix_acl.h>
#include <linux/security.h>
@ -854,6 +855,7 @@ xfs_setattr_size(
uint lock_flags = 0;
uint resblks = 0;
bool did_zeroing = false;
struct xfs_zone_alloc_ctx ac = { };
xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
ASSERT(S_ISREG(inode->i_mode));
@ -889,6 +891,28 @@ xfs_setattr_size(
*/
inode_dio_wait(inode);
/*
* Normally xfs_zoned_space_reserve is supposed to be called outside the
* IOLOCK. For truncate we can't do that since ->setattr is called with
* it already held by the VFS. So for now chicken out and try to
* allocate space under it.
*
* To avoid deadlocks this means we can't block waiting for space, which
* can lead to spurious -ENOSPC if there are no directly available
* blocks. We mitigate this a bit by allowing zeroing to dip into the
* reserved pool, but eventually the VFS calling convention needs to
* change.
*/
if (xfs_is_zoned_inode(ip)) {
error = xfs_zoned_space_reserve(ip, 1,
XFS_ZR_NOWAIT | XFS_ZR_RESERVED, &ac);
if (error) {
if (error == -EAGAIN)
return -ENOSPC;
return error;
}
}
/*
* File data changes must be complete before we start the transaction to
* modify the inode. This needs to be done before joining the inode to
@ -902,11 +926,14 @@ xfs_setattr_size(
if (newsize > oldsize) {
trace_xfs_zero_eof(ip, oldsize, newsize - oldsize);
error = xfs_zero_range(ip, oldsize, newsize - oldsize,
&did_zeroing);
&ac, &did_zeroing);
} else {
error = xfs_truncate_page(ip, newsize, &did_zeroing);
error = xfs_truncate_page(ip, newsize, &ac, &did_zeroing);
}
if (xfs_is_zoned_inode(ip))
xfs_zoned_space_unreserve(ip, &ac);
if (error)
return error;

View file

@ -20,6 +20,7 @@
#include "xfs_sysfs.h"
#include "xfs_sb.h"
#include "xfs_health.h"
#include "xfs_zone_alloc.h"
struct kmem_cache *xfs_log_ticket_cache;
@ -3540,6 +3541,9 @@ xlog_force_shutdown(
spin_unlock(&log->l_icloglock);
wake_up_var(&log->l_opstate);
if (IS_ENABLED(CONFIG_XFS_RT) && xfs_has_zoned(log->l_mp))
xfs_zoned_wake_all(log->l_mp);
return log_error;
}

View file

@ -173,6 +173,10 @@ xfs_warn_experimental(
.opstate = XFS_OPSTATE_WARNED_METADIR,
.name = "metadata directory tree",
},
[XFS_EXPERIMENTAL_ZONED] = {
.opstate = XFS_OPSTATE_WARNED_ZONED,
.name = "zoned RT device",
},
};
ASSERT(feat >= 0 && feat < XFS_EXPERIMENTAL_MAX);
BUILD_BUG_ON(ARRAY_SIZE(features) != XFS_EXPERIMENTAL_MAX);

View file

@ -99,6 +99,7 @@ enum xfs_experimental_feat {
XFS_EXPERIMENTAL_EXCHRANGE,
XFS_EXPERIMENTAL_PPTR,
XFS_EXPERIMENTAL_METADIR,
XFS_EXPERIMENTAL_ZONED,
XFS_EXPERIMENTAL_MAX,
};

View file

@ -40,6 +40,7 @@
#include "xfs_rtrmap_btree.h"
#include "xfs_rtrefcount_btree.h"
#include "scrub/stats.h"
#include "xfs_zone_alloc.h"
static DEFINE_MUTEX(xfs_uuid_table_mutex);
static int xfs_uuid_table_size;
@ -185,7 +186,7 @@ xfs_readsb(
*/
reread:
error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR,
BTOBB(sector_size), 0, &bp, buf_ops);
BTOBB(sector_size), &bp, buf_ops);
if (error) {
if (loud)
xfs_warn(mp, "SB validate failed with error %d.", error);
@ -413,7 +414,7 @@ xfs_check_sizes(
}
error = xfs_buf_read_uncached(mp->m_ddev_targp,
d - XFS_FSS_TO_BB(mp, 1),
XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL);
XFS_FSS_TO_BB(mp, 1), &bp, NULL);
if (error) {
xfs_warn(mp, "last sector read failed");
return error;
@ -430,7 +431,7 @@ xfs_check_sizes(
}
error = xfs_buf_read_uncached(mp->m_logdev_targp,
d - XFS_FSB_TO_BB(mp, 1),
XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL);
XFS_FSB_TO_BB(mp, 1), &bp, NULL);
if (error) {
xfs_warn(mp, "log device read failed");
return error;
@ -461,22 +462,38 @@ xfs_mount_reset_sbqflags(
return xfs_sync_sb(mp, false);
}
uint64_t
xfs_default_resblks(xfs_mount_t *mp)
{
uint64_t resblks;
static const char *const xfs_free_pool_name[] = {
[XC_FREE_BLOCKS] = "free blocks",
[XC_FREE_RTEXTENTS] = "free rt extents",
[XC_FREE_RTAVAILABLE] = "available rt extents",
};
/*
* We default to 5% or 8192 fsbs of space reserved, whichever is
* smaller. This is intended to cover concurrent allocation
* transactions when we initially hit enospc. These each require a 4
* block reservation. Hence by default we cover roughly 2000 concurrent
* allocation reservations.
*/
resblks = mp->m_sb.sb_dblocks;
do_div(resblks, 20);
resblks = min_t(uint64_t, resblks, 8192);
return resblks;
uint64_t
xfs_default_resblks(
struct xfs_mount *mp,
enum xfs_free_counter ctr)
{
switch (ctr) {
case XC_FREE_BLOCKS:
/*
* Default to 5% or 8192 FSBs of space reserved, whichever is
* smaller.
*
* This is intended to cover concurrent allocation transactions
* when we initially hit ENOSPC. These each require a 4 block
* reservation. Hence by default we cover roughly 2000
* concurrent allocation reservations.
*/
return min(div_u64(mp->m_sb.sb_dblocks, 20), 8192ULL);
case XC_FREE_RTEXTENTS:
case XC_FREE_RTAVAILABLE:
if (IS_ENABLED(CONFIG_XFS_RT) && xfs_has_zoned(mp))
return xfs_zoned_default_resblks(mp, ctr);
return 0;
default:
ASSERT(0);
return 0;
}
}
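A quick sanity check of the XC_FREE_BLOCKS default above, as a stand-alone user-space snippet: for any data device larger than about 160k blocks the 8192-block cap wins, and only small filesystems get the 5% figure. The sample sizes below are arbitrary.

#include <stdio.h>
#include <stdint.h>

/* 5% of the data device or 8192 blocks, whichever is smaller. */
static uint64_t default_resblks(uint64_t dblocks)
{
	uint64_t five_pct = dblocks / 20;

	return five_pct < 8192 ? five_pct : 8192;
}

int main(void)
{
	/* ~100k blocks: small fs, 5% (5000 blocks) is the limit */
	printf("%llu\n", (unsigned long long)default_resblks(100000));
	/* 256M blocks (1 TiB of 4k blocks): capped at 8192 */
	printf("%llu\n", (unsigned long long)default_resblks(268435456ULL));
	return 0;
}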
/* Ensure the summary counts are correct. */
@ -543,7 +560,7 @@ xfs_check_summary_counts(
* If we're mounting the rt volume after recovering the log, recompute
* frextents from the rtbitmap file to fix the inconsistency.
*/
if (xfs_has_realtime(mp) && !xfs_is_clean(mp)) {
if (xfs_has_realtime(mp) && !xfs_has_zoned(mp) && !xfs_is_clean(mp)) {
error = xfs_rtalloc_reinit_frextents(mp);
if (error)
return error;
@ -678,6 +695,7 @@ xfs_mountfs(
uint quotamount = 0;
uint quotaflags = 0;
int error = 0;
int i;
xfs_sb_mount_common(mp, sbp);
@ -747,27 +765,15 @@ xfs_mountfs(
/* enable fail_at_unmount as default */
mp->m_fail_unmount = true;
super_set_sysfs_name_id(mp->m_super);
error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype,
NULL, mp->m_super->s_id);
if (error)
goto out;
error = xfs_sysfs_init(&mp->m_stats.xs_kobj, &xfs_stats_ktype,
&mp->m_kobj, "stats");
if (error)
goto out_remove_sysfs;
xchk_stats_register(mp->m_scrub_stats, mp->m_debugfs);
error = xfs_error_sysfs_init(mp);
error = xfs_mount_sysfs_init(mp);
if (error)
goto out_remove_scrub_stats;
xchk_stats_register(mp->m_scrub_stats, mp->m_debugfs);
error = xfs_errortag_init(mp);
if (error)
goto out_remove_error_sysfs;
goto out_remove_sysfs;
error = xfs_uuid_mount(mp);
if (error)
@ -1031,6 +1037,12 @@ xfs_mountfs(
if (xfs_is_readonly(mp) && !xfs_has_norecovery(mp))
xfs_log_clean(mp);
if (xfs_has_zoned(mp)) {
error = xfs_mount_zones(mp);
if (error)
goto out_rtunmount;
}
/*
* Complete the quota initialisation, post-log-replay component.
*/
@ -1046,22 +1058,28 @@ xfs_mountfs(
* privileged transactions. This is needed so that transaction
* space required for critical operations can dip into this pool
* when at ENOSPC. This is needed for operations like create with
* attr, unwritten extent conversion at ENOSPC, etc. Data allocations
* are not allowed to use this reserved space.
* attr, unwritten extent conversion at ENOSPC, garbage collection
* etc. Data allocations are not allowed to use this reserved space.
*
* This may drive us straight to ENOSPC on mount, but that implies
* we were already there on the last unmount. Warn if this occurs.
*/
if (!xfs_is_readonly(mp)) {
error = xfs_reserve_blocks(mp, xfs_default_resblks(mp));
if (error)
xfs_warn(mp,
"Unable to allocate reserve blocks. Continuing without reserve pool.");
for (i = 0; i < XC_FREE_NR; i++) {
error = xfs_reserve_blocks(mp, i,
xfs_default_resblks(mp, i));
if (error)
xfs_warn(mp,
"Unable to allocate reserve blocks. Continuing without reserve pool for %s.",
xfs_free_pool_name[i]);
}
/* Reserve AG blocks for future btree expansion. */
error = xfs_fs_reserve_ag_blocks(mp);
if (error && error != -ENOSPC)
goto out_agresv;
xfs_zone_gc_start(mp);
}
return 0;
@ -1069,6 +1087,8 @@ xfs_mountfs(
out_agresv:
xfs_fs_unreserve_ag_blocks(mp);
xfs_qm_unmount_quotas(mp);
if (xfs_has_zoned(mp))
xfs_unmount_zones(mp);
out_rtunmount:
xfs_rtunmount_inodes(mp);
out_rele_rip:
@ -1116,13 +1136,10 @@ xfs_mountfs(
xfs_uuid_unmount(mp);
out_remove_errortag:
xfs_errortag_del(mp);
out_remove_error_sysfs:
xfs_error_sysfs_del(mp);
out_remove_sysfs:
xfs_mount_sysfs_del(mp);
out_remove_scrub_stats:
xchk_stats_unregister(mp->m_scrub_stats);
xfs_sysfs_del(&mp->m_stats.xs_kobj);
out_remove_sysfs:
xfs_sysfs_del(&mp->m_kobj);
out:
return error;
}
@ -1148,8 +1165,12 @@ xfs_unmountfs(
xfs_inodegc_flush(mp);
xfs_blockgc_stop(mp);
if (!test_bit(XFS_OPSTATE_READONLY, &mp->m_opstate))
xfs_zone_gc_stop(mp);
xfs_fs_unreserve_ag_blocks(mp);
xfs_qm_unmount_quotas(mp);
if (xfs_has_zoned(mp))
xfs_unmount_zones(mp);
xfs_rtunmount_inodes(mp);
xfs_irele(mp->m_rootip);
if (mp->m_metadirip)
@ -1173,7 +1194,7 @@ xfs_unmountfs(
* we only ever apply deltas to the superblock and hence the incore
* value does not matter....
*/
error = xfs_reserve_blocks(mp, 0);
error = xfs_reserve_blocks(mp, XC_FREE_BLOCKS, 0);
if (error)
xfs_warn(mp, "Unable to free reserved block pool. "
"Freespace may not be correct on next mount.");
@ -1195,10 +1216,8 @@ xfs_unmountfs(
xfs_free_rtgroups(mp, 0, mp->m_sb.sb_rgcount);
xfs_free_perag_range(mp, 0, mp->m_sb.sb_agcount);
xfs_errortag_del(mp);
xfs_error_sysfs_del(mp);
xchk_stats_unregister(mp->m_scrub_stats);
xfs_sysfs_del(&mp->m_stats.xs_kobj);
xfs_sysfs_del(&mp->m_kobj);
xfs_mount_sysfs_del(mp);
}
/*
@ -1220,52 +1239,67 @@ xfs_fs_writable(
return true;
}
/*
* Estimate the amount of free space that is not available to userspace and is
* not explicitly reserved from the incore fdblocks. This includes:
*
* - The minimum number of blocks needed to support splitting a bmap btree
* - The blocks currently in use by the freespace btrees because they record
* the actual blocks that will fill per-AG metadata space reservations
*/
uint64_t
xfs_freecounter_unavailable(
struct xfs_mount *mp,
enum xfs_free_counter ctr)
{
if (ctr != XC_FREE_BLOCKS)
return 0;
return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks);
}
void
xfs_add_freecounter(
struct xfs_mount *mp,
struct percpu_counter *counter,
enum xfs_free_counter ctr,
uint64_t delta)
{
bool has_resv_pool = (counter == &mp->m_fdblocks);
struct xfs_freecounter *counter = &mp->m_free[ctr];
uint64_t res_used;
/*
* If the reserve pool is depleted, put blocks back into it first.
* Most of the time the pool is full.
*/
if (!has_resv_pool || mp->m_resblks == mp->m_resblks_avail) {
percpu_counter_add(counter, delta);
if (likely(counter->res_avail == counter->res_total)) {
percpu_counter_add(&counter->count, delta);
return;
}
spin_lock(&mp->m_sb_lock);
res_used = mp->m_resblks - mp->m_resblks_avail;
res_used = counter->res_total - counter->res_avail;
if (res_used > delta) {
mp->m_resblks_avail += delta;
counter->res_avail += delta;
} else {
delta -= res_used;
mp->m_resblks_avail = mp->m_resblks;
percpu_counter_add(counter, delta);
counter->res_avail = counter->res_total;
percpu_counter_add(&counter->count, delta);
}
spin_unlock(&mp->m_sb_lock);
}
/* Adjust in-core free blocks or RT extents. */
int
xfs_dec_freecounter(
struct xfs_mount *mp,
struct percpu_counter *counter,
enum xfs_free_counter ctr,
uint64_t delta,
bool rsvd)
{
int64_t lcounter;
uint64_t set_aside = 0;
struct xfs_freecounter *counter = &mp->m_free[ctr];
s32 batch;
bool has_resv_pool;
ASSERT(counter == &mp->m_fdblocks || counter == &mp->m_frextents);
has_resv_pool = (counter == &mp->m_fdblocks);
if (rsvd)
ASSERT(has_resv_pool);
ASSERT(ctr < XC_FREE_NR);
/*
* When taking blocks away, we need to be more accurate the closer we
@ -1275,7 +1309,7 @@ xfs_dec_freecounter(
* then make everything serialise as we are real close to
* ENOSPC.
*/
if (__percpu_counter_compare(counter, 2 * XFS_FDBLOCKS_BATCH,
if (__percpu_counter_compare(&counter->count, 2 * XFS_FDBLOCKS_BATCH,
XFS_FDBLOCKS_BATCH) < 0)
batch = 1;
else
@ -1292,34 +1326,34 @@ xfs_dec_freecounter(
* problems (i.e. transaction abort, pagecache discards, etc.) than
* slightly premature -ENOSPC.
*/
if (has_resv_pool)
set_aside = xfs_fdblocks_unavailable(mp);
percpu_counter_add_batch(counter, -((int64_t)delta), batch);
if (__percpu_counter_compare(counter, set_aside,
XFS_FDBLOCKS_BATCH) >= 0) {
/* we had space! */
return 0;
}
/*
* lock up the sb for dipping into reserves before releasing the space
* that took us to ENOSPC.
*/
spin_lock(&mp->m_sb_lock);
percpu_counter_add(counter, delta);
if (!has_resv_pool || !rsvd)
goto fdblocks_enospc;
lcounter = (long long)mp->m_resblks_avail - delta;
if (lcounter >= 0) {
mp->m_resblks_avail = lcounter;
spin_unlock(&mp->m_sb_lock);
return 0;
}
xfs_warn_once(mp,
percpu_counter_add_batch(&counter->count, -((int64_t)delta), batch);
if (__percpu_counter_compare(&counter->count,
xfs_freecounter_unavailable(mp, ctr),
XFS_FDBLOCKS_BATCH) < 0) {
/*
* Lock up the sb for dipping into reserves before releasing the
* space that took us to ENOSPC.
*/
spin_lock(&mp->m_sb_lock);
percpu_counter_add(&counter->count, delta);
if (!rsvd)
goto fdblocks_enospc;
if (delta > counter->res_avail) {
if (ctr == XC_FREE_BLOCKS)
xfs_warn_once(mp,
"Reserve blocks depleted! Consider increasing reserve pool size.");
goto fdblocks_enospc;
}
counter->res_avail -= delta;
trace_xfs_freecounter_reserved(mp, ctr, delta, _RET_IP_);
spin_unlock(&mp->m_sb_lock);
}
/* we had space! */
return 0;
fdblocks_enospc:
trace_xfs_freecounter_enospc(mp, ctr, delta, _RET_IP_);
spin_unlock(&mp->m_sb_lock);
return -ENOSPC;
}
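For a compact view of how xfs_add_freecounter and xfs_dec_freecounter cooperate, here is a user-space model of a free counter with a reserve pool: additions refill a depleted reserve before touching the free counter, and privileged (rsvd) decrements may fall back to the reserve when free space runs out. The names are invented, and the per-cpu batching, set-aside blocks, partial fallback and tracing of the real code are omitted.

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>
#include <errno.h>

struct freecounter {
	int64_t  count;		/* models the per-cpu free counter */
	uint64_t res_total;	/* models res_total */
	uint64_t res_avail;	/* models res_avail */
};

/* Return blocks: top up the reserve first, then the free counter. */
static void fc_add(struct freecounter *fc, uint64_t delta)
{
	uint64_t res_used = fc->res_total - fc->res_avail;

	if (res_used >= delta) {
		fc->res_avail += delta;
		return;
	}
	fc->res_avail = fc->res_total;
	fc->count += delta - res_used;
}

/* Take blocks; @rsvd callers may dip into the reserve pool. */
static int fc_dec(struct freecounter *fc, uint64_t delta, bool rsvd)
{
	if (fc->count >= 0 && (uint64_t)fc->count >= delta) {
		fc->count -= delta;
		return 0;
	}
	if (rsvd && fc->res_avail >= delta) {
		fc->res_avail -= delta;
		return 0;
	}
	return -ENOSPC;
}

int main(void)
{
	struct freecounter fc = { .count = 10, .res_total = 8, .res_avail = 8 };

	printf("dec 10    -> %d\n", fc_dec(&fc, 10, false));	/* uses free space */
	printf("dec 4     -> %d\n", fc_dec(&fc, 4, false));	/* ENOSPC, not privileged */
	printf("dec 4 rsvd-> %d\n", fc_dec(&fc, 4, true));	/* dips into reserve */
	fc_add(&fc, 6);						/* refills reserve first */
	printf("count=%lld res_avail=%llu\n",
	       (long long)fc.count, (unsigned long long)fc.res_avail);
	return 0;
}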

View file

@ -97,12 +97,42 @@ struct xfs_groups {
*/
uint8_t blklog;
/*
* Zoned devices can have gaps beyond the usable capacity of a zone and
* the end in the LBA/daddr address space. In other words, the hardware
* equivalent to the RT groups already takes care of the power of 2
* alignment for us. In this case the sparse FSB/RTB address space maps
* 1:1 to the device address space.
*/
bool has_daddr_gaps;
/*
* Mask to extract the group-relative block number from a FSB.
* For a pre-rtgroups filesystem we pretend to have one very large
* rtgroup, so this mask must be 64-bit.
*/
uint64_t blkmask;
/*
* Start of the first group in the device. This is used to support a
* RT device following the data device on the same block device for
* SMR hard drives.
*/
xfs_fsblock_t start_fsb;
};
struct xfs_freecounter {
/* free blocks for general use: */
struct percpu_counter count;
/* total reserved blocks: */
uint64_t res_total;
/* available reserved blocks: */
uint64_t res_avail;
/* reserved blks @ remount,ro: */
uint64_t res_saved;
};
/*
@ -198,6 +228,7 @@ typedef struct xfs_mount {
bool m_fail_unmount;
bool m_finobt_nores; /* no per-AG finobt resv. */
bool m_update_sb; /* sb needs update in mount */
unsigned int m_max_open_zones;
/*
* Bitsets of per-fs metadata that have been checked and/or are sick.
@ -222,8 +253,8 @@ typedef struct xfs_mount {
spinlock_t ____cacheline_aligned m_sb_lock; /* sb counter lock */
struct percpu_counter m_icount; /* allocated inodes counter */
struct percpu_counter m_ifree; /* free inodes counter */
struct percpu_counter m_fdblocks; /* free block counter */
struct percpu_counter m_frextents; /* free rt extent counter */
struct xfs_freecounter m_free[XC_FREE_NR];
/*
* Count of data device blocks reserved for delayed allocations,
@ -245,10 +276,8 @@ typedef struct xfs_mount {
atomic64_t m_allocbt_blks;
struct xfs_groups m_groups[XG_TYPE_MAX];
uint64_t m_resblks; /* total reserved blocks */
uint64_t m_resblks_avail;/* available reserved blocks */
uint64_t m_resblks_save; /* reserved blks @ remount,ro */
struct delayed_work m_reclaim_work; /* background inode reclaim */
struct xfs_zone_info *m_zone_info; /* zone allocator information */
struct dentry *m_debugfs; /* debugfs parent */
struct xfs_kobj m_kobj;
struct xfs_kobj m_error_kobj;
@ -258,10 +287,16 @@ typedef struct xfs_mount {
#ifdef CONFIG_XFS_ONLINE_SCRUB_STATS
struct xchk_stats *m_scrub_stats;
#endif
struct xfs_kobj m_zoned_kobj;
xfs_agnumber_t m_agfrotor; /* last ag where space found */
atomic_t m_agirotor; /* last ag dir inode alloced */
atomic_t m_rtgrotor; /* last rtgroup rtpicked */
struct mutex m_metafile_resv_lock;
uint64_t m_metafile_resv_target;
uint64_t m_metafile_resv_used;
uint64_t m_metafile_resv_avail;
/* Memory shrinker to throttle and reprioritize inodegc */
struct shrinker *m_inodegc_shrinker;
/*
@ -336,8 +371,10 @@ typedef struct xfs_mount {
#define XFS_FEAT_NREXT64 (1ULL << 26) /* large extent counters */
#define XFS_FEAT_EXCHANGE_RANGE (1ULL << 27) /* exchange range */
#define XFS_FEAT_METADIR (1ULL << 28) /* metadata directory tree */
#define XFS_FEAT_ZONED (1ULL << 29) /* zoned RT device */
/* Mount features */
#define XFS_FEAT_NOLIFETIME (1ULL << 47) /* disable lifetime hints */
#define XFS_FEAT_NOATTR2 (1ULL << 48) /* disable attr2 creation */
#define XFS_FEAT_NOALIGN (1ULL << 49) /* ignore alignment */
#define XFS_FEAT_ALLOCSIZE (1ULL << 50) /* user specified allocation size */
@ -392,6 +429,8 @@ __XFS_HAS_FEAT(needsrepair, NEEDSREPAIR)
__XFS_HAS_FEAT(large_extent_counts, NREXT64)
__XFS_HAS_FEAT(exchange_range, EXCHANGE_RANGE)
__XFS_HAS_FEAT(metadir, METADIR)
__XFS_HAS_FEAT(zoned, ZONED)
__XFS_HAS_FEAT(nolifetime, NOLIFETIME)
static inline bool xfs_has_rtgroups(const struct xfs_mount *mp)
{
@ -402,7 +441,9 @@ static inline bool xfs_has_rtgroups(const struct xfs_mount *mp)
static inline bool xfs_has_rtsb(const struct xfs_mount *mp)
{
/* all rtgroups filesystems with an rt section have an rtsb */
return xfs_has_rtgroups(mp) && xfs_has_realtime(mp);
return xfs_has_rtgroups(mp) &&
xfs_has_realtime(mp) &&
!xfs_has_zoned(mp);
}
static inline bool xfs_has_rtrmapbt(const struct xfs_mount *mp)
@ -417,6 +458,11 @@ static inline bool xfs_has_rtreflink(const struct xfs_mount *mp)
xfs_has_reflink(mp);
}
static inline bool xfs_has_nonzoned(const struct xfs_mount *mp)
{
return !xfs_has_zoned(mp);
}
/*
* Some features are always on for v5 file systems; allow the compiler to
* eliminate dead code when building without v4 support.
@ -520,6 +566,10 @@ __XFS_HAS_FEAT(nouuid, NOUUID)
#define XFS_OPSTATE_WARNED_METADIR 17
/* Filesystem should use qflags to determine quotaon status */
#define XFS_OPSTATE_RESUMING_QUOTAON 18
/* Kernel has logged a warning about zoned RT device being used on this fs. */
#define XFS_OPSTATE_WARNED_ZONED 19
/* (Zoned) GC is in progress */
#define XFS_OPSTATE_ZONEGC_RUNNING 20
#define __XFS_IS_OPSTATE(name, NAME) \
static inline bool xfs_is_ ## name (struct xfs_mount *mp) \
@ -564,6 +614,7 @@ static inline bool xfs_clear_resuming_quotaon(struct xfs_mount *mp)
#endif /* CONFIG_XFS_QUOTA */
__XFS_IS_OPSTATE(done_with_log_incompat, UNSET_LOG_INCOMPAT)
__XFS_IS_OPSTATE(using_logged_xattrs, USE_LARP)
__XFS_IS_OPSTATE(zonegc_running, ZONEGC_RUNNING)
static inline bool
xfs_should_warn(struct xfs_mount *mp, long nr)
@ -633,7 +684,8 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
}
extern void xfs_uuid_table_free(void);
extern uint64_t xfs_default_resblks(xfs_mount_t *mp);
uint64_t xfs_default_resblks(struct xfs_mount *mp,
enum xfs_free_counter ctr);
extern int xfs_mountfs(xfs_mount_t *mp);
extern void xfs_unmountfs(xfs_mount_t *);
@ -646,45 +698,74 @@ extern void xfs_unmountfs(xfs_mount_t *);
*/
#define XFS_FDBLOCKS_BATCH 1024
uint64_t xfs_freecounter_unavailable(struct xfs_mount *mp,
enum xfs_free_counter ctr);
/*
* Estimate the amount of free space that is not available to userspace and is
* not explicitly reserved from the incore fdblocks. This includes:
*
* - The minimum number of blocks needed to support splitting a bmap btree
* - The blocks currently in use by the freespace btrees because they record
* the actual blocks that will fill per-AG metadata space reservations
* Sum up the free counter, but never return negative values.
*/
static inline uint64_t
xfs_fdblocks_unavailable(
struct xfs_mount *mp)
static inline s64 xfs_sum_freecounter(struct xfs_mount *mp,
enum xfs_free_counter ctr)
{
return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks);
return percpu_counter_sum_positive(&mp->m_free[ctr].count);
}
int xfs_dec_freecounter(struct xfs_mount *mp, struct percpu_counter *counter,
/*
* Same as above, but does return negative values. Mostly useful for
* special cases like repair and tracing.
*/
static inline s64 xfs_sum_freecounter_raw(struct xfs_mount *mp,
enum xfs_free_counter ctr)
{
return percpu_counter_sum(&mp->m_free[ctr].count);
}
/*
* This just provides an estimate without the cpu-local updates; use
* xfs_sum_freecounter for the exact value.
*/
static inline s64 xfs_estimate_freecounter(struct xfs_mount *mp,
enum xfs_free_counter ctr)
{
return percpu_counter_read_positive(&mp->m_free[ctr].count);
}
static inline int xfs_compare_freecounter(struct xfs_mount *mp,
enum xfs_free_counter ctr, s64 rhs, s32 batch)
{
return __percpu_counter_compare(&mp->m_free[ctr].count, rhs, batch);
}
static inline void xfs_set_freecounter(struct xfs_mount *mp,
enum xfs_free_counter ctr, uint64_t val)
{
percpu_counter_set(&mp->m_free[ctr].count, val);
}
int xfs_dec_freecounter(struct xfs_mount *mp, enum xfs_free_counter ctr,
uint64_t delta, bool rsvd);
void xfs_add_freecounter(struct xfs_mount *mp, struct percpu_counter *counter,
void xfs_add_freecounter(struct xfs_mount *mp, enum xfs_free_counter ctr,
uint64_t delta);
static inline int xfs_dec_fdblocks(struct xfs_mount *mp, uint64_t delta,
bool reserved)
{
return xfs_dec_freecounter(mp, &mp->m_fdblocks, delta, reserved);
return xfs_dec_freecounter(mp, XC_FREE_BLOCKS, delta, reserved);
}
static inline void xfs_add_fdblocks(struct xfs_mount *mp, uint64_t delta)
{
xfs_add_freecounter(mp, &mp->m_fdblocks, delta);
xfs_add_freecounter(mp, XC_FREE_BLOCKS, delta);
}
static inline int xfs_dec_frextents(struct xfs_mount *mp, uint64_t delta)
{
return xfs_dec_freecounter(mp, &mp->m_frextents, delta, false);
return xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, delta, false);
}
static inline void xfs_add_frextents(struct xfs_mount *mp, uint64_t delta)
{
xfs_add_freecounter(mp, &mp->m_frextents, delta);
xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, delta);
}
extern int xfs_readsb(xfs_mount_t *, int);
@ -706,5 +787,9 @@ int xfs_add_incompat_log_feature(struct xfs_mount *mp, uint32_t feature);
bool xfs_clear_incompat_log_features(struct xfs_mount *mp);
void xfs_mod_delalloc(struct xfs_inode *ip, int64_t data_delta,
int64_t ind_delta);
static inline void xfs_mod_sb_delalloc(struct xfs_mount *mp, int64_t delta)
{
percpu_counter_add(&mp->m_delalloc_blks, delta);
}
#endif /* __XFS_MOUNT_H__ */

View file

@ -1711,7 +1711,8 @@ xfs_qm_mount_quotas(
* immediately. We only support rtquota if rtgroups are enabled to
* avoid problems with older kernels.
*/
if (mp->m_sb.sb_rextents && !xfs_has_rtgroups(mp)) {
if (mp->m_sb.sb_rextents &&
(!xfs_has_rtgroups(mp) || xfs_has_zoned(mp))) {
xfs_notice(mp, "Cannot turn on quotas for realtime filesystem");
mp->m_qflags = 0;
goto write_changes;

View file

@ -235,7 +235,7 @@ xfs_reflink_trim_around_shared(
int error = 0;
/* Holes, unwritten, and delalloc extents cannot be shared */
if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_written_extent(irec)) {
if (!xfs_is_reflink_inode(ip) || !xfs_bmap_is_written_extent(irec)) {
*shared = false;
return 0;
}
@ -651,7 +651,7 @@ xfs_reflink_cancel_cow_blocks(
if (isnullstartblock(del.br_startblock)) {
xfs_bmap_del_extent_delay(ip, XFS_COW_FORK, &icur, &got,
&del);
&del, 0);
} else if (del.br_state == XFS_EXT_UNWRITTEN || cancel_real) {
ASSERT((*tpp)->t_highest_agno == NULLAGNUMBER);
@ -1207,15 +1207,9 @@ xfs_reflink_ag_has_free_space(
if (!xfs_has_rmapbt(mp))
return 0;
if (XFS_IS_REALTIME_INODE(ip)) {
struct xfs_rtgroup *rtg;
xfs_rgnumber_t rgno;
rgno = xfs_rtb_to_rgno(mp, fsb);
rtg = xfs_rtgroup_get(mp, rgno);
if (xfs_metafile_resv_critical(rtg_rmap(rtg)))
error = -ENOSPC;
xfs_rtgroup_put(rtg);
return error;
if (xfs_metafile_resv_critical(mp))
return -ENOSPC;
return 0;
}
agno = XFS_FSB_TO_AGNO(mp, fsb);
@ -1538,7 +1532,7 @@ xfs_reflink_zero_posteof(
return 0;
trace_xfs_zero_eof(ip, isize, pos - isize);
return xfs_zero_range(ip, isize, pos - isize, NULL);
return xfs_zero_range(ip, isize, pos - isize, NULL, NULL);
}
/*

View file

@ -33,6 +33,7 @@
#include "xfs_trace.h"
#include "xfs_rtrefcount_btree.h"
#include "xfs_reflink.h"
#include "xfs_zone_alloc.h"
/*
* Return whether there are any free extents in the size range given
@ -663,7 +664,8 @@ xfs_rtunmount_rtg(
for (i = 0; i < XFS_RTGI_MAX; i++)
xfs_rtginode_irele(&rtg->rtg_inodes[i]);
kvfree(rtg->rtg_rsum_cache);
if (!xfs_has_zoned(rtg_mount(rtg)))
kvfree(rtg->rtg_rsum_cache);
}
static int
@ -837,7 +839,7 @@ xfs_growfs_rt_init_rtsb(
return 0;
error = xfs_buf_get_uncached(mp->m_rtdev_targp, XFS_FSB_TO_BB(mp, 1),
0, &rtsb_bp);
&rtsb_bp);
if (error)
return error;
@ -858,6 +860,84 @@ xfs_growfs_rt_init_rtsb(
return error;
}
static void
xfs_growfs_rt_sb_fields(
struct xfs_trans *tp,
const struct xfs_mount *nmp)
{
struct xfs_mount *mp = tp->t_mountp;
if (nmp->m_sb.sb_rextsize != mp->m_sb.sb_rextsize)
xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSIZE,
nmp->m_sb.sb_rextsize - mp->m_sb.sb_rextsize);
if (nmp->m_sb.sb_rbmblocks != mp->m_sb.sb_rbmblocks)
xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBMBLOCKS,
nmp->m_sb.sb_rbmblocks - mp->m_sb.sb_rbmblocks);
if (nmp->m_sb.sb_rblocks != mp->m_sb.sb_rblocks)
xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBLOCKS,
nmp->m_sb.sb_rblocks - mp->m_sb.sb_rblocks);
if (nmp->m_sb.sb_rextents != mp->m_sb.sb_rextents)
xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTENTS,
nmp->m_sb.sb_rextents - mp->m_sb.sb_rextents);
if (nmp->m_sb.sb_rextslog != mp->m_sb.sb_rextslog)
xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSLOG,
nmp->m_sb.sb_rextslog - mp->m_sb.sb_rextslog);
if (nmp->m_sb.sb_rgcount != mp->m_sb.sb_rgcount)
xfs_trans_mod_sb(tp, XFS_TRANS_SB_RGCOUNT,
nmp->m_sb.sb_rgcount - mp->m_sb.sb_rgcount);
}
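Factoring the superblock updates out into xfs_growfs_rt_sb_fields() lets the new zoned grow path below and the existing bitmap-based path log the same set of fields. Note that only per-field deltas (new value minus old) are logged, which is the form xfs_trans_mod_sb() expects for superblock changes.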
static int
xfs_growfs_rt_zoned(
struct xfs_rtgroup *rtg,
xfs_rfsblock_t nrblocks)
{
struct xfs_mount *mp = rtg_mount(rtg);
struct xfs_mount *nmp;
struct xfs_trans *tp;
xfs_rtbxlen_t freed_rtx;
int error;
/*
* Calculate new sb and mount fields for this round. Also ensure the
* rtg_extents value is uptodate as the rtbitmap code relies on it.
*/
nmp = xfs_growfs_rt_alloc_fake_mount(mp, nrblocks,
mp->m_sb.sb_rextsize);
if (!nmp)
return -ENOMEM;
freed_rtx = nmp->m_sb.sb_rextents - mp->m_sb.sb_rextents;
xfs_rtgroup_calc_geometry(nmp, rtg, rtg_rgno(rtg),
nmp->m_sb.sb_rgcount, nmp->m_sb.sb_rextents);
error = xfs_trans_alloc(mp, &M_RES(nmp)->tr_growrtfree, 0, 0, 0, &tp);
if (error)
goto out_free;
xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_RMAP);
xfs_growfs_rt_sb_fields(tp, nmp);
xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, freed_rtx);
error = xfs_trans_commit(tp);
if (error)
goto out_free;
/*
* Ensure the mount RT feature flag is now set, and compute new
* maxlevels for rt btrees.
*/
mp->m_features |= XFS_FEAT_REALTIME;
xfs_rtrmapbt_compute_maxlevels(mp);
xfs_rtrefcountbt_compute_maxlevels(mp);
xfs_zoned_add_available(mp, freed_rtx);
out_free:
kfree(nmp);
return error;
}
static int
xfs_growfs_rt_bmblock(
struct xfs_rtgroup *rtg,
@ -943,24 +1023,7 @@ xfs_growfs_rt_bmblock(
/*
* Update superblock fields.
*/
if (nmp->m_sb.sb_rextsize != mp->m_sb.sb_rextsize)
xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_REXTSIZE,
nmp->m_sb.sb_rextsize - mp->m_sb.sb_rextsize);
if (nmp->m_sb.sb_rbmblocks != mp->m_sb.sb_rbmblocks)
xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_RBMBLOCKS,
nmp->m_sb.sb_rbmblocks - mp->m_sb.sb_rbmblocks);
if (nmp->m_sb.sb_rblocks != mp->m_sb.sb_rblocks)
xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_RBLOCKS,
nmp->m_sb.sb_rblocks - mp->m_sb.sb_rblocks);
if (nmp->m_sb.sb_rextents != mp->m_sb.sb_rextents)
xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_REXTENTS,
nmp->m_sb.sb_rextents - mp->m_sb.sb_rextents);
if (nmp->m_sb.sb_rextslog != mp->m_sb.sb_rextslog)
xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_REXTSLOG,
nmp->m_sb.sb_rextslog - mp->m_sb.sb_rextslog);
if (nmp->m_sb.sb_rgcount != mp->m_sb.sb_rgcount)
xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_RGCOUNT,
nmp->m_sb.sb_rgcount - mp->m_sb.sb_rgcount);
xfs_growfs_rt_sb_fields(args.tp, nmp);
/*
* Free the new extent.
@ -1127,6 +1190,11 @@ xfs_growfs_rtg(
goto out_rele;
}
if (xfs_has_zoned(mp)) {
error = xfs_growfs_rt_zoned(rtg, nrblocks);
goto out_rele;
}
error = xfs_growfs_rt_alloc_blocks(rtg, nrblocks, rextsize, &bmblocks);
if (error)
goto out_rele;
@ -1144,10 +1212,8 @@ xfs_growfs_rtg(
goto out_error;
}
if (old_rsum_cache)
kvfree(old_rsum_cache);
xfs_rtgroup_rele(rtg);
return 0;
kvfree(old_rsum_cache);
goto out_rele;
out_error:
/*
@ -1195,6 +1261,22 @@ xfs_growfs_check_rtgeom(
if (min_logfsbs > mp->m_sb.sb_logblocks)
return -EINVAL;
if (xfs_has_zoned(mp)) {
uint32_t gblocks = mp->m_groups[XG_TYPE_RTG].blocks;
uint32_t rem;
if (rextsize != 1)
return -EINVAL;
div_u64_rem(mp->m_sb.sb_rblocks, gblocks, &rem);
if (rem) {
xfs_warn(mp,
"new RT volume size (%lld) not aligned to RT group size (%d)",
mp->m_sb.sb_rblocks, gblocks);
return -EINVAL;
}
}
return 0;
}
@ -1248,6 +1330,35 @@ xfs_grow_last_rtg(
mp->m_sb.sb_rgextents;
}
/*
* Read in the last block of the RT device to make sure it is accessible.
*/
static int
xfs_rt_check_size(
struct xfs_mount *mp,
xfs_rfsblock_t last_block)
{
xfs_daddr_t daddr = XFS_FSB_TO_BB(mp, last_block);
struct xfs_buf *bp;
int error;
if (XFS_BB_TO_FSB(mp, daddr) != last_block) {
xfs_warn(mp, "RT device size overflow: %llu != %llu",
XFS_BB_TO_FSB(mp, daddr), last_block);
return -EFBIG;
}
error = xfs_buf_read_uncached(mp->m_rtdev_targp,
XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart) + daddr,
XFS_FSB_TO_BB(mp, 1), &bp, NULL);
if (error)
xfs_warn(mp, "cannot read last RT device sector (%lld)",
last_block);
else
xfs_buf_relse(bp);
return error;
}
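The size check in xfs_rt_check_size() depends on the FSB-to-daddr conversion being lossless; spelled out as a hypothetical predicate (demo_rtblock_addressable() is illustrative only, not part of this series):
/*
 * Converting a file system block number to 512-byte basic blocks and back
 * must return the original value, otherwise the block lies beyond what the
 * device address space can represent and growing to it must be refused.
 */
static bool
demo_rtblock_addressable(struct xfs_mount *mp, xfs_rfsblock_t fsbno)
{
	xfs_daddr_t	daddr = XFS_FSB_TO_BB(mp, fsbno);

	return XFS_BB_TO_FSB(mp, daddr) == fsbno;
}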
/*
* Grow the realtime area of the filesystem.
*/
@ -1259,7 +1370,6 @@ xfs_growfs_rt(
xfs_rgnumber_t old_rgcount = mp->m_sb.sb_rgcount;
xfs_rgnumber_t new_rgcount = 1;
xfs_rgnumber_t rgno;
struct xfs_buf *bp;
xfs_agblock_t old_rextsize = mp->m_sb.sb_rextsize;
int error;
@ -1302,15 +1412,10 @@ xfs_growfs_rt(
error = xfs_sb_validate_fsb_count(&mp->m_sb, in->newblocks);
if (error)
goto out_unlock;
/*
* Read in the last block of the device, make sure it exists.
*/
error = xfs_buf_read_uncached(mp->m_rtdev_targp,
XFS_FSB_TO_BB(mp, in->newblocks - 1),
XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL);
error = xfs_rt_check_size(mp, in->newblocks - 1);
if (error)
goto out_unlock;
xfs_buf_relse(bp);
/*
* Calculate new parameters. These are the final values to be reached.
@ -1376,8 +1481,7 @@ xfs_growfs_rt(
error = error2;
/* Reset the rt metadata btree space reservations. */
xfs_rt_resv_free(mp);
error2 = xfs_rt_resv_init(mp);
error2 = xfs_metafile_resv_init(mp);
if (error2 && error2 != -ENOSPC)
error = error2;
}
@ -1407,7 +1511,7 @@ xfs_rtmount_readsb(
/* m_blkbb_log is not set up yet */
error = xfs_buf_read_uncached(mp->m_rtdev_targp, XFS_RTSB_DADDR,
mp->m_sb.sb_blocksize >> BBSHIFT, 0, &bp,
mp->m_sb.sb_blocksize >> BBSHIFT, &bp,
&xfs_rtsb_buf_ops);
if (error) {
xfs_warn(mp, "rt sb validate failed with error %d.", error);
@ -1444,10 +1548,6 @@ int /* error */
xfs_rtmount_init(
struct xfs_mount *mp) /* file system mount structure */
{
struct xfs_buf *bp; /* buffer for last block of subvolume */
xfs_daddr_t d; /* address of last block of subvolume */
int error;
if (mp->m_sb.sb_rblocks == 0)
return 0;
if (mp->m_rtdev_targp == NULL) {
@ -1458,25 +1558,7 @@ xfs_rtmount_init(
mp->m_rsumblocks = xfs_rtsummary_blockcount(mp, &mp->m_rsumlevels);
/*
* Check that the realtime section is an ok size.
*/
d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks);
if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_rblocks) {
xfs_warn(mp, "realtime mount -- %llu != %llu",
(unsigned long long) XFS_BB_TO_FSB(mp, d),
(unsigned long long) mp->m_sb.sb_rblocks);
return -EFBIG;
}
error = xfs_buf_read_uncached(mp->m_rtdev_targp,
d - XFS_FSB_TO_BB(mp, 1),
XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL);
if (error) {
xfs_warn(mp, "realtime device size check failed");
return error;
}
xfs_buf_relse(bp);
return 0;
return xfs_rt_check_size(mp, mp->m_sb.sb_rblocks - 1);
}
static int
@ -1519,50 +1601,10 @@ xfs_rtalloc_reinit_frextents(
spin_lock(&mp->m_sb_lock);
mp->m_sb.sb_frextents = val;
spin_unlock(&mp->m_sb_lock);
percpu_counter_set(&mp->m_frextents, mp->m_sb.sb_frextents);
xfs_set_freecounter(mp, XC_FREE_RTEXTENTS, mp->m_sb.sb_frextents);
return 0;
}
/* Free space reservations for rt metadata inodes. */
void
xfs_rt_resv_free(
struct xfs_mount *mp)
{
struct xfs_rtgroup *rtg = NULL;
unsigned int i;
while ((rtg = xfs_rtgroup_next(mp, rtg))) {
for (i = 0; i < XFS_RTGI_MAX; i++)
xfs_metafile_resv_free(rtg->rtg_inodes[i]);
}
}
/* Reserve space for rt metadata inodes' space expansion. */
int
xfs_rt_resv_init(
struct xfs_mount *mp)
{
struct xfs_rtgroup *rtg = NULL;
xfs_filblks_t ask;
int error = 0;
while ((rtg = xfs_rtgroup_next(mp, rtg))) {
int err2;
ask = xfs_rtrmapbt_calc_reserves(mp);
err2 = xfs_metafile_resv_init(rtg_rmap(rtg), ask);
if (err2 && !error)
error = err2;
ask = xfs_rtrefcountbt_calc_reserves(mp);
err2 = xfs_metafile_resv_init(rtg_refcount(rtg), ask);
if (err2 && !error)
error = err2;
}
return error;
}
/*
* Read in the bmbt of an rt metadata inode so that we never have to load them
* at runtime. This enables the use of shared ILOCKs for rtbitmap scans. Use
@ -1613,6 +1655,8 @@ xfs_rtmount_rtg(
}
}
if (xfs_has_zoned(mp))
return 0;
return xfs_alloc_rsum_cache(rtg, mp->m_sb.sb_rbmblocks);
}
@ -2097,6 +2141,8 @@ xfs_bmap_rtalloc(
ap->datatype & XFS_ALLOC_INITIAL_USER_DATA;
int error;
ASSERT(!xfs_has_zoned(ap->tp->t_mountp));
retry:
error = xfs_rtallocate_align(ap, &ralen, &raminlen, &prod, &noalign);
if (error)

View file

@ -34,9 +34,6 @@ int /* error */
xfs_rtmount_inodes(
struct xfs_mount *mp); /* file system mount structure */
void xfs_rt_resv_free(struct xfs_mount *mp);
int xfs_rt_resv_init(struct xfs_mount *mp);
/*
* Grow the realtime area of the filesystem.
*/
@ -65,8 +62,6 @@ xfs_rtmount_init(
}
# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (-ENOSYS))
# define xfs_rtunmount_inodes(m)
# define xfs_rt_resv_free(mp) ((void)0)
# define xfs_rt_resv_init(mp) (0)
static inline int
xfs_growfs_check_rtgeom(const struct xfs_mount *mp,

View file

@ -46,6 +46,7 @@
#include "xfs_exchmaps_item.h"
#include "xfs_parent.h"
#include "xfs_rtalloc.h"
#include "xfs_zone_alloc.h"
#include "scrub/stats.h"
#include "scrub/rcbag_btree.h"
@ -109,7 +110,8 @@ enum {
Opt_filestreams, Opt_quota, Opt_noquota, Opt_usrquota, Opt_grpquota,
Opt_prjquota, Opt_uquota, Opt_gquota, Opt_pquota,
Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce,
Opt_discard, Opt_nodiscard, Opt_dax, Opt_dax_enum,
Opt_discard, Opt_nodiscard, Opt_dax, Opt_dax_enum, Opt_max_open_zones,
Opt_lifetime, Opt_nolifetime,
};
static const struct fs_parameter_spec xfs_fs_parameters[] = {
@ -154,6 +156,9 @@ static const struct fs_parameter_spec xfs_fs_parameters[] = {
fsparam_flag("nodiscard", Opt_nodiscard),
fsparam_flag("dax", Opt_dax),
fsparam_enum("dax", Opt_dax_enum, dax_param_enums),
fsparam_u32("max_open_zones", Opt_max_open_zones),
fsparam_flag("lifetime", Opt_lifetime),
fsparam_flag("nolifetime", Opt_nolifetime),
{}
};
@ -182,6 +187,7 @@ xfs_fs_show_options(
{ XFS_FEAT_LARGE_IOSIZE, ",largeio" },
{ XFS_FEAT_DAX_ALWAYS, ",dax=always" },
{ XFS_FEAT_DAX_NEVER, ",dax=never" },
{ XFS_FEAT_NOLIFETIME, ",nolifetime" },
{ 0, NULL }
};
struct xfs_mount *mp = XFS_M(root->d_sb);
@ -233,6 +239,9 @@ xfs_fs_show_options(
if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT))
seq_puts(m, ",noquota");
if (mp->m_max_open_zones)
seq_printf(m, ",max_open_zones=%u", mp->m_max_open_zones);
return 0;
}
@ -533,7 +542,15 @@ xfs_setup_devices(
if (error)
return error;
}
if (mp->m_rtdev_targp) {
if (mp->m_sb.sb_rtstart) {
if (mp->m_rtdev_targp) {
xfs_warn(mp,
"can't use internal and external rtdev at the same time");
return -EINVAL;
}
mp->m_rtdev_targp = mp->m_ddev_targp;
} else if (mp->m_rtname) {
error = xfs_setsize_buftarg(mp->m_rtdev_targp,
mp->m_sb.sb_sectsize);
if (error)
@ -757,7 +774,7 @@ xfs_mount_free(
{
if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
xfs_free_buftarg(mp->m_logdev_targp);
if (mp->m_rtdev_targp)
if (mp->m_rtdev_targp && mp->m_rtdev_targp != mp->m_ddev_targp)
xfs_free_buftarg(mp->m_rtdev_targp);
if (mp->m_ddev_targp)
xfs_free_buftarg(mp->m_ddev_targp);
@ -814,6 +831,7 @@ xfs_fs_sync_fs(
if (sb->s_writers.frozen == SB_FREEZE_PAGEFAULT) {
xfs_inodegc_stop(mp);
xfs_blockgc_stop(mp);
xfs_zone_gc_stop(mp);
}
return 0;
@ -834,10 +852,12 @@ xfs_statfs_data(
struct kstatfs *st)
{
int64_t fdblocks =
percpu_counter_sum(&mp->m_fdblocks);
xfs_sum_freecounter(mp, XC_FREE_BLOCKS);
/* make sure st->f_bfree does not underflow */
st->f_bfree = max(0LL, fdblocks - xfs_fdblocks_unavailable(mp));
st->f_bfree = max(0LL,
fdblocks - xfs_freecounter_unavailable(mp, XC_FREE_BLOCKS));
/*
* sb_dblocks can change during growfs, but nothing cares about reporting
* the old or new value during growfs.
@ -856,8 +876,9 @@ xfs_statfs_rt(
struct kstatfs *st)
{
st->f_bfree = xfs_rtbxlen_to_blen(mp,
percpu_counter_sum_positive(&mp->m_frextents));
st->f_blocks = mp->m_sb.sb_rblocks;
xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS));
st->f_blocks = mp->m_sb.sb_rblocks - xfs_rtbxlen_to_blen(mp,
mp->m_free[XC_FREE_RTEXTENTS].res_total);
}
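The new f_blocks calculation excludes the XC_FREE_RTEXTENTS reservation pool (res_total) from the reported size of the realtime volume, so capacity the zoned allocator permanently holds back is not presented to userspace as usable space in the first place.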
static void
@ -922,24 +943,32 @@ xfs_fs_statfs(
}
STATIC void
xfs_save_resvblks(struct xfs_mount *mp)
xfs_save_resvblks(
struct xfs_mount *mp)
{
mp->m_resblks_save = mp->m_resblks;
xfs_reserve_blocks(mp, 0);
enum xfs_free_counter i;
for (i = 0; i < XC_FREE_NR; i++) {
mp->m_free[i].res_saved = mp->m_free[i].res_total;
xfs_reserve_blocks(mp, i, 0);
}
}
STATIC void
xfs_restore_resvblks(struct xfs_mount *mp)
xfs_restore_resvblks(
struct xfs_mount *mp)
{
uint64_t resblks;
uint64_t resblks;
enum xfs_free_counter i;
if (mp->m_resblks_save) {
resblks = mp->m_resblks_save;
mp->m_resblks_save = 0;
} else
resblks = xfs_default_resblks(mp);
xfs_reserve_blocks(mp, resblks);
for (i = 0; i < XC_FREE_NR; i++) {
if (mp->m_free[i].res_saved) {
resblks = mp->m_free[i].res_saved;
mp->m_free[i].res_saved = 0;
} else
resblks = xfs_default_resblks(mp, i);
xfs_reserve_blocks(mp, i, resblks);
}
}
/*
@ -976,6 +1005,7 @@ xfs_fs_freeze(
if (ret && !xfs_is_readonly(mp)) {
xfs_blockgc_start(mp);
xfs_inodegc_start(mp);
xfs_zone_gc_start(mp);
}
return ret;
@ -997,6 +1027,7 @@ xfs_fs_unfreeze(
* filesystem.
*/
if (!xfs_is_readonly(mp)) {
xfs_zone_gc_start(mp);
xfs_blockgc_start(mp);
xfs_inodegc_start(mp);
}
@ -1058,6 +1089,19 @@ xfs_finish_flags(
return -EINVAL;
}
if (!xfs_has_zoned(mp)) {
if (mp->m_max_open_zones) {
xfs_warn(mp,
"max_open_zones mount option only supported on zoned file systems.");
return -EINVAL;
}
if (mp->m_features & XFS_FEAT_NOLIFETIME) {
xfs_warn(mp,
"nolifetime mount option only supported on zoned file systems.");
return -EINVAL;
}
}
return 0;
}
@ -1065,7 +1109,8 @@ static int
xfs_init_percpu_counters(
struct xfs_mount *mp)
{
int error;
int error;
int i;
error = percpu_counter_init(&mp->m_icount, 0, GFP_KERNEL);
if (error)
@ -1075,30 +1120,29 @@ xfs_init_percpu_counters(
if (error)
goto free_icount;
error = percpu_counter_init(&mp->m_fdblocks, 0, GFP_KERNEL);
if (error)
goto free_ifree;
error = percpu_counter_init(&mp->m_delalloc_blks, 0, GFP_KERNEL);
if (error)
goto free_fdblocks;
goto free_ifree;
error = percpu_counter_init(&mp->m_delalloc_rtextents, 0, GFP_KERNEL);
if (error)
goto free_delalloc;
error = percpu_counter_init(&mp->m_frextents, 0, GFP_KERNEL);
if (error)
goto free_delalloc_rt;
for (i = 0; i < XC_FREE_NR; i++) {
error = percpu_counter_init(&mp->m_free[i].count, 0,
GFP_KERNEL);
if (error)
goto free_freecounters;
}
return 0;
free_delalloc_rt:
free_freecounters:
while (--i >= 0)
percpu_counter_destroy(&mp->m_free[i].count);
percpu_counter_destroy(&mp->m_delalloc_rtextents);
free_delalloc:
percpu_counter_destroy(&mp->m_delalloc_blks);
free_fdblocks:
percpu_counter_destroy(&mp->m_fdblocks);
free_ifree:
percpu_counter_destroy(&mp->m_ifree);
free_icount:
@ -1112,24 +1156,28 @@ xfs_reinit_percpu_counters(
{
percpu_counter_set(&mp->m_icount, mp->m_sb.sb_icount);
percpu_counter_set(&mp->m_ifree, mp->m_sb.sb_ifree);
percpu_counter_set(&mp->m_fdblocks, mp->m_sb.sb_fdblocks);
percpu_counter_set(&mp->m_frextents, mp->m_sb.sb_frextents);
xfs_set_freecounter(mp, XC_FREE_BLOCKS, mp->m_sb.sb_fdblocks);
if (!xfs_has_zoned(mp))
xfs_set_freecounter(mp, XC_FREE_RTEXTENTS,
mp->m_sb.sb_frextents);
}
static void
xfs_destroy_percpu_counters(
struct xfs_mount *mp)
{
enum xfs_free_counter i;
for (i = 0; i < XC_FREE_NR; i++)
percpu_counter_destroy(&mp->m_free[i].count);
percpu_counter_destroy(&mp->m_icount);
percpu_counter_destroy(&mp->m_ifree);
percpu_counter_destroy(&mp->m_fdblocks);
ASSERT(xfs_is_shutdown(mp) ||
percpu_counter_sum(&mp->m_delalloc_rtextents) == 0);
percpu_counter_destroy(&mp->m_delalloc_rtextents);
ASSERT(xfs_is_shutdown(mp) ||
percpu_counter_sum(&mp->m_delalloc_blks) == 0);
percpu_counter_destroy(&mp->m_delalloc_blks);
percpu_counter_destroy(&mp->m_frextents);
}
static int
@ -1210,6 +1258,18 @@ xfs_fs_shutdown(
xfs_force_shutdown(XFS_M(sb), SHUTDOWN_DEVICE_REMOVED);
}
static int
xfs_fs_show_stats(
struct seq_file *m,
struct dentry *root)
{
struct xfs_mount *mp = XFS_M(root->d_sb);
if (xfs_has_zoned(mp) && IS_ENABLED(CONFIG_XFS_RT))
xfs_zoned_show_stats(m, mp);
return 0;
}
static const struct super_operations xfs_super_operations = {
.alloc_inode = xfs_fs_alloc_inode,
.destroy_inode = xfs_fs_destroy_inode,
@ -1224,6 +1284,7 @@ static const struct super_operations xfs_super_operations = {
.nr_cached_objects = xfs_fs_nr_cached_objects,
.free_cached_objects = xfs_fs_free_cached_objects,
.shutdown = xfs_fs_shutdown,
.show_stats = xfs_fs_show_stats,
};
static int
@ -1436,6 +1497,15 @@ xfs_fs_parse_param(
xfs_fs_warn_deprecated(fc, param, XFS_FEAT_NOATTR2, true);
parsing_mp->m_features |= XFS_FEAT_NOATTR2;
return 0;
case Opt_max_open_zones:
parsing_mp->m_max_open_zones = result.uint_32;
return 0;
case Opt_lifetime:
parsing_mp->m_features &= ~XFS_FEAT_NOLIFETIME;
return 0;
case Opt_nolifetime:
parsing_mp->m_features |= XFS_FEAT_NOLIFETIME;
return 0;
default:
xfs_warn(parsing_mp, "unknown mount option [%s].", param->key);
return -EINVAL;
@ -1780,8 +1850,17 @@ xfs_fs_fill_super(
mp->m_features &= ~XFS_FEAT_DISCARD;
}
if (xfs_has_metadir(mp))
if (xfs_has_zoned(mp)) {
if (!xfs_has_metadir(mp)) {
xfs_alert(mp,
"metadir feature required for zoned realtime devices.");
error = -EINVAL;
goto out_filestream_unmount;
}
xfs_warn_experimental(mp, XFS_EXPERIMENTAL_ZONED);
} else if (xfs_has_metadir(mp)) {
xfs_warn_experimental(mp, XFS_EXPERIMENTAL_METADIR);
}
if (xfs_has_reflink(mp)) {
if (xfs_has_realtime(mp) &&
@ -1793,6 +1872,13 @@ xfs_fs_fill_super(
goto out_filestream_unmount;
}
if (xfs_has_zoned(mp)) {
xfs_alert(mp,
"reflink not compatible with zoned RT device!");
error = -EINVAL;
goto out_filestream_unmount;
}
if (xfs_globals.always_cow) {
xfs_info(mp, "using DEBUG-only always_cow mode.");
mp->m_always_cow = true;
@ -1917,6 +2003,9 @@ xfs_remount_rw(
/* Re-enable the background inode inactivation worker. */
xfs_inodegc_start(mp);
/* Restart zone reclaim */
xfs_zone_gc_start(mp);
return 0;
}
@ -1961,6 +2050,9 @@ xfs_remount_ro(
*/
xfs_inodegc_stop(mp);
/* Stop zone reclaim */
xfs_zone_gc_stop(mp);
/* Free the per-AG metadata reservation pool. */
xfs_fs_unreserve_ag_blocks(mp);
@ -2082,6 +2174,7 @@ xfs_init_fs_context(
for (i = 0; i < XG_TYPE_MAX; i++)
xa_init(&mp->m_groups[i].xa);
mutex_init(&mp->m_growlock);
mutex_init(&mp->m_metafile_resv_lock);
INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker);
INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
mp->m_kobj.kobject.kset = xfs_kset;

View file

@ -13,6 +13,7 @@
#include "xfs_log.h"
#include "xfs_log_priv.h"
#include "xfs_mount.h"
#include "xfs_zones.h"
struct xfs_sysfs_attr {
struct attribute attr;
@ -69,7 +70,7 @@ static struct attribute *xfs_mp_attrs[] = {
};
ATTRIBUTE_GROUPS(xfs_mp);
const struct kobj_type xfs_mp_ktype = {
static const struct kobj_type xfs_mp_ktype = {
.release = xfs_sysfs_release,
.sysfs_ops = &xfs_sysfs_ops,
.default_groups = xfs_mp_groups,
@ -701,45 +702,103 @@ out_error:
return error;
}
static inline struct xfs_mount *zoned_to_mp(struct kobject *kobj)
{
return container_of(to_kobj(kobj), struct xfs_mount, m_zoned_kobj);
}
static ssize_t
max_open_zones_show(
struct kobject *kobj,
char *buf)
{
/* only report the open zones available for user data */
return sysfs_emit(buf, "%u\n",
zoned_to_mp(kobj)->m_max_open_zones - XFS_OPEN_GC_ZONES);
}
XFS_SYSFS_ATTR_RO(max_open_zones);
static struct attribute *xfs_zoned_attrs[] = {
ATTR_LIST(max_open_zones),
NULL,
};
ATTRIBUTE_GROUPS(xfs_zoned);
static const struct kobj_type xfs_zoned_ktype = {
.release = xfs_sysfs_release,
.sysfs_ops = &xfs_sysfs_ops,
.default_groups = xfs_zoned_groups,
};
int
xfs_error_sysfs_init(
xfs_mount_sysfs_init(
struct xfs_mount *mp)
{
int error;
super_set_sysfs_name_id(mp->m_super);
/* .../xfs/<dev>/ */
error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype,
NULL, mp->m_super->s_id);
if (error)
return error;
/* .../xfs/<dev>/stats/ */
error = xfs_sysfs_init(&mp->m_stats.xs_kobj, &xfs_stats_ktype,
&mp->m_kobj, "stats");
if (error)
goto out_remove_fsdir;
/* .../xfs/<dev>/error/ */
error = xfs_sysfs_init(&mp->m_error_kobj, &xfs_error_ktype,
&mp->m_kobj, "error");
if (error)
return error;
goto out_remove_stats_dir;
/* .../xfs/<dev>/error/fail_at_unmount */
error = sysfs_create_file(&mp->m_error_kobj.kobject,
ATTR_LIST(fail_at_unmount));
if (error)
goto out_error;
goto out_remove_error_dir;
/* .../xfs/<dev>/error/metadata/ */
error = xfs_error_sysfs_init_class(mp, XFS_ERR_METADATA,
"metadata", &mp->m_error_meta_kobj,
xfs_error_meta_init);
if (error)
goto out_error;
goto out_remove_error_dir;
if (IS_ENABLED(CONFIG_XFS_RT) && xfs_has_zoned(mp)) {
/* .../xfs/<dev>/zoned/ */
error = xfs_sysfs_init(&mp->m_zoned_kobj, &xfs_zoned_ktype,
&mp->m_kobj, "zoned");
if (error)
goto out_remove_error_dir;
}
return 0;
out_error:
out_remove_error_dir:
xfs_sysfs_del(&mp->m_error_kobj);
out_remove_stats_dir:
xfs_sysfs_del(&mp->m_stats.xs_kobj);
out_remove_fsdir:
xfs_sysfs_del(&mp->m_kobj);
return error;
}
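Putting the comments above together, the per-mount sysfs hierarchy created by xfs_mount_sysfs_init() now looks like this, with the zoned directory only present on CONFIG_XFS_RT kernels mounting a zoned file system:
.../xfs/<dev>/
.../xfs/<dev>/stats/
.../xfs/<dev>/error/fail_at_unmount
.../xfs/<dev>/error/metadata/
.../xfs/<dev>/zoned/max_open_zones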
void
xfs_error_sysfs_del(
xfs_mount_sysfs_del(
struct xfs_mount *mp)
{
struct xfs_error_cfg *cfg;
int i, j;
if (IS_ENABLED(CONFIG_XFS_RT) && xfs_has_zoned(mp))
xfs_sysfs_del(&mp->m_zoned_kobj);
for (i = 0; i < XFS_ERR_CLASS_MAX; i++) {
for (j = 0; j < XFS_ERR_ERRNO_MAX; j++) {
cfg = &mp->m_error_cfg[i][j];
@ -749,6 +808,8 @@ xfs_error_sysfs_del(
}
xfs_sysfs_del(&mp->m_error_meta_kobj);
xfs_sysfs_del(&mp->m_error_kobj);
xfs_sysfs_del(&mp->m_stats.xs_kobj);
xfs_sysfs_del(&mp->m_kobj);
}
struct xfs_error_cfg *

View file

@ -7,7 +7,6 @@
#ifndef __XFS_SYSFS_H__
#define __XFS_SYSFS_H__
extern const struct kobj_type xfs_mp_ktype; /* xfs_mount */
extern const struct kobj_type xfs_dbg_ktype; /* debug */
extern const struct kobj_type xfs_log_ktype; /* xlog */
extern const struct kobj_type xfs_stats_ktype; /* stats */
@ -53,7 +52,7 @@ xfs_sysfs_del(
wait_for_completion(&kobj->complete);
}
int xfs_error_sysfs_init(struct xfs_mount *mp);
void xfs_error_sysfs_del(struct xfs_mount *mp);
int xfs_mount_sysfs_init(struct xfs_mount *mp);
void xfs_mount_sysfs_del(struct xfs_mount *mp);
#endif /* __XFS_SYSFS_H__ */

View file

@ -49,6 +49,8 @@
#include "xfs_metafile.h"
#include "xfs_metadir.h"
#include "xfs_rtgroup.h"
#include "xfs_zone_alloc.h"
#include "xfs_zone_priv.h"
/*
* We include this last to have the helpers above available for the trace

View file

@ -102,6 +102,7 @@ struct xfs_rmap_intent;
struct xfs_refcount_intent;
struct xfs_metadir_update;
struct xfs_rtgroup;
struct xfs_open_zone;
#define XFS_ATTR_FILTER_FLAGS \
{ XFS_ATTR_ROOT, "ROOT" }, \
@ -265,6 +266,152 @@ DEFINE_GROUP_REF_EVENT(xfs_group_grab);
DEFINE_GROUP_REF_EVENT(xfs_group_grab_next_tag);
DEFINE_GROUP_REF_EVENT(xfs_group_rele);
#ifdef CONFIG_XFS_RT
DECLARE_EVENT_CLASS(xfs_zone_class,
TP_PROTO(struct xfs_rtgroup *rtg),
TP_ARGS(rtg),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_rgnumber_t, rgno)
__field(xfs_rgblock_t, used)
__field(unsigned int, nr_open)
),
TP_fast_assign(
struct xfs_mount *mp = rtg_mount(rtg);
__entry->dev = mp->m_super->s_dev;
__entry->rgno = rtg_rgno(rtg);
__entry->used = rtg_rmap(rtg)->i_used_blocks;
__entry->nr_open = mp->m_zone_info->zi_nr_open_zones;
),
TP_printk("dev %d:%d rgno 0x%x used 0x%x nr_open %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->rgno,
__entry->used,
__entry->nr_open)
);
#define DEFINE_ZONE_EVENT(name) \
DEFINE_EVENT(xfs_zone_class, name, \
TP_PROTO(struct xfs_rtgroup *rtg), \
TP_ARGS(rtg))
DEFINE_ZONE_EVENT(xfs_zone_emptied);
DEFINE_ZONE_EVENT(xfs_zone_full);
DEFINE_ZONE_EVENT(xfs_zone_opened);
DEFINE_ZONE_EVENT(xfs_zone_reset);
DEFINE_ZONE_EVENT(xfs_zone_gc_target_opened);
TRACE_EVENT(xfs_zone_free_blocks,
TP_PROTO(struct xfs_rtgroup *rtg, xfs_rgblock_t rgbno,
xfs_extlen_t len),
TP_ARGS(rtg, rgbno, len),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_rgnumber_t, rgno)
__field(xfs_rgblock_t, used)
__field(xfs_rgblock_t, rgbno)
__field(xfs_extlen_t, len)
),
TP_fast_assign(
__entry->dev = rtg_mount(rtg)->m_super->s_dev;
__entry->rgno = rtg_rgno(rtg);
__entry->used = rtg_rmap(rtg)->i_used_blocks;
__entry->rgbno = rgbno;
__entry->len = len;
),
TP_printk("dev %d:%d rgno 0x%x used 0x%x rgbno 0x%x len 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->rgno,
__entry->used,
__entry->rgbno,
__entry->len)
);
DECLARE_EVENT_CLASS(xfs_zone_alloc_class,
TP_PROTO(struct xfs_open_zone *oz, xfs_rgblock_t rgbno,
xfs_extlen_t len),
TP_ARGS(oz, rgbno, len),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_rgnumber_t, rgno)
__field(xfs_rgblock_t, used)
__field(xfs_rgblock_t, written)
__field(xfs_rgblock_t, write_pointer)
__field(xfs_rgblock_t, rgbno)
__field(xfs_extlen_t, len)
),
TP_fast_assign(
__entry->dev = rtg_mount(oz->oz_rtg)->m_super->s_dev;
__entry->rgno = rtg_rgno(oz->oz_rtg);
__entry->used = rtg_rmap(oz->oz_rtg)->i_used_blocks;
__entry->written = oz->oz_written;
__entry->write_pointer = oz->oz_write_pointer;
__entry->rgbno = rgbno;
__entry->len = len;
),
TP_printk("dev %d:%d rgno 0x%x used 0x%x written 0x%x wp 0x%x rgbno 0x%x len 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->rgno,
__entry->used,
__entry->written,
__entry->write_pointer,
__entry->rgbno,
__entry->len)
);
#define DEFINE_ZONE_ALLOC_EVENT(name) \
DEFINE_EVENT(xfs_zone_alloc_class, name, \
TP_PROTO(struct xfs_open_zone *oz, xfs_rgblock_t rgbno, \
xfs_extlen_t len), \
TP_ARGS(oz, rgbno, len))
DEFINE_ZONE_ALLOC_EVENT(xfs_zone_record_blocks);
DEFINE_ZONE_ALLOC_EVENT(xfs_zone_alloc_blocks);
TRACE_EVENT(xfs_zone_gc_select_victim,
TP_PROTO(struct xfs_rtgroup *rtg, unsigned int bucket),
TP_ARGS(rtg, bucket),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_rgnumber_t, rgno)
__field(xfs_rgblock_t, used)
__field(unsigned int, bucket)
),
TP_fast_assign(
__entry->dev = rtg_mount(rtg)->m_super->s_dev;
__entry->rgno = rtg_rgno(rtg);
__entry->used = rtg_rmap(rtg)->i_used_blocks;
__entry->bucket = bucket;
),
TP_printk("dev %d:%d rgno 0x%x used 0x%x bucket %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->rgno,
__entry->used,
__entry->bucket)
);
TRACE_EVENT(xfs_zones_mount,
TP_PROTO(struct xfs_mount *mp),
TP_ARGS(mp),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_rgnumber_t, rgcount)
__field(uint32_t, blocks)
__field(unsigned int, max_open_zones)
),
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
__entry->rgcount = mp->m_sb.sb_rgcount;
__entry->blocks = mp->m_groups[XG_TYPE_RTG].blocks;
__entry->max_open_zones = mp->m_max_open_zones;
),
TP_printk("dev %d:%d zoned %u blocks_per_zone %u, max_open %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->rgcount,
__entry->blocks,
__entry->max_open_zones)
);
#endif /* CONFIG_XFS_RT */
TRACE_EVENT(xfs_inodegc_worker,
TP_PROTO(struct xfs_mount *mp, unsigned int shrinker_hits),
TP_ARGS(mp, shrinker_hits),
@ -545,6 +692,10 @@ DEFINE_BUF_EVENT(xfs_buf_iodone_async);
DEFINE_BUF_EVENT(xfs_buf_error_relse);
DEFINE_BUF_EVENT(xfs_buf_drain_buftarg);
DEFINE_BUF_EVENT(xfs_trans_read_buf_shut);
DEFINE_BUF_EVENT(xfs_buf_backing_folio);
DEFINE_BUF_EVENT(xfs_buf_backing_kmem);
DEFINE_BUF_EVENT(xfs_buf_backing_vmalloc);
DEFINE_BUF_EVENT(xfs_buf_backing_fallback);
/* not really buffer traces, but the buf provides useful information */
DEFINE_BUF_EVENT(xfs_btree_corrupt);
@ -1596,6 +1747,7 @@ DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write);
DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_unwritten);
DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_append);
DEFINE_SIMPLE_IO_EVENT(xfs_file_splice_read);
DEFINE_SIMPLE_IO_EVENT(xfs_zoned_map_blocks);
DECLARE_EVENT_CLASS(xfs_itrunc_class,
TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size),
@ -3983,6 +4135,7 @@ DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range);
DEFINE_SIMPLE_IO_EVENT(xfs_reflink_end_cow);
DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_from);
DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_to);
DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_skip);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_cow_range_error);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_end_cow_error);
@ -5606,11 +5759,10 @@ DEFINE_METADIR_EVENT(xfs_metadir_lookup);
/* metadata inode space reservations */
DECLARE_EVENT_CLASS(xfs_metafile_resv_class,
TP_PROTO(struct xfs_inode *ip, xfs_filblks_t len),
TP_ARGS(ip, len),
TP_PROTO(struct xfs_mount *mp, xfs_filblks_t len),
TP_ARGS(mp, len),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_ino_t, ino)
__field(unsigned long long, freeblks)
__field(unsigned long long, reserved)
__field(unsigned long long, asked)
@ -5618,19 +5770,15 @@ DECLARE_EVENT_CLASS(xfs_metafile_resv_class,
__field(unsigned long long, len)
),
TP_fast_assign(
struct xfs_mount *mp = ip->i_mount;
__entry->dev = mp->m_super->s_dev;
__entry->ino = ip->i_ino;
__entry->freeblks = percpu_counter_sum(&mp->m_fdblocks);
__entry->reserved = ip->i_delayed_blks;
__entry->asked = ip->i_meta_resv_asked;
__entry->used = ip->i_nblocks;
__entry->freeblks = xfs_sum_freecounter_raw(mp, XC_FREE_BLOCKS);
__entry->reserved = mp->m_metafile_resv_avail;
__entry->asked = mp->m_metafile_resv_target;
__entry->used = mp->m_metafile_resv_used;
__entry->len = len;
),
TP_printk("dev %d:%d ino 0x%llx freeblks %llu resv %llu ask %llu used %llu len %llu",
TP_printk("dev %d:%d freeblks %llu resv %llu ask %llu used %llu len %llu",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->freeblks,
__entry->reserved,
__entry->asked,
@ -5639,14 +5787,14 @@ DECLARE_EVENT_CLASS(xfs_metafile_resv_class,
)
#define DEFINE_METAFILE_RESV_EVENT(name) \
DEFINE_EVENT(xfs_metafile_resv_class, name, \
TP_PROTO(struct xfs_inode *ip, xfs_filblks_t len), \
TP_ARGS(ip, len))
TP_PROTO(struct xfs_mount *mp, xfs_filblks_t len), \
TP_ARGS(mp, len))
DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_init);
DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_free);
DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_alloc_space);
DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_free_space);
DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_critical);
DEFINE_INODE_ERROR_EVENT(xfs_metafile_resv_init_error);
DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_init_error);
#ifdef CONFIG_XFS_RT
TRACE_EVENT(xfs_growfs_check_rtgeom,
@ -5669,6 +5817,46 @@ TRACE_EVENT(xfs_growfs_check_rtgeom,
);
#endif /* CONFIG_XFS_RT */
TRACE_DEFINE_ENUM(XC_FREE_BLOCKS);
TRACE_DEFINE_ENUM(XC_FREE_RTEXTENTS);
TRACE_DEFINE_ENUM(XC_FREE_RTAVAILABLE);
DECLARE_EVENT_CLASS(xfs_freeblocks_resv_class,
TP_PROTO(struct xfs_mount *mp, enum xfs_free_counter ctr,
uint64_t delta, unsigned long caller_ip),
TP_ARGS(mp, ctr, delta, caller_ip),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(enum xfs_free_counter, ctr)
__field(uint64_t, delta)
__field(uint64_t, avail)
__field(uint64_t, total)
__field(unsigned long, caller_ip)
),
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
__entry->ctr = ctr;
__entry->delta = delta;
__entry->avail = mp->m_free[ctr].res_avail;
__entry->total = mp->m_free[ctr].res_total;
__entry->caller_ip = caller_ip;
),
TP_printk("dev %d:%d ctr %s delta %llu avail %llu total %llu caller %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__print_symbolic(__entry->ctr, XFS_FREECOUNTER_STR),
__entry->delta,
__entry->avail,
__entry->total,
(char *)__entry->caller_ip)
)
#define DEFINE_FREEBLOCKS_RESV_EVENT(name) \
DEFINE_EVENT(xfs_freeblocks_resv_class, name, \
TP_PROTO(struct xfs_mount *mp, enum xfs_free_counter ctr, \
uint64_t delta, unsigned long caller_ip), \
TP_ARGS(mp, ctr, delta, caller_ip))
DEFINE_FREEBLOCKS_RESV_EVENT(xfs_freecounter_reserved);
DEFINE_FREEBLOCKS_RESV_EVENT(xfs_freecounter_enospc);
#endif /* _TRACE_XFS_H */
#undef TRACE_INCLUDE_PATH

fs/xfs/xfs_zone_alloc.c (new file, 1220 lines)
File diff suppressed because it is too large

fs/xfs/xfs_zone_alloc.h (new file, 70 lines)
View file

@ -0,0 +1,70 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _XFS_ZONE_ALLOC_H
#define _XFS_ZONE_ALLOC_H
struct iomap_ioend;
struct xfs_open_zone;
struct xfs_zone_alloc_ctx {
struct xfs_open_zone *open_zone;
xfs_filblks_t reserved_blocks;
};
/*
* Grab any available space, even if it is less than what the caller asked for.
*/
#define XFS_ZR_GREEDY (1U << 0)
/*
* Only grab instantly available space, don't wait or GC.
*/
#define XFS_ZR_NOWAIT (1U << 1)
/*
* Dip into the reserved pool.
*/
#define XFS_ZR_RESERVED (1U << 2)
int xfs_zoned_space_reserve(struct xfs_inode *ip, xfs_filblks_t count_fsb,
unsigned int flags, struct xfs_zone_alloc_ctx *ac);
void xfs_zoned_space_unreserve(struct xfs_inode *ip,
struct xfs_zone_alloc_ctx *ac);
void xfs_zoned_add_available(struct xfs_mount *mp, xfs_filblks_t count_fsb);
void xfs_zone_alloc_and_submit(struct iomap_ioend *ioend,
struct xfs_open_zone **oz);
int xfs_zone_free_blocks(struct xfs_trans *tp, struct xfs_rtgroup *rtg,
xfs_fsblock_t fsbno, xfs_filblks_t len);
int xfs_zoned_end_io(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count,
xfs_daddr_t daddr, struct xfs_open_zone *oz,
xfs_fsblock_t old_startblock);
void xfs_open_zone_put(struct xfs_open_zone *oz);
void xfs_zoned_wake_all(struct xfs_mount *mp);
bool xfs_zone_rgbno_is_valid(struct xfs_rtgroup *rtg, xfs_rgnumber_t rgbno);
void xfs_mark_rtg_boundary(struct iomap_ioend *ioend);
uint64_t xfs_zoned_default_resblks(struct xfs_mount *mp,
enum xfs_free_counter ctr);
void xfs_zoned_show_stats(struct seq_file *m, struct xfs_mount *mp);
#ifdef CONFIG_XFS_RT
int xfs_mount_zones(struct xfs_mount *mp);
void xfs_unmount_zones(struct xfs_mount *mp);
void xfs_zone_gc_start(struct xfs_mount *mp);
void xfs_zone_gc_stop(struct xfs_mount *mp);
#else
static inline int xfs_mount_zones(struct xfs_mount *mp)
{
return -EIO;
}
static inline void xfs_unmount_zones(struct xfs_mount *mp)
{
}
static inline void xfs_zone_gc_start(struct xfs_mount *mp)
{
}
static inline void xfs_zone_gc_stop(struct xfs_mount *mp)
{
}
#endif /* CONFIG_XFS_RT */
#endif /* _XFS_ZONE_ALLOC_H */
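A hypothetical caller of the reservation API declared above might look like the sketch below; demo_zoned_write_prep() and its error handling are illustrative only, not code from this series:
static int
demo_zoned_write_prep(struct xfs_inode *ip, xfs_filblks_t count_fsb)
{
	struct xfs_zone_alloc_ctx	ac = { };
	int				error;

	/*
	 * Ask for count_fsb blocks up front; XFS_ZR_GREEDY permits a
	 * shorter reservation when the full amount is not available.
	 */
	error = xfs_zoned_space_reserve(ip, count_fsb, XFS_ZR_GREEDY, &ac);
	if (error)
		return error;

	/* ... perform the write using ac.reserved_blocks ... */

	/*
	 * Give back whatever is still reserved in the context and drop
	 * the open zone reference, if one was taken.
	 */
	xfs_zoned_space_unreserve(ip, &ac);
	return 0;
}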

fs/xfs/xfs_zone_gc.c (new file, 1165 lines)
File diff suppressed because it is too large

fs/xfs/xfs_zone_info.c (new file, 105 lines)
View file

@ -0,0 +1,105 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2023-2025 Christoph Hellwig.
* Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
*/
#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_rtgroup.h"
#include "xfs_zone_alloc.h"
#include "xfs_zone_priv.h"
static const char xfs_write_hint_shorthand[6][16] = {
"NOT_SET", "NONE", "SHORT", "MEDIUM", "LONG", "EXTREME"};
static inline const char *
xfs_write_hint_to_str(
uint8_t write_hint)
{
if (write_hint > WRITE_LIFE_EXTREME)
return "UNKNOWN";
return xfs_write_hint_shorthand[write_hint];
}
static void
xfs_show_open_zone(
struct seq_file *m,
struct xfs_open_zone *oz)
{
seq_printf(m, "\t zone %d, wp %u, written %u, used %u, hint %s\n",
rtg_rgno(oz->oz_rtg),
oz->oz_write_pointer, oz->oz_written,
rtg_rmap(oz->oz_rtg)->i_used_blocks,
xfs_write_hint_to_str(oz->oz_write_hint));
}
static void
xfs_show_full_zone_used_distribution(
struct seq_file *m,
struct xfs_mount *mp)
{
struct xfs_zone_info *zi = mp->m_zone_info;
unsigned int reclaimable = 0, full, i;
spin_lock(&zi->zi_used_buckets_lock);
for (i = 0; i < XFS_ZONE_USED_BUCKETS; i++) {
unsigned int entries = zi->zi_used_bucket_entries[i];
seq_printf(m, "\t %2u..%2u%%: %u\n",
i * (100 / XFS_ZONE_USED_BUCKETS),
(i + 1) * (100 / XFS_ZONE_USED_BUCKETS) - 1,
entries);
reclaimable += entries;
}
spin_unlock(&zi->zi_used_buckets_lock);
full = mp->m_sb.sb_rgcount;
if (zi->zi_open_gc_zone)
full--;
full -= zi->zi_nr_open_zones;
full -= atomic_read(&zi->zi_nr_free_zones);
full -= reclaimable;
seq_printf(m, "\t 100%%: %u\n", full);
}
void
xfs_zoned_show_stats(
struct seq_file *m,
struct xfs_mount *mp)
{
struct xfs_zone_info *zi = mp->m_zone_info;
struct xfs_open_zone *oz;
seq_puts(m, "\n");
seq_printf(m, "\tuser free RT blocks: %lld\n",
xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS));
seq_printf(m, "\treserved free RT blocks: %lld\n",
mp->m_free[XC_FREE_RTEXTENTS].res_avail);
seq_printf(m, "\tuser available RT blocks: %lld\n",
xfs_sum_freecounter(mp, XC_FREE_RTAVAILABLE));
seq_printf(m, "\treserved available RT blocks: %lld\n",
mp->m_free[XC_FREE_RTAVAILABLE].res_avail);
seq_printf(m, "\tRT reservations required: %d\n",
!list_empty_careful(&zi->zi_reclaim_reservations));
seq_printf(m, "\tRT GC required: %d\n",
xfs_zoned_need_gc(mp));
seq_printf(m, "\tfree zones: %d\n", atomic_read(&zi->zi_nr_free_zones));
seq_puts(m, "\topen zones:\n");
spin_lock(&zi->zi_open_zones_lock);
list_for_each_entry(oz, &zi->zi_open_zones, oz_entry)
xfs_show_open_zone(m, oz);
if (zi->zi_open_gc_zone) {
seq_puts(m, "\topen gc zone:\n");
xfs_show_open_zone(m, zi->zi_open_gc_zone);
}
spin_unlock(&zi->zi_open_zones_lock);
seq_puts(m, "\tused blocks distribution (fully written zones):\n");
xfs_show_full_zone_used_distribution(m, mp);
}
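A worked example with made-up numbers: with sb_rgcount = 64, one open GC zone, zi_nr_open_zones = 3, zi_nr_free_zones = 10 and 18 zones spread over the used buckets, the distribution above reports 64 - 1 - 3 - 10 - 18 = 32 zones in the 100% (fully written) line.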

fs/xfs/xfs_zone_priv.h (new file, 119 lines)
View file

@ -0,0 +1,119 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _XFS_ZONE_PRIV_H
#define _XFS_ZONE_PRIV_H
struct xfs_open_zone {
/*
* Entry in the open zone list and refcount. Protected by
* zi_open_zones_lock in struct xfs_zone_info.
*/
struct list_head oz_entry;
atomic_t oz_ref;
/*
* oz_write_pointer is the write pointer at which space is handed out
for conventional zones, or simply the count of blocks handed out
so far for sequential write required zones, and is protected by
oz_alloc_lock.
*/
spinlock_t oz_alloc_lock;
xfs_rgblock_t oz_write_pointer;
/*
* oz_written is the number of blocks for which we've received a
* write completion. oz_written must always be <= oz_write_pointer
* and is protected by the ILOCK of the rmap inode.
*/
xfs_rgblock_t oz_written;
/*
* Write hint (data temperature) assigned to this zone, or
* WRITE_LIFE_NOT_SET if none was set.
*/
enum rw_hint oz_write_hint;
/*
* Is this open zone used for garbage collection? There can only be a
* single open GC zone, which is pointed to by zi_open_gc_zone in
struct xfs_zone_info. Constant over the lifetime of an open zone.
*/
bool oz_is_gc;
/*
Pointer to the RT group structure for this open zone. Constant over
the lifetime of an open zone.
*/
struct xfs_rtgroup *oz_rtg;
};
/*
* Number of bitmap buckets to track reclaimable zones. There are 10 buckets
so that each 10% of the usable capacity gets its own bucket, and GC only
has to walk the bitmaps of the less used zones if there are any.
*/
#define XFS_ZONE_USED_BUCKETS 10u
struct xfs_zone_info {
/*
* List of pending space reservations:
*/
spinlock_t zi_reservation_lock;
struct list_head zi_reclaim_reservations;
/*
* List and number of open zones:
*/
spinlock_t zi_open_zones_lock;
struct list_head zi_open_zones;
unsigned int zi_nr_open_zones;
/*
* Free zone search cursor and number of free zones:
*/
unsigned long zi_free_zone_cursor;
atomic_t zi_nr_free_zones;
/*
* Wait queue to wait for free zones or open zone resources to become
* available:
*/
wait_queue_head_t zi_zone_wait;
/*
* Pointer to the GC thread, and the current open zone used by GC
* (if any).
*
* zi_open_gc_zone is mostly private to the GC thread, but can be read
* for debugging from other threads, in which case zi_open_zones_lock
* must be taken to access it.
*/
struct task_struct *zi_gc_thread;
struct xfs_open_zone *zi_open_gc_zone;
/*
* List of zones that need a reset:
*/
spinlock_t zi_reset_list_lock;
struct xfs_group *zi_reset_list;
/*
A set of bitmaps to bucket-sort reclaimable zones by used blocks, so that
garbage collection can quickly find the best candidates for reclaim.
*/
spinlock_t zi_used_buckets_lock;
unsigned int zi_used_bucket_entries[XFS_ZONE_USED_BUCKETS];
unsigned long *zi_used_bucket_bitmap[XFS_ZONE_USED_BUCKETS];
};
struct xfs_open_zone *xfs_open_zone(struct xfs_mount *mp,
enum rw_hint write_hint, bool is_gc);
int xfs_zone_gc_reset_sync(struct xfs_rtgroup *rtg);
bool xfs_zoned_need_gc(struct xfs_mount *mp);
int xfs_zone_gc_mount(struct xfs_mount *mp);
void xfs_zone_gc_unmount(struct xfs_mount *mp);
void xfs_zoned_resv_wake_all(struct xfs_mount *mp);
#endif /* _XFS_ZONE_PRIV_H */
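The split between oz_write_pointer and oz_written described above amounts to handing out space eagerly and only advancing the durable count at I/O completion; a minimal sketch with hypothetical demo_* helpers (capacity handling simplified):
/* Space is handed out by advancing the write pointer under oz_alloc_lock. */
static bool
demo_zone_alloc(struct xfs_open_zone *oz, xfs_extlen_t len,
		xfs_rgblock_t capacity, xfs_rgblock_t *rgbno)
{
	bool			granted = false;

	spin_lock(&oz->oz_alloc_lock);
	if (oz->oz_write_pointer + len <= capacity) {
		*rgbno = oz->oz_write_pointer;
		oz->oz_write_pointer += len;
		granted = true;
	}
	spin_unlock(&oz->oz_alloc_lock);
	return granted;
}
/* On write completion, serialized by the ILOCK of the rmap inode. */
static void
demo_zone_record_write(struct xfs_open_zone *oz, xfs_extlen_t len)
{
	oz->oz_written += len;
	ASSERT(oz->oz_written <= oz->oz_write_pointer);
}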

View file

@ -0,0 +1,263 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2023-2025 Christoph Hellwig.
* Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
*/
#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_rtbitmap.h"
#include "xfs_zone_alloc.h"
#include "xfs_zone_priv.h"
#include "xfs_zones.h"
/*
* Note: the zoned allocator does not support a rtextsize > 1, so this code and
the allocator itself use file system blocks interchangeably with realtime
extents, without doing the otherwise required conversions.
*/
/*
* Per-task space reservation.
*
* Tasks that need to wait for GC to free up space allocate one of these
on-stack and add it to the per-mount zi_reclaim_reservations list.
* The GC thread will then wake the tasks in order when space becomes available.
*/
struct xfs_zone_reservation {
struct list_head entry;
struct task_struct *task;
xfs_filblks_t count_fsb;
};
/*
* Calculate the number of reserved blocks.
*
XC_FREE_RTEXTENTS counts the user-available capacity up to which the file
system can be filled, while XC_FREE_RTAVAILABLE counts the blocks instantly
available for writes without waiting for GC.
*
* For XC_FREE_RTAVAILABLE only the smaller reservation required for GC and
* block zeroing is excluded from the user capacity, while XC_FREE_RTEXTENTS
* is further restricted by at least one zone as well as the optional
* persistently reserved blocks. This allows the allocator to run more
* smoothly by not always triggering GC.
*/
uint64_t
xfs_zoned_default_resblks(
struct xfs_mount *mp,
enum xfs_free_counter ctr)
{
switch (ctr) {
case XC_FREE_RTEXTENTS:
return (uint64_t)XFS_RESERVED_ZONES *
mp->m_groups[XG_TYPE_RTG].blocks +
mp->m_sb.sb_rtreserved;
case XC_FREE_RTAVAILABLE:
return (uint64_t)XFS_GC_ZONES *
mp->m_groups[XG_TYPE_RTG].blocks;
default:
ASSERT(0);
return 0;
}
}
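For illustration only (the XFS_RESERVED_ZONES and XFS_GC_ZONES constants live in xfs_zones.h and their values are not part of this hunk): on a device with 65536-block zones, the default reserve pool for XC_FREE_RTAVAILABLE is XFS_GC_ZONES * 65536 blocks, while the pool for XC_FREE_RTEXTENTS is XFS_RESERVED_ZONES * 65536 blocks plus any persistent sb_rtreserved blocks.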
void
xfs_zoned_resv_wake_all(
struct xfs_mount *mp)
{
struct xfs_zone_info *zi = mp->m_zone_info;
struct xfs_zone_reservation *reservation;
spin_lock(&zi->zi_reservation_lock);
list_for_each_entry(reservation, &zi->zi_reclaim_reservations, entry)
wake_up_process(reservation->task);
spin_unlock(&zi->zi_reservation_lock);
}
void
xfs_zoned_add_available(
struct xfs_mount *mp,
xfs_filblks_t count_fsb)
{
struct xfs_zone_info *zi = mp->m_zone_info;
struct xfs_zone_reservation *reservation;
if (list_empty_careful(&zi->zi_reclaim_reservations)) {
xfs_add_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb);
return;
}
spin_lock(&zi->zi_reservation_lock);
xfs_add_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb);
count_fsb = xfs_sum_freecounter(mp, XC_FREE_RTAVAILABLE);
list_for_each_entry(reservation, &zi->zi_reclaim_reservations, entry) {
if (reservation->count_fsb > count_fsb)
break;
wake_up_process(reservation->task);
count_fsb -= reservation->count_fsb;
}
spin_unlock(&zi->zi_reservation_lock);
}
static int
xfs_zoned_space_wait_error(
struct xfs_mount *mp)
{
if (xfs_is_shutdown(mp))
return -EIO;
if (fatal_signal_pending(current))
return -EINTR;
return 0;
}
static int
xfs_zoned_reserve_available(
struct xfs_inode *ip,
xfs_filblks_t count_fsb,
unsigned int flags)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_zone_info *zi = mp->m_zone_info;
struct xfs_zone_reservation reservation = {
.task = current,
.count_fsb = count_fsb,
};
int error;
/*
* If there are no waiters, try to directly grab the available blocks
* from the percpu counter.
*
If the caller wants to dip into the reserved pool, also bypass the
wait list. This relies on the fact that we have a very generously
sized reserved pool that always has enough space. If the reserved
allocations fail, we're in trouble.
*/
if (likely(list_empty_careful(&zi->zi_reclaim_reservations) ||
(flags & XFS_ZR_RESERVED))) {
error = xfs_dec_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb,
flags & XFS_ZR_RESERVED);
if (error != -ENOSPC)
return error;
}
if (flags & XFS_ZR_NOWAIT)
return -EAGAIN;
spin_lock(&zi->zi_reservation_lock);
list_add_tail(&reservation.entry, &zi->zi_reclaim_reservations);
while ((error = xfs_zoned_space_wait_error(mp)) == 0) {
set_current_state(TASK_KILLABLE);
error = xfs_dec_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb,
flags & XFS_ZR_RESERVED);
if (error != -ENOSPC)
break;
/*
* Make sure to start GC if it is not running already. As we
* check the rtavailable count when filling up zones, GC is
* normally already running at this point, but in some setups
* with very few zones we may completely run out of non-
* reserved blocks in between filling zones.
*/
if (!xfs_is_zonegc_running(mp))
wake_up_process(zi->zi_gc_thread);
/*
If there is no reclaimable group left and we aren't still
processing a pending GC request, give up as we're fully out
of space.
*/
if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE) &&
!xfs_is_zonegc_running(mp))
break;
spin_unlock(&zi->zi_reservation_lock);
schedule();
spin_lock(&zi->zi_reservation_lock);
}
list_del(&reservation.entry);
spin_unlock(&zi->zi_reservation_lock);
__set_current_state(TASK_RUNNING);
return error;
}
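The wait loop above follows the usual prepare-to-wait pattern: the reservation sits on zi_reclaim_reservations and the task state is set to TASK_KILLABLE before the counter is re-checked under zi_reservation_lock, so a wake-up from xfs_zoned_add_available() or xfs_zoned_resv_wake_all() that races with the check is not lost; the task simply makes another pass instead of sleeping indefinitely.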
/*
* Implement greedy space allocation for short writes by trying to grab all
* that is left after locking out other threads from trying to do the same.
*
* This isn't exactly optimal and can hopefully be replaced by a proper
* percpu_counter primitive one day.
*/
static int
xfs_zoned_reserve_extents_greedy(
struct xfs_inode *ip,
xfs_filblks_t *count_fsb,
unsigned int flags)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_zone_info *zi = mp->m_zone_info;
s64 len = *count_fsb;
int error = -ENOSPC;
spin_lock(&zi->zi_reservation_lock);
len = min(len, xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS));
if (len > 0) {
*count_fsb = len;
error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, *count_fsb,
flags & XFS_ZR_RESERVED);
}
spin_unlock(&zi->zi_reservation_lock);
return error;
}
int
xfs_zoned_space_reserve(
struct xfs_inode *ip,
xfs_filblks_t count_fsb,
unsigned int flags,
struct xfs_zone_alloc_ctx *ac)
{
struct xfs_mount *mp = ip->i_mount;
int error;
ASSERT(ac->reserved_blocks == 0);
ASSERT(ac->open_zone == NULL);
error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb,
flags & XFS_ZR_RESERVED);
if (error == -ENOSPC && (flags & XFS_ZR_GREEDY) && count_fsb > 1)
error = xfs_zoned_reserve_extents_greedy(ip, &count_fsb, flags);
if (error)
return error;
error = xfs_zoned_reserve_available(ip, count_fsb, flags);
if (error) {
xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb);
return error;
}
ac->reserved_blocks = count_fsb;
return 0;
}
void
xfs_zoned_space_unreserve(
struct xfs_inode *ip,
struct xfs_zone_alloc_ctx *ac)
{
if (ac->reserved_blocks > 0) {
struct xfs_mount *mp = ip->i_mount;
xfs_zoned_add_available(mp, ac->reserved_blocks);
xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, ac->reserved_blocks);
}
if (ac->open_zone)
xfs_open_zone_put(ac->open_zone);
}