// SPDX-License-Identifier: GPL-2.0-only
/*
 * zpool memory storage api
 *
 * Copyright (C) 2014 Dan Streetman
 *
 * This is a common frontend for memory storage pool implementations.
 * Typically, this is used to store compressed memory.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/list.h>
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/zpool.h>

struct zpool {
	struct zpool_driver *driver;
	void *pool;
};

static LIST_HEAD(drivers_head);
static DEFINE_SPINLOCK(drivers_lock);

/**
 * zpool_register_driver() - register a zpool implementation.
 * @driver: driver to register
 */
void zpool_register_driver(struct zpool_driver *driver)
{
	spin_lock(&drivers_lock);
	atomic_set(&driver->refcount, 0);
	list_add(&driver->list, &drivers_head);
	spin_unlock(&drivers_lock);
}
EXPORT_SYMBOL(zpool_register_driver);

/**
 * zpool_unregister_driver() - unregister a zpool implementation.
 * @driver: driver to unregister.
 *
 * Module usage counting is used to prevent using a driver
 * while/after unloading, so if this is called from a module's
 * exit function, it should never fail; if called from anywhere
 * other than the module exit function, and this returns
 * failure, the driver is in use and must remain available.
 */
int zpool_unregister_driver(struct zpool_driver *driver)
{
	int ret = 0, refcount;

	spin_lock(&drivers_lock);
	refcount = atomic_read(&driver->refcount);
	WARN_ON(refcount < 0);
	if (refcount > 0)
		ret = -EBUSY;
	else
		list_del(&driver->list);
	spin_unlock(&drivers_lock);

	return ret;
}
EXPORT_SYMBOL(zpool_unregister_driver);
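
/*
 * Example (illustrative sketch, not part of this file): a minimal backend
 * module fills in a zpool_driver and registers it on load.  The "mypool_*"
 * names are hypothetical and the callback set shown is abbreviated; a real
 * driver must provide the full set of operations declared in
 * include/linux/zpool.h, and typically also declares
 * MODULE_ALIAS("zpool-mypool") so request_module("zpool-%s") can load it.
 *
 *	static struct zpool_driver mypool_driver = {
 *		.type    = "mypool",
 *		.owner   = THIS_MODULE,
 *		.create  = mypool_create,
 *		.destroy = mypool_destroy,
 *		// malloc/free/read/write callbacks omitted in this sketch
 *	};
 *
 *	static int __init mypool_init(void)
 *	{
 *		zpool_register_driver(&mypool_driver);
 *		return 0;
 *	}
 *
 *	static void __exit mypool_exit(void)
 *	{
 *		zpool_unregister_driver(&mypool_driver);
 *	}
 */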

/* this assumes @type is null-terminated. */
static struct zpool_driver *zpool_get_driver(const char *type)
{
	struct zpool_driver *driver;

	spin_lock(&drivers_lock);
	list_for_each_entry(driver, &drivers_head, list) {
		if (!strcmp(driver->type, type)) {
			bool got = try_module_get(driver->owner);

			if (got)
				atomic_inc(&driver->refcount);
			spin_unlock(&drivers_lock);
			return got ? driver : NULL;
		}
	}

	spin_unlock(&drivers_lock);
	return NULL;
}

static void zpool_put_driver(struct zpool_driver *driver)
{
	atomic_dec(&driver->refcount);
	module_put(driver->owner);
}

/**
 * zpool_has_pool() - Check if the pool driver is available
 * @type: The type of the zpool to check (e.g. zsmalloc)
 *
 * This checks if the @type pool driver is available. This will try to load
 * the requested module, if needed, but there is no guarantee the module will
 * still be loaded and available immediately after calling. If this returns
 * true, the caller should assume the pool is available, but must be prepared
 * to handle zpool_create_pool() returning failure. However, if this
 * returns false, the caller should assume the requested pool type is not
 * available; either the requested pool type module does not exist, or could
 * not be loaded, and calling zpool_create_pool() with the pool type will
 * fail.
 *
 * The @type string must be null-terminated.
 *
 * Returns: true if @type pool is available, false if not
 */
bool zpool_has_pool(char *type)
{
	struct zpool_driver *driver = zpool_get_driver(type);

	if (!driver) {
		request_module("zpool-%s", type);
		driver = zpool_get_driver(type);
	}

	if (!driver)
		return false;

	zpool_put_driver(driver);
	return true;
}
EXPORT_SYMBOL(zpool_has_pool);
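
/*
 * Example (illustrative, not part of this file): a caller such as zswap can
 * probe for a backend before committing to it, e.g. when a pool type is
 * selected at runtime.  The "zsmalloc" type and the GFP flags below are just
 * one plausible combination.
 *
 *	if (!zpool_has_pool("zsmalloc"))
 *		return -ENOENT;
 *	pool = zpool_create_pool("zsmalloc", "zswap", GFP_KERNEL);
 *	if (!pool)
 *		return -ENOMEM;	// has_pool is only a hint; creation can still fail
 */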

/**
 * zpool_create_pool() - Create a new zpool
 * @type: The type of the zpool to create (e.g. zsmalloc)
 * @name: The name of the zpool (e.g. zram0, zswap)
 * @gfp: The GFP flags to use when allocating the pool.
 *
 * This creates a new zpool of the specified type. The gfp flags will be
 * used when allocating memory, if the implementation supports it.
 *
 * Implementations must guarantee this to be thread-safe.
 *
 * The @type and @name strings must be null-terminated.
 *
 * Returns: New zpool on success, NULL on failure.
 */
struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp)
{
	struct zpool_driver *driver;
	struct zpool *zpool;

	pr_debug("creating pool type %s\n", type);

	driver = zpool_get_driver(type);

	if (!driver) {
		request_module("zpool-%s", type);
		driver = zpool_get_driver(type);
	}

	if (!driver) {
		pr_err("no driver for type %s\n", type);
		return NULL;
	}

	zpool = kmalloc(sizeof(*zpool), gfp);
	if (!zpool) {
		pr_err("couldn't create zpool - out of memory\n");
		zpool_put_driver(driver);
		return NULL;
	}

	zpool->driver = driver;
	zpool->pool = driver->create(name, gfp);

	if (!zpool->pool) {
		pr_err("couldn't create %s pool\n", type);
		zpool_put_driver(driver);
		kfree(zpool);
		return NULL;
	}

	pr_debug("created pool type %s\n", type);

	return zpool;
}

/**
 * zpool_destroy_pool() - Destroy a zpool
 * @zpool: The zpool to destroy.
 *
 * Implementations must guarantee this to be thread-safe,
 * however only when destroying different pools. The same
 * pool should only be destroyed once, and should not be used
 * after it is destroyed.
 *
 * This destroys an existing zpool. The zpool should not be in use.
 */
void zpool_destroy_pool(struct zpool *zpool)
{
	pr_debug("destroying pool type %s\n", zpool->driver->type);

	zpool->driver->destroy(zpool->pool);
	zpool_put_driver(zpool->driver);
	kfree(zpool);
}
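
/*
 * Example (illustrative sketch, not part of this file): a pool user pairs
 * zpool_create_pool() with zpool_destroy_pool() on its own teardown path.
 * "struct my_cache" and the function name are hypothetical.
 *
 *	struct my_cache {
 *		struct zpool *zpool;
 *	};
 *
 *	static void my_cache_release(struct my_cache *c)
 *	{
 *		// all handles must already be freed; the pool must be idle
 *		zpool_destroy_pool(c->zpool);
 *		c->zpool = NULL;
 *	}
 */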

/**
 * zpool_get_type() - Get the type of the zpool
 * @zpool: The zpool to check
 *
 * This returns the type of the pool.
 *
 * Implementations must guarantee this to be thread-safe.
 *
 * Returns: The type of zpool.
 */
const char *zpool_get_type(struct zpool *zpool)
{
	return zpool->driver->type;
}

/**
 * zpool_malloc() - Allocate memory
 * @zpool: The zpool to allocate from.
 * @size: The amount of memory to allocate.
 * @gfp: The GFP flags to use when allocating memory.
 * @handle: Pointer to the handle to set
 * @nid: The preferred node id.
 *
 * This allocates the requested amount of memory from the pool.
 * The gfp flags will be used when allocating memory, if the
 * implementation supports it. The provided @handle will be
 * set to the allocated object handle. The allocation will
 * prefer the NUMA node specified by @nid.
 *
 * Implementations must guarantee this to be thread-safe.
 *
 * Returns: 0 on success, negative value on error.
 */
int zpool_malloc(struct zpool *zpool, size_t size, gfp_t gfp,
		 unsigned long *handle, const int nid)
{
	return zpool->driver->malloc(zpool->pool, size, gfp, handle, nid);
}

/**
 * zpool_free() - Free previously allocated memory
 * @zpool: The zpool that allocated the memory.
 * @handle: The handle to the memory to free.
 *
 * This frees previously allocated memory. This does not guarantee
 * that the pool will actually free memory, only that the memory
 * in the pool will become available for use by the pool.
 *
 * Implementations must guarantee this to be thread-safe,
 * however only when freeing different handles. The same
 * handle should only be freed once, and should not be used
 * after freeing.
 */
void zpool_free(struct zpool *zpool, unsigned long handle)
{
	zpool->driver->free(zpool->pool, handle);
}
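
/*
 * Example (illustrative, not part of this file): a typical allocation
 * round-trip.  The GFP flags and node hint below are arbitrary; a real
 * caller such as zswap passes the compressed length and the node of the
 * page being compressed.
 *
 *	unsigned long handle;
 *	int err;
 *
 *	err = zpool_malloc(zpool, compressed_len, GFP_NOWAIT | __GFP_NOWARN,
 *			   &handle, page_to_nid(page));
 *	if (err)
 *		return err;
 *	...
 *	zpool_free(zpool, handle);	// handle must not be used afterwards
 */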

/**
 * zpool_obj_read_begin() - Start reading from a previously allocated handle.
 * @zpool: The zpool that the handle was allocated from
 * @handle: The handle to read from
 * @local_copy: A local buffer to use if needed.
 *
 * This starts a read operation of a previously allocated handle. The passed
 * @local_copy buffer may be used, if needed, by copying the handle memory
 * into it. zpool_obj_read_end() MUST be called after the read is completed
 * to undo any actions taken (e.g. release locks).
 *
 * Returns: A pointer to the handle memory to be read. If @local_copy is
 * used, the returned pointer is @local_copy.
 */
void *zpool_obj_read_begin(struct zpool *zpool, unsigned long handle,
			   void *local_copy)
{
	return zpool->driver->obj_read_begin(zpool->pool, handle, local_copy);
}

/**
 * zpool_obj_read_end() - Finish reading from a previously allocated handle.
 * @zpool: The zpool that the handle was allocated from
 * @handle: The handle to read from
 * @handle_mem: The pointer returned by zpool_obj_read_begin()
 *
 * Finishes a read operation previously started by zpool_obj_read_begin().
 */
void zpool_obj_read_end(struct zpool *zpool, unsigned long handle,
			void *handle_mem)
{
	zpool->driver->obj_read_end(zpool->pool, handle, handle_mem);
}

/**
 * zpool_obj_write() - Write to a previously allocated handle.
 * @zpool: The zpool that the handle was allocated from
 * @handle: The handle to write to
 * @handle_mem: The memory to copy into the handle.
 * @mem_len: The length of memory to be written.
 */
void zpool_obj_write(struct zpool *zpool, unsigned long handle,
		     void *handle_mem, size_t mem_len)
{
	zpool->driver->obj_write(zpool->pool, handle, handle_mem, mem_len);
}
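
/*
 * Example (illustrative, not part of this file): storing and later reading
 * back an object.  Here local_copy is assumed to be a preallocated,
 * page-sized scratch buffer owned by the caller.
 *
 *	void *mem;
 *
 *	zpool_obj_write(zpool, handle, src_buf, src_len);
 *	...
 *	mem = zpool_obj_read_begin(zpool, handle, local_copy);
 *	// mem may point into the pool itself or at local_copy; treat as read-only
 *	memcpy(dst_buf, mem, src_len);
 *	zpool_obj_read_end(zpool, handle, mem);
 */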

/**
 * zpool_get_total_pages() - The total size of the pool
 * @zpool: The zpool to check
 *
 * This returns the total size in pages of the pool.
 *
 * Returns: Total size of the zpool in pages.
 */
u64 zpool_get_total_pages(struct zpool *zpool)
{
	return zpool->driver->total_pages(zpool->pool);
}
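
/*
 * Example (illustrative, not part of this file): converting the pool size to
 * bytes, e.g. for usage accounting by a caller such as zswap.
 *
 *	u64 bytes = zpool_get_total_pages(zpool) << PAGE_SHIFT;
 */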

MODULE_AUTHOR("Dan Streetman <ddstreet@ieee.org>");
MODULE_DESCRIPTION("Common API for compressed memory storage");