mirror of
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-08-05 16:54:27 +00:00

Patch series "Fixes and cleanups to xarray", v5.
This series contains some random fixes and cleanups to xarray. Patches 1-2
are fixes and patches 3-6 are cleanups. More details can be found in the
respective patches.
This patch (of 5):
Similar to the issue fixed in commit cbc0285433
("XArray: Do not return
sibling entries from xa_load()"), we may return sibling entries from
xas_find_marked() as follows:
Thread A: Thread B:
xa_store_range(xa, entry, 6, 7, gfp);
xa_set_mark(xa, 6, mark)
XA_STATE(xas, xa, 6);
xas_find_marked(&xas, 7, mark);
offset = xas_find_chunk(xas, advance, mark);
[offset is 6 which points to a valid entry]
xa_store_range(xa, entry, 4, 7, gfp);
entry = xa_entry(xa, node, 6);
[entry is a sibling of 4]
if (!xa_is_node(entry))
return entry;
Skip sibling entries like xas_find() does, to protect the caller from seeing
a sibling entry from xas_find_marked(); otherwise the caller may use the
sibling entry as a valid entry and crash the kernel.
Besides, the load_race() test is modified to catch the issue described above;
the modified load_race() only passes once this fix is merged.
Here is an example of how this bug could be triggered in tmpfs, which
enables large folios in the mapping:
Let's take a look at the racers involved:
1. How pages could be created and dirtied in a shmem file.
write
ksys_write
vfs_write
new_sync_write
shmem_file_write_iter
generic_perform_write
shmem_write_begin
shmem_get_folio
shmem_allowable_huge_orders
shmem_alloc_and_add_folios
shmem_alloc_folio
__folio_set_locked
shmem_add_to_page_cache
XA_STATE_ORDER(..., index, order)
xas_store()
shmem_write_end
folio_mark_dirty()
2. How dirty pages could be deleted in a shmem file.
ioctl
do_vfs_ioctl
file_ioctl
ioctl_preallocate
vfs_fallocate
shmem_fallocate
shmem_truncate_range
shmem_undo_range
truncate_inode_folio
filemap_remove_folio
page_cache_delete
xas_store(&xas, NULL);
3. How dirty pages could be searched locklessly.
sync_file_range
ksys_sync_file_range
__filemap_fdatawrite_range
filemap_fdatawrite_wbc
do_writepages
writeback_use_writepage
writeback_iter
writeback_get_folio
filemap_get_folios_tag
find_get_entry
folio = xas_find_marked()
folio_try_get(folio)
The kernel will crash as follows:
1.Create 2.Search 3.Delete
/* write page 2,3 */
write
...
shmem_write_begin
XA_STATE_ORDER(xas, i_pages, index = 2, order = 1)
xa_store(&xas, folio)
shmem_write_end
folio_mark_dirty()
/* sync page 2 and page 3 */
sync_file_range
...
find_get_entry
folio = xas_find_marked()
/* offset will be 2 */
offset = xas_find_chunk()
/* delete page 2 and page 3 */
ioctl
...
xas_store(&xas, NULL);
/* write page 0-3 */
write
...
shmem_write_begin
XA_STATE_ORDER(xas, i_pages, index = 0, order = 2)
xa_store(&xas, folio)
shmem_write_end
folio_mark_dirty(folio)
/* get sibling entry from offset 2 */
entry = xa_entry(.., 2)
/* use sibling entry as folio and crash kernel */
folio_try_get(folio)
Link: https://lkml.kernel.org/r/20241213122523.12764-1-shikemeng@huaweicloud.com
Link: https://lkml.kernel.org/r/20241213122523.12764-2-shikemeng@huaweicloud.com
Signed-off-by: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Matthew Wilcox <willy@infradead.org> [English fixes]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
300 lines
6.8 KiB
C
300 lines
6.8 KiB
C
// SPDX-License-Identifier: GPL-2.0-only
|
|
/*
|
|
* multiorder.c: Multi-order radix tree entry testing
|
|
* Copyright (c) 2016 Intel Corporation
|
|
* Author: Ross Zwisler <ross.zwisler@linux.intel.com>
|
|
* Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
|
|
*/
|
|
#include <linux/radix-tree.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/errno.h>
|
|
#include <pthread.h>
|
|
|
|
#include "test.h"
|
|
|
|
static int item_insert_order(struct xarray *xa, unsigned long index,
|
|
unsigned order)
|
|
{
|
|
XA_STATE_ORDER(xas, xa, index, order);
|
|
struct item *item = item_create(index, order);
|
|
|
|
do {
|
|
xas_lock(&xas);
|
|
xas_store(&xas, item);
|
|
xas_unlock(&xas);
|
|
} while (xas_nomem(&xas, GFP_KERNEL));
|
|
|
|
if (!xas_error(&xas))
|
|
return 0;
|
|
|
|
free(item);
|
|
return xas_error(&xas);
|
|
}
|
|
|
|
void multiorder_iteration(struct xarray *xa)
|
|
{
|
|
XA_STATE(xas, xa, 0);
|
|
struct item *item;
|
|
int i, j, err;
|
|
|
|
#define NUM_ENTRIES 11
|
|
int index[NUM_ENTRIES] = {0, 2, 4, 8, 16, 32, 34, 36, 64, 72, 128};
|
|
int order[NUM_ENTRIES] = {1, 1, 2, 3, 4, 1, 0, 1, 3, 0, 7};
|
|
|
|
printv(1, "Multiorder iteration test\n");
|
|
|
|
for (i = 0; i < NUM_ENTRIES; i++) {
|
|
err = item_insert_order(xa, index[i], order[i]);
|
|
assert(!err);
|
|
}
|
|
|
|
for (j = 0; j < 256; j++) {
|
|
for (i = 0; i < NUM_ENTRIES; i++)
|
|
if (j <= (index[i] | ((1 << order[i]) - 1)))
|
|
break;
|
|
|
|
xas_set(&xas, j);
|
|
xas_for_each(&xas, item, ULONG_MAX) {
|
|
int height = order[i] / XA_CHUNK_SHIFT;
|
|
int shift = height * XA_CHUNK_SHIFT;
|
|
unsigned long mask = (1UL << order[i]) - 1;
|
|
|
|
assert((xas.xa_index | mask) == (index[i] | mask));
|
|
assert(xas.xa_node->shift == shift);
|
|
assert(!radix_tree_is_internal_node(item));
|
|
assert((item->index | mask) == (index[i] | mask));
|
|
assert(item->order == order[i]);
|
|
i++;
|
|
}
|
|
}
|
|
|
|
item_kill_tree(xa);
|
|
}
|
|
|
|
void multiorder_tagged_iteration(struct xarray *xa)
|
|
{
|
|
XA_STATE(xas, xa, 0);
|
|
struct item *item;
|
|
int i, j;
|
|
|
|
#define MT_NUM_ENTRIES 9
|
|
int index[MT_NUM_ENTRIES] = {0, 2, 4, 16, 32, 40, 64, 72, 128};
|
|
int order[MT_NUM_ENTRIES] = {1, 0, 2, 4, 3, 1, 3, 0, 7};
|
|
|
|
#define TAG_ENTRIES 7
|
|
int tag_index[TAG_ENTRIES] = {0, 4, 16, 40, 64, 72, 128};
|
|
|
|
printv(1, "Multiorder tagged iteration test\n");
|
|
|
|
for (i = 0; i < MT_NUM_ENTRIES; i++)
|
|
assert(!item_insert_order(xa, index[i], order[i]));
|
|
|
|
assert(!xa_marked(xa, XA_MARK_1));
|
|
|
|
for (i = 0; i < TAG_ENTRIES; i++)
|
|
xa_set_mark(xa, tag_index[i], XA_MARK_1);
|
|
|
|
for (j = 0; j < 256; j++) {
|
|
int k;
|
|
|
|
for (i = 0; i < TAG_ENTRIES; i++) {
|
|
for (k = i; index[k] < tag_index[i]; k++)
|
|
;
|
|
if (j <= (index[k] | ((1 << order[k]) - 1)))
|
|
break;
|
|
}
|
|
|
|
xas_set(&xas, j);
|
|
xas_for_each_marked(&xas, item, ULONG_MAX, XA_MARK_1) {
|
|
unsigned long mask;
|
|
for (k = i; index[k] < tag_index[i]; k++)
|
|
;
|
|
mask = (1UL << order[k]) - 1;
|
|
|
|
assert((xas.xa_index | mask) == (tag_index[i] | mask));
|
|
assert(!xa_is_internal(item));
|
|
assert((item->index | mask) == (tag_index[i] | mask));
|
|
assert(item->order == order[k]);
|
|
i++;
|
|
}
|
|
}
|
|
|
|
assert(tag_tagged_items(xa, 0, ULONG_MAX, TAG_ENTRIES, XA_MARK_1,
|
|
XA_MARK_2) == TAG_ENTRIES);
|
|
|
|
for (j = 0; j < 256; j++) {
|
|
int mask, k;
|
|
|
|
for (i = 0; i < TAG_ENTRIES; i++) {
|
|
for (k = i; index[k] < tag_index[i]; k++)
|
|
;
|
|
if (j <= (index[k] | ((1 << order[k]) - 1)))
|
|
break;
|
|
}
|
|
|
|
xas_set(&xas, j);
|
|
xas_for_each_marked(&xas, item, ULONG_MAX, XA_MARK_2) {
|
|
for (k = i; index[k] < tag_index[i]; k++)
|
|
;
|
|
mask = (1 << order[k]) - 1;
|
|
|
|
assert((xas.xa_index | mask) == (tag_index[i] | mask));
|
|
assert(!xa_is_internal(item));
|
|
assert((item->index | mask) == (tag_index[i] | mask));
|
|
assert(item->order == order[k]);
|
|
i++;
|
|
}
|
|
}
|
|
|
|
assert(tag_tagged_items(xa, 1, ULONG_MAX, MT_NUM_ENTRIES * 2, XA_MARK_1,
|
|
XA_MARK_0) == TAG_ENTRIES);
|
|
i = 0;
|
|
xas_set(&xas, 0);
|
|
xas_for_each_marked(&xas, item, ULONG_MAX, XA_MARK_0) {
|
|
assert(xas.xa_index == tag_index[i]);
|
|
i++;
|
|
}
|
|
assert(i == TAG_ENTRIES);
|
|
|
|
item_kill_tree(xa);
|
|
}
|
|
|
|
/*
 * Stop flag for the race tests in this file: cleared by
 * multiorder_iteration_race() and load_race() before they start their
 * threads, set to true by creator_func() and load_creator() when they are
 * done, and polled by iterator_func() and load_worker() as their loop
 * condition.
 * NOTE(review): this is a plain bool read and written from several threads.
 */
bool stop_iteration;
|
|
|
|
static void *creator_func(void *ptr)
|
|
{
|
|
/* 'order' is set up to ensure we have sibling entries */
|
|
unsigned int order = RADIX_TREE_MAP_SHIFT - 1;
|
|
struct radix_tree_root *tree = ptr;
|
|
int i;
|
|
|
|
for (i = 0; i < 10000; i++) {
|
|
item_insert_order(tree, 0, order);
|
|
item_delete_rcu(tree, 0);
|
|
}
|
|
|
|
stop_iteration = true;
|
|
return NULL;
|
|
}
|
|
|
|
static void *iterator_func(void *ptr)
|
|
{
|
|
XA_STATE(xas, ptr, 0);
|
|
struct item *item;
|
|
|
|
while (!stop_iteration) {
|
|
rcu_read_lock();
|
|
xas_for_each(&xas, item, ULONG_MAX) {
|
|
if (xas_retry(&xas, item))
|
|
continue;
|
|
|
|
item_sanity(item, xas.xa_index);
|
|
}
|
|
rcu_read_unlock();
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
static void multiorder_iteration_race(struct xarray *xa)
|
|
{
|
|
const int num_threads = sysconf(_SC_NPROCESSORS_ONLN);
|
|
pthread_t worker_thread[num_threads];
|
|
int i;
|
|
|
|
stop_iteration = false;
|
|
pthread_create(&worker_thread[0], NULL, &creator_func, xa);
|
|
for (i = 1; i < num_threads; i++)
|
|
pthread_create(&worker_thread[i], NULL, &iterator_func, xa);
|
|
|
|
for (i = 0; i < num_threads; i++)
|
|
pthread_join(worker_thread[i], NULL);
|
|
|
|
item_kill_tree(xa);
|
|
}
|
|
|
|
static void *load_creator(void *ptr)
|
|
{
|
|
/* 'order' is set up to ensure we have sibling entries */
|
|
unsigned int order;
|
|
struct radix_tree_root *tree = ptr;
|
|
int i;
|
|
|
|
rcu_register_thread();
|
|
item_insert_order(tree, 3 << RADIX_TREE_MAP_SHIFT, 0);
|
|
item_insert_order(tree, 2 << RADIX_TREE_MAP_SHIFT, 0);
|
|
for (i = 0; i < 10000; i++) {
|
|
for (order = 1; order < RADIX_TREE_MAP_SHIFT; order++) {
|
|
unsigned long index = (3 << RADIX_TREE_MAP_SHIFT) -
|
|
(1 << order);
|
|
item_insert_order(tree, index, order);
|
|
xa_set_mark(tree, index, XA_MARK_1);
|
|
item_delete_rcu(tree, index);
|
|
}
|
|
}
|
|
rcu_unregister_thread();
|
|
|
|
stop_iteration = true;
|
|
return NULL;
|
|
}
|
|
|
|
static void *load_worker(void *ptr)
|
|
{
|
|
unsigned long index = (3 << RADIX_TREE_MAP_SHIFT) - 1;
|
|
|
|
rcu_register_thread();
|
|
while (!stop_iteration) {
|
|
unsigned long find_index = (2 << RADIX_TREE_MAP_SHIFT) + 1;
|
|
struct item *item = xa_load(ptr, index);
|
|
assert(!xa_is_internal(item));
|
|
item = xa_find(ptr, &find_index, index, XA_MARK_1);
|
|
assert(!xa_is_internal(item));
|
|
}
|
|
rcu_unregister_thread();
|
|
|
|
return NULL;
|
|
}
|
|
|
|
static void load_race(struct xarray *xa)
|
|
{
|
|
const int num_threads = sysconf(_SC_NPROCESSORS_ONLN) * 4;
|
|
pthread_t worker_thread[num_threads];
|
|
int i;
|
|
|
|
stop_iteration = false;
|
|
pthread_create(&worker_thread[0], NULL, &load_creator, xa);
|
|
for (i = 1; i < num_threads; i++)
|
|
pthread_create(&worker_thread[i], NULL, &load_worker, xa);
|
|
|
|
for (i = 0; i < num_threads; i++)
|
|
pthread_join(worker_thread[i], NULL);
|
|
|
|
item_kill_tree(xa);
|
|
}
|
|
|
|
/* The XArray that multiorder_checks() passes to every test in this file. */
static DEFINE_XARRAY(array);
|
|
|
|
void multiorder_checks(void)
|
|
{
|
|
multiorder_iteration(&array);
|
|
multiorder_tagged_iteration(&array);
|
|
multiorder_iteration_race(&array);
|
|
load_race(&array);
|
|
|
|
radix_tree_cpu_dead(0);
|
|
}
|
|
|
|
/*
 * Weak stand-alone entry point.
 *
 * Parses the command line with the option string "ls:v"; each 'v' raises
 * test_verbose by one and every other option is accepted and ignored.  The
 * calling thread is registered with RCU around radix_tree_init() and
 * multiorder_checks().  Always returns 0.
 */
int __weak main(int argc, char **argv)
{
	int opt;

	for (;;) {
		opt = getopt(argc, argv, "ls:v");
		if (opt == -1)
			break;

		switch (opt) {
		case 'v':
			test_verbose++;
			break;
		default:
			/* other options are parsed but have no effect here */
			break;
		}
	}

	rcu_register_thread();
	radix_tree_init();
	multiorder_checks();
	rcu_unregister_thread();

	return 0;
}
|