2019-05-29 07:18:02 -07:00
|
|
|
// SPDX-License-Identifier: GPL-2.0-only
|
2016-05-20 17:02:14 -07:00
|
|
|
/*
|
|
|
|
* multiorder.c: Multi-order radix tree entry testing
|
|
|
|
* Copyright (c) 2016 Intel Corporation
|
|
|
|
* Author: Ross Zwisler <ross.zwisler@linux.intel.com>
|
|
|
|
* Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
|
|
|
|
*/
|
|
|
|
#include <linux/radix-tree.h>
|
|
|
|
#include <linux/slab.h>
|
|
|
|
#include <linux/errno.h>
|
radix tree test suite: multi-order iteration race
Add a test which shows a race in the multi-order iteration code. This
test reliably hits the race in under a second on my machine, and is the
result of a real bug report against kernel a production v4.15 based
kernel (4.15.6-300.fc27.x86_64). With a real kernel this issue is hit
when using order 9 PMD DAX radix tree entries.
The race has to do with how we tear down multi-order sibling entries
when we are removing an item from the tree. Remember that an order 2
entry looks like this:
struct radix_tree_node.slots[] = [entry][sibling][sibling][sibling]
where 'entry' is in some slot in the struct radix_tree_node, and the
three slots following 'entry' contain sibling pointers which point back
to 'entry.'
When we delete 'entry' from the tree, we call :
radix_tree_delete()
radix_tree_delete_item()
__radix_tree_delete()
replace_slot()
replace_slot() first removes the siblings in order from the first to the
last, then at then replaces 'entry' with NULL. This means that for a
brief period of time we end up with one or more of the siblings removed,
so:
struct radix_tree_node.slots[] = [entry][NULL][sibling][sibling]
This causes an issue if you have a reader iterating over the slots in
the tree via radix_tree_for_each_slot() while only under
rcu_read_lock()/rcu_read_unlock() protection. This is a common case in
mm/filemap.c.
The issue is that when __radix_tree_next_slot() => skip_siblings() tries
to skip over the sibling entries in the slots, it currently does so with
an exact match on the slot directly preceding our current slot.
Normally this works:
V preceding slot
struct radix_tree_node.slots[] = [entry][sibling][sibling][sibling]
^ current slot
This lets you find the first sibling, and you skip them all in order.
But in the case where one of the siblings is NULL, that slot is skipped
and then our sibling detection is interrupted:
V preceding slot
struct radix_tree_node.slots[] = [entry][NULL][sibling][sibling]
^ current slot
This means that the sibling pointers aren't recognized since they point
all the way back to 'entry', so we think that they are normal internal
radix tree pointers. This causes us to think we need to walk down to a
struct radix_tree_node starting at the address of 'entry'.
In a real running kernel this will crash the thread with a GP fault when
you try and dereference the slots in your broken node starting at
'entry'.
In the radix tree test suite this will be caught by the address
sanitizer:
==27063==ERROR: AddressSanitizer: heap-buffer-overflow on address
0x60c0008ae400 at pc 0x00000040ce4f bp 0x7fa89b8fcad0 sp 0x7fa89b8fcac0
READ of size 8 at 0x60c0008ae400 thread T3
#0 0x40ce4e in __radix_tree_next_slot /home/rzwisler/project/linux/tools/testing/radix-tree/radix-tree.c:1660
#1 0x4022cc in radix_tree_next_slot linux/../../../../include/linux/radix-tree.h:567
#2 0x4022cc in iterator_func /home/rzwisler/project/linux/tools/testing/radix-tree/multiorder.c:655
#3 0x7fa8a088d50a in start_thread (/lib64/libpthread.so.0+0x750a)
#4 0x7fa8a03bd16e in clone (/lib64/libc.so.6+0xf516e)
Link: http://lkml.kernel.org/r/20180503192430.7582-5-ross.zwisler@linux.intel.com
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: CR, Sapthagirish <sapthagirish.cr@intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-05-18 16:09:01 -07:00
|
|
|
#include <pthread.h>
|
2016-05-20 17:02:14 -07:00
|
|
|
|
|
|
|
#include "test.h"
|
|
|
|
|
2018-09-12 23:29:32 -04:00
|
|
|
static int item_insert_order(struct xarray *xa, unsigned long index,
|
|
|
|
unsigned order)
|
|
|
|
{
|
|
|
|
XA_STATE_ORDER(xas, xa, index, order);
|
|
|
|
struct item *item = item_create(index, order);
|
|
|
|
|
|
|
|
do {
|
|
|
|
xas_lock(&xas);
|
|
|
|
xas_store(&xas, item);
|
|
|
|
xas_unlock(&xas);
|
|
|
|
} while (xas_nomem(&xas, GFP_KERNEL));
|
|
|
|
|
|
|
|
if (!xas_error(&xas))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
free(item);
|
|
|
|
return xas_error(&xas);
|
|
|
|
}
|
|
|
|
|
2018-09-22 16:12:41 -04:00
|
|
|
void multiorder_iteration(struct xarray *xa)
|
2016-05-20 17:02:29 -07:00
|
|
|
{
|
2018-09-22 16:12:41 -04:00
|
|
|
XA_STATE(xas, xa, 0);
|
|
|
|
struct item *item;
|
2016-05-20 17:03:36 -07:00
|
|
|
int i, j, err;
|
2016-05-20 17:02:29 -07:00
|
|
|
|
|
|
|
#define NUM_ENTRIES 11
|
|
|
|
int index[NUM_ENTRIES] = {0, 2, 4, 8, 16, 32, 34, 36, 64, 72, 128};
|
|
|
|
int order[NUM_ENTRIES] = {1, 1, 2, 3, 4, 1, 0, 1, 3, 0, 7};
|
|
|
|
|
2018-09-22 16:12:41 -04:00
|
|
|
printv(1, "Multiorder iteration test\n");
|
|
|
|
|
2016-05-20 17:02:29 -07:00
|
|
|
for (i = 0; i < NUM_ENTRIES; i++) {
|
2018-09-22 16:12:41 -04:00
|
|
|
err = item_insert_order(xa, index[i], order[i]);
|
2016-05-20 17:02:29 -07:00
|
|
|
assert(!err);
|
|
|
|
}
|
|
|
|
|
2016-05-20 17:03:36 -07:00
|
|
|
for (j = 0; j < 256; j++) {
|
|
|
|
for (i = 0; i < NUM_ENTRIES; i++)
|
|
|
|
if (j <= (index[i] | ((1 << order[i]) - 1)))
|
|
|
|
break;
|
|
|
|
|
2018-09-22 16:12:41 -04:00
|
|
|
xas_set(&xas, j);
|
|
|
|
xas_for_each(&xas, item, ULONG_MAX) {
|
|
|
|
int height = order[i] / XA_CHUNK_SHIFT;
|
|
|
|
int shift = height * XA_CHUNK_SHIFT;
|
2016-12-14 15:08:49 -08:00
|
|
|
unsigned long mask = (1UL << order[i]) - 1;
|
2016-05-20 17:03:36 -07:00
|
|
|
|
2018-09-22 16:12:41 -04:00
|
|
|
assert((xas.xa_index | mask) == (index[i] | mask));
|
|
|
|
assert(xas.xa_node->shift == shift);
|
2016-12-14 15:08:49 -08:00
|
|
|
assert(!radix_tree_is_internal_node(item));
|
|
|
|
assert((item->index | mask) == (index[i] | mask));
|
|
|
|
assert(item->order == order[i]);
|
2016-05-20 17:03:36 -07:00
|
|
|
i++;
|
|
|
|
}
|
2016-05-20 17:02:29 -07:00
|
|
|
}
|
|
|
|
|
2018-09-22 16:12:41 -04:00
|
|
|
item_kill_tree(xa);
|
2016-05-20 17:02:29 -07:00
|
|
|
}
|
|
|
|
|
2018-09-22 16:12:41 -04:00
|
|
|
void multiorder_tagged_iteration(struct xarray *xa)
|
2016-05-20 17:02:29 -07:00
|
|
|
{
|
2018-09-22 16:12:41 -04:00
|
|
|
XA_STATE(xas, xa, 0);
|
|
|
|
struct item *item;
|
2016-05-20 17:03:36 -07:00
|
|
|
int i, j;
|
2016-05-20 17:02:29 -07:00
|
|
|
|
|
|
|
#define MT_NUM_ENTRIES 9
|
|
|
|
int index[MT_NUM_ENTRIES] = {0, 2, 4, 16, 32, 40, 64, 72, 128};
|
|
|
|
int order[MT_NUM_ENTRIES] = {1, 0, 2, 4, 3, 1, 3, 0, 7};
|
|
|
|
|
|
|
|
#define TAG_ENTRIES 7
|
|
|
|
int tag_index[TAG_ENTRIES] = {0, 4, 16, 40, 64, 72, 128};
|
|
|
|
|
2018-09-22 16:12:41 -04:00
|
|
|
printv(1, "Multiorder tagged iteration test\n");
|
|
|
|
|
2016-05-20 17:02:29 -07:00
|
|
|
for (i = 0; i < MT_NUM_ENTRIES; i++)
|
2018-09-22 16:12:41 -04:00
|
|
|
assert(!item_insert_order(xa, index[i], order[i]));
|
2016-05-20 17:02:29 -07:00
|
|
|
|
2018-09-22 16:12:41 -04:00
|
|
|
assert(!xa_marked(xa, XA_MARK_1));
|
2016-05-20 17:02:29 -07:00
|
|
|
|
|
|
|
for (i = 0; i < TAG_ENTRIES; i++)
|
2018-09-22 16:12:41 -04:00
|
|
|
xa_set_mark(xa, tag_index[i], XA_MARK_1);
|
2016-05-20 17:02:29 -07:00
|
|
|
|
2016-05-20 17:03:36 -07:00
|
|
|
for (j = 0; j < 256; j++) {
|
2016-12-14 15:08:49 -08:00
|
|
|
int k;
|
2016-05-20 17:03:36 -07:00
|
|
|
|
|
|
|
for (i = 0; i < TAG_ENTRIES; i++) {
|
|
|
|
for (k = i; index[k] < tag_index[i]; k++)
|
|
|
|
;
|
|
|
|
if (j <= (index[k] | ((1 << order[k]) - 1)))
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2018-09-22 16:12:41 -04:00
|
|
|
xas_set(&xas, j);
|
|
|
|
xas_for_each_marked(&xas, item, ULONG_MAX, XA_MARK_1) {
|
2016-12-14 15:08:49 -08:00
|
|
|
unsigned long mask;
|
2016-05-20 17:03:36 -07:00
|
|
|
for (k = i; index[k] < tag_index[i]; k++)
|
|
|
|
;
|
2016-12-14 15:08:49 -08:00
|
|
|
mask = (1UL << order[k]) - 1;
|
2016-05-20 17:03:36 -07:00
|
|
|
|
2018-09-22 16:12:41 -04:00
|
|
|
assert((xas.xa_index | mask) == (tag_index[i] | mask));
|
|
|
|
assert(!xa_is_internal(item));
|
2016-12-14 15:08:49 -08:00
|
|
|
assert((item->index | mask) == (tag_index[i] | mask));
|
|
|
|
assert(item->order == order[k]);
|
2016-05-20 17:03:36 -07:00
|
|
|
i++;
|
|
|
|
}
|
2016-05-20 17:02:29 -07:00
|
|
|
}
|
|
|
|
|
2018-09-22 16:12:41 -04:00
|
|
|
assert(tag_tagged_items(xa, 0, ULONG_MAX, TAG_ENTRIES, XA_MARK_1,
|
2018-08-18 07:09:22 -04:00
|
|
|
XA_MARK_2) == TAG_ENTRIES);
|
2016-05-20 17:02:52 -07:00
|
|
|
|
2016-05-20 17:03:36 -07:00
|
|
|
for (j = 0; j < 256; j++) {
|
|
|
|
int mask, k;
|
|
|
|
|
|
|
|
for (i = 0; i < TAG_ENTRIES; i++) {
|
|
|
|
for (k = i; index[k] < tag_index[i]; k++)
|
|
|
|
;
|
|
|
|
if (j <= (index[k] | ((1 << order[k]) - 1)))
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2018-09-22 16:12:41 -04:00
|
|
|
xas_set(&xas, j);
|
|
|
|
xas_for_each_marked(&xas, item, ULONG_MAX, XA_MARK_2) {
|
2016-05-20 17:03:36 -07:00
|
|
|
for (k = i; index[k] < tag_index[i]; k++)
|
|
|
|
;
|
|
|
|
mask = (1 << order[k]) - 1;
|
|
|
|
|
2018-09-22 16:12:41 -04:00
|
|
|
assert((xas.xa_index | mask) == (tag_index[i] | mask));
|
|
|
|
assert(!xa_is_internal(item));
|
2016-12-14 15:08:49 -08:00
|
|
|
assert((item->index | mask) == (tag_index[i] | mask));
|
|
|
|
assert(item->order == order[k]);
|
2016-05-20 17:03:36 -07:00
|
|
|
i++;
|
|
|
|
}
|
2016-05-20 17:02:52 -07:00
|
|
|
}
|
|
|
|
|
2018-09-22 16:12:41 -04:00
|
|
|
assert(tag_tagged_items(xa, 1, ULONG_MAX, MT_NUM_ENTRIES * 2, XA_MARK_1,
|
2018-08-18 07:09:22 -04:00
|
|
|
XA_MARK_0) == TAG_ENTRIES);
|
2016-05-20 17:02:52 -07:00
|
|
|
i = 0;
|
2018-09-22 16:12:41 -04:00
|
|
|
xas_set(&xas, 0);
|
|
|
|
xas_for_each_marked(&xas, item, ULONG_MAX, XA_MARK_0) {
|
|
|
|
assert(xas.xa_index == tag_index[i]);
|
2016-05-20 17:02:52 -07:00
|
|
|
i++;
|
|
|
|
}
|
2018-09-22 16:12:41 -04:00
|
|
|
assert(i == TAG_ENTRIES);
|
2016-05-20 17:02:52 -07:00
|
|
|
|
2018-09-22 16:12:41 -04:00
|
|
|
item_kill_tree(xa);
|
2016-05-20 17:02:29 -07:00
|
|
|
}
|
|
|
|
|
XArray: Do not return sibling entries from xa_load()
It is possible for xa_load() to observe a sibling entry pointing to
another sibling entry. An example:
Thread A: Thread B:
xa_store_range(xa, entry, 188, 191, gfp);
xa_load(xa, 191);
entry = xa_entry(xa, node, 63);
[entry is a sibling of 188]
xa_store_range(xa, entry, 184, 191, gfp);
if (xa_is_sibling(entry))
offset = xa_to_sibling(entry);
entry = xa_entry(xas->xa, node, offset);
[entry is now a sibling of 184]
It is sufficient to go around this loop until we hit a non-sibling entry.
Sibling entries always point earlier in the node, so we are guaranteed
to terminate this search.
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Fixes: 6b24ca4a1a8d ("mm: Use multi-index entries in the page cache")
Cc: stable@vger.kernel.org
2023-07-26 22:58:17 -04:00
|
|
|
bool stop_iteration;
|
radix tree test suite: multi-order iteration race
Add a test which shows a race in the multi-order iteration code. This
test reliably hits the race in under a second on my machine, and is the
result of a real bug report against kernel a production v4.15 based
kernel (4.15.6-300.fc27.x86_64). With a real kernel this issue is hit
when using order 9 PMD DAX radix tree entries.
The race has to do with how we tear down multi-order sibling entries
when we are removing an item from the tree. Remember that an order 2
entry looks like this:
struct radix_tree_node.slots[] = [entry][sibling][sibling][sibling]
where 'entry' is in some slot in the struct radix_tree_node, and the
three slots following 'entry' contain sibling pointers which point back
to 'entry.'
When we delete 'entry' from the tree, we call :
radix_tree_delete()
radix_tree_delete_item()
__radix_tree_delete()
replace_slot()
replace_slot() first removes the siblings in order from the first to the
last, then at then replaces 'entry' with NULL. This means that for a
brief period of time we end up with one or more of the siblings removed,
so:
struct radix_tree_node.slots[] = [entry][NULL][sibling][sibling]
This causes an issue if you have a reader iterating over the slots in
the tree via radix_tree_for_each_slot() while only under
rcu_read_lock()/rcu_read_unlock() protection. This is a common case in
mm/filemap.c.
The issue is that when __radix_tree_next_slot() => skip_siblings() tries
to skip over the sibling entries in the slots, it currently does so with
an exact match on the slot directly preceding our current slot.
Normally this works:
V preceding slot
struct radix_tree_node.slots[] = [entry][sibling][sibling][sibling]
^ current slot
This lets you find the first sibling, and you skip them all in order.
But in the case where one of the siblings is NULL, that slot is skipped
and then our sibling detection is interrupted:
V preceding slot
struct radix_tree_node.slots[] = [entry][NULL][sibling][sibling]
^ current slot
This means that the sibling pointers aren't recognized since they point
all the way back to 'entry', so we think that they are normal internal
radix tree pointers. This causes us to think we need to walk down to a
struct radix_tree_node starting at the address of 'entry'.
In a real running kernel this will crash the thread with a GP fault when
you try and dereference the slots in your broken node starting at
'entry'.
In the radix tree test suite this will be caught by the address
sanitizer:
==27063==ERROR: AddressSanitizer: heap-buffer-overflow on address
0x60c0008ae400 at pc 0x00000040ce4f bp 0x7fa89b8fcad0 sp 0x7fa89b8fcac0
READ of size 8 at 0x60c0008ae400 thread T3
#0 0x40ce4e in __radix_tree_next_slot /home/rzwisler/project/linux/tools/testing/radix-tree/radix-tree.c:1660
#1 0x4022cc in radix_tree_next_slot linux/../../../../include/linux/radix-tree.h:567
#2 0x4022cc in iterator_func /home/rzwisler/project/linux/tools/testing/radix-tree/multiorder.c:655
#3 0x7fa8a088d50a in start_thread (/lib64/libpthread.so.0+0x750a)
#4 0x7fa8a03bd16e in clone (/lib64/libc.so.6+0xf516e)
Link: http://lkml.kernel.org/r/20180503192430.7582-5-ross.zwisler@linux.intel.com
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: CR, Sapthagirish <sapthagirish.cr@intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-05-18 16:09:01 -07:00
|
|
|
|
|
|
|
static void *creator_func(void *ptr)
|
|
|
|
{
|
|
|
|
/* 'order' is set up to ensure we have sibling entries */
|
|
|
|
unsigned int order = RADIX_TREE_MAP_SHIFT - 1;
|
|
|
|
struct radix_tree_root *tree = ptr;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < 10000; i++) {
|
|
|
|
item_insert_order(tree, 0, order);
|
|
|
|
item_delete_rcu(tree, 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
stop_iteration = true;
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void *iterator_func(void *ptr)
|
|
|
|
{
|
2018-09-22 16:12:41 -04:00
|
|
|
XA_STATE(xas, ptr, 0);
|
radix tree test suite: multi-order iteration race
Add a test which shows a race in the multi-order iteration code. This
test reliably hits the race in under a second on my machine, and is the
result of a real bug report against kernel a production v4.15 based
kernel (4.15.6-300.fc27.x86_64). With a real kernel this issue is hit
when using order 9 PMD DAX radix tree entries.
The race has to do with how we tear down multi-order sibling entries
when we are removing an item from the tree. Remember that an order 2
entry looks like this:
struct radix_tree_node.slots[] = [entry][sibling][sibling][sibling]
where 'entry' is in some slot in the struct radix_tree_node, and the
three slots following 'entry' contain sibling pointers which point back
to 'entry.'
When we delete 'entry' from the tree, we call :
radix_tree_delete()
radix_tree_delete_item()
__radix_tree_delete()
replace_slot()
replace_slot() first removes the siblings in order from the first to the
last, then at then replaces 'entry' with NULL. This means that for a
brief period of time we end up with one or more of the siblings removed,
so:
struct radix_tree_node.slots[] = [entry][NULL][sibling][sibling]
This causes an issue if you have a reader iterating over the slots in
the tree via radix_tree_for_each_slot() while only under
rcu_read_lock()/rcu_read_unlock() protection. This is a common case in
mm/filemap.c.
The issue is that when __radix_tree_next_slot() => skip_siblings() tries
to skip over the sibling entries in the slots, it currently does so with
an exact match on the slot directly preceding our current slot.
Normally this works:
V preceding slot
struct radix_tree_node.slots[] = [entry][sibling][sibling][sibling]
^ current slot
This lets you find the first sibling, and you skip them all in order.
But in the case where one of the siblings is NULL, that slot is skipped
and then our sibling detection is interrupted:
V preceding slot
struct radix_tree_node.slots[] = [entry][NULL][sibling][sibling]
^ current slot
This means that the sibling pointers aren't recognized since they point
all the way back to 'entry', so we think that they are normal internal
radix tree pointers. This causes us to think we need to walk down to a
struct radix_tree_node starting at the address of 'entry'.
In a real running kernel this will crash the thread with a GP fault when
you try and dereference the slots in your broken node starting at
'entry'.
In the radix tree test suite this will be caught by the address
sanitizer:
==27063==ERROR: AddressSanitizer: heap-buffer-overflow on address
0x60c0008ae400 at pc 0x00000040ce4f bp 0x7fa89b8fcad0 sp 0x7fa89b8fcac0
READ of size 8 at 0x60c0008ae400 thread T3
#0 0x40ce4e in __radix_tree_next_slot /home/rzwisler/project/linux/tools/testing/radix-tree/radix-tree.c:1660
#1 0x4022cc in radix_tree_next_slot linux/../../../../include/linux/radix-tree.h:567
#2 0x4022cc in iterator_func /home/rzwisler/project/linux/tools/testing/radix-tree/multiorder.c:655
#3 0x7fa8a088d50a in start_thread (/lib64/libpthread.so.0+0x750a)
#4 0x7fa8a03bd16e in clone (/lib64/libc.so.6+0xf516e)
Link: http://lkml.kernel.org/r/20180503192430.7582-5-ross.zwisler@linux.intel.com
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: CR, Sapthagirish <sapthagirish.cr@intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-05-18 16:09:01 -07:00
|
|
|
struct item *item;
|
|
|
|
|
|
|
|
while (!stop_iteration) {
|
|
|
|
rcu_read_lock();
|
2018-09-22 16:12:41 -04:00
|
|
|
xas_for_each(&xas, item, ULONG_MAX) {
|
|
|
|
if (xas_retry(&xas, item))
|
radix tree test suite: multi-order iteration race
Add a test which shows a race in the multi-order iteration code. This
test reliably hits the race in under a second on my machine, and is the
result of a real bug report against kernel a production v4.15 based
kernel (4.15.6-300.fc27.x86_64). With a real kernel this issue is hit
when using order 9 PMD DAX radix tree entries.
The race has to do with how we tear down multi-order sibling entries
when we are removing an item from the tree. Remember that an order 2
entry looks like this:
struct radix_tree_node.slots[] = [entry][sibling][sibling][sibling]
where 'entry' is in some slot in the struct radix_tree_node, and the
three slots following 'entry' contain sibling pointers which point back
to 'entry.'
When we delete 'entry' from the tree, we call :
radix_tree_delete()
radix_tree_delete_item()
__radix_tree_delete()
replace_slot()
replace_slot() first removes the siblings in order from the first to the
last, then at then replaces 'entry' with NULL. This means that for a
brief period of time we end up with one or more of the siblings removed,
so:
struct radix_tree_node.slots[] = [entry][NULL][sibling][sibling]
This causes an issue if you have a reader iterating over the slots in
the tree via radix_tree_for_each_slot() while only under
rcu_read_lock()/rcu_read_unlock() protection. This is a common case in
mm/filemap.c.
The issue is that when __radix_tree_next_slot() => skip_siblings() tries
to skip over the sibling entries in the slots, it currently does so with
an exact match on the slot directly preceding our current slot.
Normally this works:
V preceding slot
struct radix_tree_node.slots[] = [entry][sibling][sibling][sibling]
^ current slot
This lets you find the first sibling, and you skip them all in order.
But in the case where one of the siblings is NULL, that slot is skipped
and then our sibling detection is interrupted:
V preceding slot
struct radix_tree_node.slots[] = [entry][NULL][sibling][sibling]
^ current slot
This means that the sibling pointers aren't recognized since they point
all the way back to 'entry', so we think that they are normal internal
radix tree pointers. This causes us to think we need to walk down to a
struct radix_tree_node starting at the address of 'entry'.
In a real running kernel this will crash the thread with a GP fault when
you try and dereference the slots in your broken node starting at
'entry'.
In the radix tree test suite this will be caught by the address
sanitizer:
==27063==ERROR: AddressSanitizer: heap-buffer-overflow on address
0x60c0008ae400 at pc 0x00000040ce4f bp 0x7fa89b8fcad0 sp 0x7fa89b8fcac0
READ of size 8 at 0x60c0008ae400 thread T3
#0 0x40ce4e in __radix_tree_next_slot /home/rzwisler/project/linux/tools/testing/radix-tree/radix-tree.c:1660
#1 0x4022cc in radix_tree_next_slot linux/../../../../include/linux/radix-tree.h:567
#2 0x4022cc in iterator_func /home/rzwisler/project/linux/tools/testing/radix-tree/multiorder.c:655
#3 0x7fa8a088d50a in start_thread (/lib64/libpthread.so.0+0x750a)
#4 0x7fa8a03bd16e in clone (/lib64/libc.so.6+0xf516e)
Link: http://lkml.kernel.org/r/20180503192430.7582-5-ross.zwisler@linux.intel.com
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: CR, Sapthagirish <sapthagirish.cr@intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-05-18 16:09:01 -07:00
|
|
|
continue;
|
|
|
|
|
2018-09-22 16:12:41 -04:00
|
|
|
item_sanity(item, xas.xa_index);
|
radix tree test suite: multi-order iteration race
Add a test which shows a race in the multi-order iteration code. This
test reliably hits the race in under a second on my machine, and is the
result of a real bug report against kernel a production v4.15 based
kernel (4.15.6-300.fc27.x86_64). With a real kernel this issue is hit
when using order 9 PMD DAX radix tree entries.
The race has to do with how we tear down multi-order sibling entries
when we are removing an item from the tree. Remember that an order 2
entry looks like this:
struct radix_tree_node.slots[] = [entry][sibling][sibling][sibling]
where 'entry' is in some slot in the struct radix_tree_node, and the
three slots following 'entry' contain sibling pointers which point back
to 'entry.'
When we delete 'entry' from the tree, we call :
radix_tree_delete()
radix_tree_delete_item()
__radix_tree_delete()
replace_slot()
replace_slot() first removes the siblings in order from the first to the
last, then at then replaces 'entry' with NULL. This means that for a
brief period of time we end up with one or more of the siblings removed,
so:
struct radix_tree_node.slots[] = [entry][NULL][sibling][sibling]
This causes an issue if you have a reader iterating over the slots in
the tree via radix_tree_for_each_slot() while only under
rcu_read_lock()/rcu_read_unlock() protection. This is a common case in
mm/filemap.c.
The issue is that when __radix_tree_next_slot() => skip_siblings() tries
to skip over the sibling entries in the slots, it currently does so with
an exact match on the slot directly preceding our current slot.
Normally this works:
V preceding slot
struct radix_tree_node.slots[] = [entry][sibling][sibling][sibling]
^ current slot
This lets you find the first sibling, and you skip them all in order.
But in the case where one of the siblings is NULL, that slot is skipped
and then our sibling detection is interrupted:
V preceding slot
struct radix_tree_node.slots[] = [entry][NULL][sibling][sibling]
^ current slot
This means that the sibling pointers aren't recognized since they point
all the way back to 'entry', so we think that they are normal internal
radix tree pointers. This causes us to think we need to walk down to a
struct radix_tree_node starting at the address of 'entry'.
In a real running kernel this will crash the thread with a GP fault when
you try and dereference the slots in your broken node starting at
'entry'.
In the radix tree test suite this will be caught by the address
sanitizer:
==27063==ERROR: AddressSanitizer: heap-buffer-overflow on address
0x60c0008ae400 at pc 0x00000040ce4f bp 0x7fa89b8fcad0 sp 0x7fa89b8fcac0
READ of size 8 at 0x60c0008ae400 thread T3
#0 0x40ce4e in __radix_tree_next_slot /home/rzwisler/project/linux/tools/testing/radix-tree/radix-tree.c:1660
#1 0x4022cc in radix_tree_next_slot linux/../../../../include/linux/radix-tree.h:567
#2 0x4022cc in iterator_func /home/rzwisler/project/linux/tools/testing/radix-tree/multiorder.c:655
#3 0x7fa8a088d50a in start_thread (/lib64/libpthread.so.0+0x750a)
#4 0x7fa8a03bd16e in clone (/lib64/libc.so.6+0xf516e)
Link: http://lkml.kernel.org/r/20180503192430.7582-5-ross.zwisler@linux.intel.com
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: CR, Sapthagirish <sapthagirish.cr@intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-05-18 16:09:01 -07:00
|
|
|
}
|
|
|
|
rcu_read_unlock();
|
|
|
|
}
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2018-09-22 16:12:41 -04:00
|
|
|
static void multiorder_iteration_race(struct xarray *xa)
|
radix tree test suite: multi-order iteration race
Add a test which shows a race in the multi-order iteration code. This
test reliably hits the race in under a second on my machine, and is the
result of a real bug report against kernel a production v4.15 based
kernel (4.15.6-300.fc27.x86_64). With a real kernel this issue is hit
when using order 9 PMD DAX radix tree entries.
The race has to do with how we tear down multi-order sibling entries
when we are removing an item from the tree. Remember that an order 2
entry looks like this:
struct radix_tree_node.slots[] = [entry][sibling][sibling][sibling]
where 'entry' is in some slot in the struct radix_tree_node, and the
three slots following 'entry' contain sibling pointers which point back
to 'entry.'
When we delete 'entry' from the tree, we call :
radix_tree_delete()
radix_tree_delete_item()
__radix_tree_delete()
replace_slot()
replace_slot() first removes the siblings in order from the first to the
last, then at then replaces 'entry' with NULL. This means that for a
brief period of time we end up with one or more of the siblings removed,
so:
struct radix_tree_node.slots[] = [entry][NULL][sibling][sibling]
This causes an issue if you have a reader iterating over the slots in
the tree via radix_tree_for_each_slot() while only under
rcu_read_lock()/rcu_read_unlock() protection. This is a common case in
mm/filemap.c.
The issue is that when __radix_tree_next_slot() => skip_siblings() tries
to skip over the sibling entries in the slots, it currently does so with
an exact match on the slot directly preceding our current slot.
Normally this works:
V preceding slot
struct radix_tree_node.slots[] = [entry][sibling][sibling][sibling]
^ current slot
This lets you find the first sibling, and you skip them all in order.
But in the case where one of the siblings is NULL, that slot is skipped
and then our sibling detection is interrupted:
V preceding slot
struct radix_tree_node.slots[] = [entry][NULL][sibling][sibling]
^ current slot
This means that the sibling pointers aren't recognized since they point
all the way back to 'entry', so we think that they are normal internal
radix tree pointers. This causes us to think we need to walk down to a
struct radix_tree_node starting at the address of 'entry'.
In a real running kernel this will crash the thread with a GP fault when
you try and dereference the slots in your broken node starting at
'entry'.
In the radix tree test suite this will be caught by the address
sanitizer:
==27063==ERROR: AddressSanitizer: heap-buffer-overflow on address
0x60c0008ae400 at pc 0x00000040ce4f bp 0x7fa89b8fcad0 sp 0x7fa89b8fcac0
READ of size 8 at 0x60c0008ae400 thread T3
#0 0x40ce4e in __radix_tree_next_slot /home/rzwisler/project/linux/tools/testing/radix-tree/radix-tree.c:1660
#1 0x4022cc in radix_tree_next_slot linux/../../../../include/linux/radix-tree.h:567
#2 0x4022cc in iterator_func /home/rzwisler/project/linux/tools/testing/radix-tree/multiorder.c:655
#3 0x7fa8a088d50a in start_thread (/lib64/libpthread.so.0+0x750a)
#4 0x7fa8a03bd16e in clone (/lib64/libc.so.6+0xf516e)
Link: http://lkml.kernel.org/r/20180503192430.7582-5-ross.zwisler@linux.intel.com
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: CR, Sapthagirish <sapthagirish.cr@intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-05-18 16:09:01 -07:00
|
|
|
{
|
|
|
|
const int num_threads = sysconf(_SC_NPROCESSORS_ONLN);
|
|
|
|
pthread_t worker_thread[num_threads];
|
|
|
|
int i;
|
|
|
|
|
XArray: Do not return sibling entries from xa_load()
It is possible for xa_load() to observe a sibling entry pointing to
another sibling entry. An example:
Thread A: Thread B:
xa_store_range(xa, entry, 188, 191, gfp);
xa_load(xa, 191);
entry = xa_entry(xa, node, 63);
[entry is a sibling of 188]
xa_store_range(xa, entry, 184, 191, gfp);
if (xa_is_sibling(entry))
offset = xa_to_sibling(entry);
entry = xa_entry(xas->xa, node, offset);
[entry is now a sibling of 184]
It is sufficient to go around this loop until we hit a non-sibling entry.
Sibling entries always point earlier in the node, so we are guaranteed
to terminate this search.
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Fixes: 6b24ca4a1a8d ("mm: Use multi-index entries in the page cache")
Cc: stable@vger.kernel.org
2023-07-26 22:58:17 -04:00
|
|
|
stop_iteration = false;
|
2018-09-22 16:12:41 -04:00
|
|
|
pthread_create(&worker_thread[0], NULL, &creator_func, xa);
|
radix tree test suite: multi-order iteration race
Add a test which shows a race in the multi-order iteration code. This
test reliably hits the race in under a second on my machine, and is the
result of a real bug report against kernel a production v4.15 based
kernel (4.15.6-300.fc27.x86_64). With a real kernel this issue is hit
when using order 9 PMD DAX radix tree entries.
The race has to do with how we tear down multi-order sibling entries
when we are removing an item from the tree. Remember that an order 2
entry looks like this:
struct radix_tree_node.slots[] = [entry][sibling][sibling][sibling]
where 'entry' is in some slot in the struct radix_tree_node, and the
three slots following 'entry' contain sibling pointers which point back
to 'entry.'
When we delete 'entry' from the tree, we call :
radix_tree_delete()
radix_tree_delete_item()
__radix_tree_delete()
replace_slot()
replace_slot() first removes the siblings in order from the first to the
last, then at then replaces 'entry' with NULL. This means that for a
brief period of time we end up with one or more of the siblings removed,
so:
struct radix_tree_node.slots[] = [entry][NULL][sibling][sibling]
This causes an issue if you have a reader iterating over the slots in
the tree via radix_tree_for_each_slot() while only under
rcu_read_lock()/rcu_read_unlock() protection. This is a common case in
mm/filemap.c.
The issue is that when __radix_tree_next_slot() => skip_siblings() tries
to skip over the sibling entries in the slots, it currently does so with
an exact match on the slot directly preceding our current slot.
Normally this works:
V preceding slot
struct radix_tree_node.slots[] = [entry][sibling][sibling][sibling]
^ current slot
This lets you find the first sibling, and you skip them all in order.
But in the case where one of the siblings is NULL, that slot is skipped
and then our sibling detection is interrupted:
V preceding slot
struct radix_tree_node.slots[] = [entry][NULL][sibling][sibling]
^ current slot
This means that the sibling pointers aren't recognized since they point
all the way back to 'entry', so we think that they are normal internal
radix tree pointers. This causes us to think we need to walk down to a
struct radix_tree_node starting at the address of 'entry'.
In a real running kernel this will crash the thread with a GP fault when
you try and dereference the slots in your broken node starting at
'entry'.
In the radix tree test suite this will be caught by the address
sanitizer:
==27063==ERROR: AddressSanitizer: heap-buffer-overflow on address
0x60c0008ae400 at pc 0x00000040ce4f bp 0x7fa89b8fcad0 sp 0x7fa89b8fcac0
READ of size 8 at 0x60c0008ae400 thread T3
#0 0x40ce4e in __radix_tree_next_slot /home/rzwisler/project/linux/tools/testing/radix-tree/radix-tree.c:1660
#1 0x4022cc in radix_tree_next_slot linux/../../../../include/linux/radix-tree.h:567
#2 0x4022cc in iterator_func /home/rzwisler/project/linux/tools/testing/radix-tree/multiorder.c:655
#3 0x7fa8a088d50a in start_thread (/lib64/libpthread.so.0+0x750a)
#4 0x7fa8a03bd16e in clone (/lib64/libc.so.6+0xf516e)
Link: http://lkml.kernel.org/r/20180503192430.7582-5-ross.zwisler@linux.intel.com
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: CR, Sapthagirish <sapthagirish.cr@intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-05-18 16:09:01 -07:00
|
|
|
for (i = 1; i < num_threads; i++)
|
2018-09-22 16:12:41 -04:00
|
|
|
pthread_create(&worker_thread[i], NULL, &iterator_func, xa);
|
radix tree test suite: multi-order iteration race
Add a test which shows a race in the multi-order iteration code. This
test reliably hits the race in under a second on my machine, and is the
result of a real bug report against kernel a production v4.15 based
kernel (4.15.6-300.fc27.x86_64). With a real kernel this issue is hit
when using order 9 PMD DAX radix tree entries.
The race has to do with how we tear down multi-order sibling entries
when we are removing an item from the tree. Remember that an order 2
entry looks like this:
struct radix_tree_node.slots[] = [entry][sibling][sibling][sibling]
where 'entry' is in some slot in the struct radix_tree_node, and the
three slots following 'entry' contain sibling pointers which point back
to 'entry.'
When we delete 'entry' from the tree, we call :
radix_tree_delete()
radix_tree_delete_item()
__radix_tree_delete()
replace_slot()
replace_slot() first removes the siblings in order from the first to the
last, then at then replaces 'entry' with NULL. This means that for a
brief period of time we end up with one or more of the siblings removed,
so:
struct radix_tree_node.slots[] = [entry][NULL][sibling][sibling]
This causes an issue if you have a reader iterating over the slots in
the tree via radix_tree_for_each_slot() while only under
rcu_read_lock()/rcu_read_unlock() protection. This is a common case in
mm/filemap.c.
The issue is that when __radix_tree_next_slot() => skip_siblings() tries
to skip over the sibling entries in the slots, it currently does so with
an exact match on the slot directly preceding our current slot.
Normally this works:
V preceding slot
struct radix_tree_node.slots[] = [entry][sibling][sibling][sibling]
^ current slot
This lets you find the first sibling, and you skip them all in order.
But in the case where one of the siblings is NULL, that slot is skipped
and then our sibling detection is interrupted:
V preceding slot
struct radix_tree_node.slots[] = [entry][NULL][sibling][sibling]
^ current slot
This means that the sibling pointers aren't recognized since they point
all the way back to 'entry', so we think that they are normal internal
radix tree pointers. This causes us to think we need to walk down to a
struct radix_tree_node starting at the address of 'entry'.
In a real running kernel this will crash the thread with a GP fault when
you try and dereference the slots in your broken node starting at
'entry'.
In the radix tree test suite this will be caught by the address
sanitizer:
==27063==ERROR: AddressSanitizer: heap-buffer-overflow on address
0x60c0008ae400 at pc 0x00000040ce4f bp 0x7fa89b8fcad0 sp 0x7fa89b8fcac0
READ of size 8 at 0x60c0008ae400 thread T3
#0 0x40ce4e in __radix_tree_next_slot /home/rzwisler/project/linux/tools/testing/radix-tree/radix-tree.c:1660
#1 0x4022cc in radix_tree_next_slot linux/../../../../include/linux/radix-tree.h:567
#2 0x4022cc in iterator_func /home/rzwisler/project/linux/tools/testing/radix-tree/multiorder.c:655
#3 0x7fa8a088d50a in start_thread (/lib64/libpthread.so.0+0x750a)
#4 0x7fa8a03bd16e in clone (/lib64/libc.so.6+0xf516e)
Link: http://lkml.kernel.org/r/20180503192430.7582-5-ross.zwisler@linux.intel.com
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: CR, Sapthagirish <sapthagirish.cr@intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-05-18 16:09:01 -07:00
|
|
|
|
|
|
|
for (i = 0; i < num_threads; i++)
|
|
|
|
pthread_join(worker_thread[i], NULL);
|
|
|
|
|
2018-09-22 16:12:41 -04:00
|
|
|
item_kill_tree(xa);
|
radix tree test suite: multi-order iteration race
Add a test which shows a race in the multi-order iteration code. This
test reliably hits the race in under a second on my machine, and is the
result of a real bug report against kernel a production v4.15 based
kernel (4.15.6-300.fc27.x86_64). With a real kernel this issue is hit
when using order 9 PMD DAX radix tree entries.
The race has to do with how we tear down multi-order sibling entries
when we are removing an item from the tree. Remember that an order 2
entry looks like this:
struct radix_tree_node.slots[] = [entry][sibling][sibling][sibling]
where 'entry' is in some slot in the struct radix_tree_node, and the
three slots following 'entry' contain sibling pointers which point back
to 'entry.'
When we delete 'entry' from the tree, we call :
radix_tree_delete()
radix_tree_delete_item()
__radix_tree_delete()
replace_slot()
replace_slot() first removes the siblings in order from the first to the
last, then at then replaces 'entry' with NULL. This means that for a
brief period of time we end up with one or more of the siblings removed,
so:
struct radix_tree_node.slots[] = [entry][NULL][sibling][sibling]
This causes an issue if you have a reader iterating over the slots in
the tree via radix_tree_for_each_slot() while only under
rcu_read_lock()/rcu_read_unlock() protection. This is a common case in
mm/filemap.c.
The issue is that when __radix_tree_next_slot() => skip_siblings() tries
to skip over the sibling entries in the slots, it currently does so with
an exact match on the slot directly preceding our current slot.
Normally this works:
V preceding slot
struct radix_tree_node.slots[] = [entry][sibling][sibling][sibling]
^ current slot
This lets you find the first sibling, and you skip them all in order.
But in the case where one of the siblings is NULL, that slot is skipped
and then our sibling detection is interrupted:
V preceding slot
struct radix_tree_node.slots[] = [entry][NULL][sibling][sibling]
^ current slot
This means that the sibling pointers aren't recognized since they point
all the way back to 'entry', so we think that they are normal internal
radix tree pointers. This causes us to think we need to walk down to a
struct radix_tree_node starting at the address of 'entry'.
In a real running kernel this will crash the thread with a GP fault when
you try and dereference the slots in your broken node starting at
'entry'.
In the radix tree test suite this will be caught by the address
sanitizer:
==27063==ERROR: AddressSanitizer: heap-buffer-overflow on address
0x60c0008ae400 at pc 0x00000040ce4f bp 0x7fa89b8fcad0 sp 0x7fa89b8fcac0
READ of size 8 at 0x60c0008ae400 thread T3
#0 0x40ce4e in __radix_tree_next_slot /home/rzwisler/project/linux/tools/testing/radix-tree/radix-tree.c:1660
#1 0x4022cc in radix_tree_next_slot linux/../../../../include/linux/radix-tree.h:567
#2 0x4022cc in iterator_func /home/rzwisler/project/linux/tools/testing/radix-tree/multiorder.c:655
#3 0x7fa8a088d50a in start_thread (/lib64/libpthread.so.0+0x750a)
#4 0x7fa8a03bd16e in clone (/lib64/libc.so.6+0xf516e)
Link: http://lkml.kernel.org/r/20180503192430.7582-5-ross.zwisler@linux.intel.com
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: CR, Sapthagirish <sapthagirish.cr@intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-05-18 16:09:01 -07:00
|
|
|
}
|
|
|
|
|
XArray: Do not return sibling entries from xa_load()
It is possible for xa_load() to observe a sibling entry pointing to
another sibling entry. An example:
Thread A: Thread B:
xa_store_range(xa, entry, 188, 191, gfp);
xa_load(xa, 191);
entry = xa_entry(xa, node, 63);
[entry is a sibling of 188]
xa_store_range(xa, entry, 184, 191, gfp);
if (xa_is_sibling(entry))
offset = xa_to_sibling(entry);
entry = xa_entry(xas->xa, node, offset);
[entry is now a sibling of 184]
It is sufficient to go around this loop until we hit a non-sibling entry.
Sibling entries always point earlier in the node, so we are guaranteed
to terminate this search.
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Fixes: 6b24ca4a1a8d ("mm: Use multi-index entries in the page cache")
Cc: stable@vger.kernel.org
2023-07-26 22:58:17 -04:00
|
|
|
static void *load_creator(void *ptr)
|
|
|
|
{
|
|
|
|
/* 'order' is set up to ensure we have sibling entries */
|
|
|
|
unsigned int order;
|
|
|
|
struct radix_tree_root *tree = ptr;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
rcu_register_thread();
|
|
|
|
item_insert_order(tree, 3 << RADIX_TREE_MAP_SHIFT, 0);
|
|
|
|
item_insert_order(tree, 2 << RADIX_TREE_MAP_SHIFT, 0);
|
|
|
|
for (i = 0; i < 10000; i++) {
|
|
|
|
for (order = 1; order < RADIX_TREE_MAP_SHIFT; order++) {
|
|
|
|
unsigned long index = (3 << RADIX_TREE_MAP_SHIFT) -
|
|
|
|
(1 << order);
|
|
|
|
item_insert_order(tree, index, order);
|
Xarray: do not return sibling entries from xas_find_marked()
Patch series "Fixes and cleanups to xarray", v5.
This series contains some random fixes and cleanups to xarray. Patch 1-2
are fixes and patch 3-6 are cleanups. More details can be found in
respective patches.
This patch (of 5):
Similar to issue fixed in commit cbc02854331ed ("XArray: Do not return
sibling entries from xa_load()"), we may return sibling entries from
xas_find_marked as following:
Thread A: Thread B:
xa_store_range(xa, entry, 6, 7, gfp);
xa_set_mark(xa, 6, mark)
XA_STATE(xas, xa, 6);
xas_find_marked(&xas, 7, mark);
offset = xas_find_chunk(xas, advance, mark);
[offset is 6 which points to a valid entry]
xa_store_range(xa, entry, 4, 7, gfp);
entry = xa_entry(xa, node, 6);
[entry is a sibling of 4]
if (!xa_is_node(entry))
return entry;
Skip sibling entry like xas_find() does to protect caller from seeing
sibling entry from xas_find_marked() or caller may use sibling entry
as a valid entry and crash the kernel.
Besides, load_race() test is modified to catch mentioned issue and modified
load_race() only passes after this fix is merged.
Here is an example how this bug could be triggerred in tmpfs which
enables large folio in mapping:
Let's take a look at involved racer:
1. How pages could be created and dirtied in shmem file.
write
ksys_write
vfs_write
new_sync_write
shmem_file_write_iter
generic_perform_write
shmem_write_begin
shmem_get_folio
shmem_allowable_huge_orders
shmem_alloc_and_add_folios
shmem_alloc_folio
__folio_set_locked
shmem_add_to_page_cache
XA_STATE_ORDER(..., index, order)
xax_store()
shmem_write_end
folio_mark_dirty()
2. How dirty pages could be deleted in shmem file.
ioctl
do_vfs_ioctl
file_ioctl
ioctl_preallocate
vfs_fallocate
shmem_fallocate
shmem_truncate_range
shmem_undo_range
truncate_inode_folio
filemap_remove_folio
page_cache_delete
xas_store(&xas, NULL);
3. How dirty pages could be lockless searched
sync_file_range
ksys_sync_file_range
__filemap_fdatawrite_range
filemap_fdatawrite_wbc
do_writepages
writeback_use_writepage
writeback_iter
writeback_get_folio
filemap_get_folios_tag
find_get_entry
folio = xas_find_marked()
folio_try_get(folio)
Kernel will crash as following:
1.Create 2.Search 3.Delete
/* write page 2,3 */
write
...
shmem_write_begin
XA_STATE_ORDER(xas, i_pages, index = 2, order = 1)
xa_store(&xas, folio)
shmem_write_end
folio_mark_dirty()
/* sync page 2 and page 3 */
sync_file_range
...
find_get_entry
folio = xas_find_marked()
/* offset will be 2 */
offset = xas_find_chunk()
/* delete page 2 and page 3 */
ioctl
...
xas_store(&xas, NULL);
/* write page 0-3 */
write
...
shmem_write_begin
XA_STATE_ORDER(xas, i_pages, index = 0, order = 2)
xa_store(&xas, folio)
shmem_write_end
folio_mark_dirty(folio)
/* get sibling entry from offset 2 */
entry = xa_entry(.., 2)
/* use sibling entry as folio and crash kernel */
folio_try_get(folio)
Link: https://lkml.kernel.org/r/20241213122523.12764-1-shikemeng@huaweicloud.com
Link: https://lkml.kernel.org/r/20241213122523.12764-2-shikemeng@huaweicloud.com
Signed-off-by: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Mattew Wilcox <willy@infradead.org> [English fixes]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-12-13 20:25:19 +08:00
|
|
|
xa_set_mark(tree, index, XA_MARK_1);
|
XArray: Do not return sibling entries from xa_load()
It is possible for xa_load() to observe a sibling entry pointing to
another sibling entry. An example:
Thread A: Thread B:
xa_store_range(xa, entry, 188, 191, gfp);
xa_load(xa, 191);
entry = xa_entry(xa, node, 63);
[entry is a sibling of 188]
xa_store_range(xa, entry, 184, 191, gfp);
if (xa_is_sibling(entry))
offset = xa_to_sibling(entry);
entry = xa_entry(xas->xa, node, offset);
[entry is now a sibling of 184]
It is sufficient to go around this loop until we hit a non-sibling entry.
Sibling entries always point earlier in the node, so we are guaranteed
to terminate this search.
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Fixes: 6b24ca4a1a8d ("mm: Use multi-index entries in the page cache")
Cc: stable@vger.kernel.org
2023-07-26 22:58:17 -04:00
|
|
|
item_delete_rcu(tree, index);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
rcu_unregister_thread();
|
|
|
|
|
|
|
|
stop_iteration = true;
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void *load_worker(void *ptr)
|
|
|
|
{
|
|
|
|
unsigned long index = (3 << RADIX_TREE_MAP_SHIFT) - 1;
|
|
|
|
|
|
|
|
rcu_register_thread();
|
|
|
|
while (!stop_iteration) {
|
Xarray: do not return sibling entries from xas_find_marked()
Patch series "Fixes and cleanups to xarray", v5.
This series contains some random fixes and cleanups to xarray. Patch 1-2
are fixes and patch 3-6 are cleanups. More details can be found in
respective patches.
This patch (of 5):
Similar to issue fixed in commit cbc02854331ed ("XArray: Do not return
sibling entries from xa_load()"), we may return sibling entries from
xas_find_marked as following:
Thread A: Thread B:
xa_store_range(xa, entry, 6, 7, gfp);
xa_set_mark(xa, 6, mark)
XA_STATE(xas, xa, 6);
xas_find_marked(&xas, 7, mark);
offset = xas_find_chunk(xas, advance, mark);
[offset is 6 which points to a valid entry]
xa_store_range(xa, entry, 4, 7, gfp);
entry = xa_entry(xa, node, 6);
[entry is a sibling of 4]
if (!xa_is_node(entry))
return entry;
Skip sibling entry like xas_find() does to protect caller from seeing
sibling entry from xas_find_marked() or caller may use sibling entry
as a valid entry and crash the kernel.
Besides, load_race() test is modified to catch mentioned issue and modified
load_race() only passes after this fix is merged.
Here is an example how this bug could be triggerred in tmpfs which
enables large folio in mapping:
Let's take a look at involved racer:
1. How pages could be created and dirtied in shmem file.
write
ksys_write
vfs_write
new_sync_write
shmem_file_write_iter
generic_perform_write
shmem_write_begin
shmem_get_folio
shmem_allowable_huge_orders
shmem_alloc_and_add_folios
shmem_alloc_folio
__folio_set_locked
shmem_add_to_page_cache
XA_STATE_ORDER(..., index, order)
xax_store()
shmem_write_end
folio_mark_dirty()
2. How dirty pages could be deleted in shmem file.
ioctl
do_vfs_ioctl
file_ioctl
ioctl_preallocate
vfs_fallocate
shmem_fallocate
shmem_truncate_range
shmem_undo_range
truncate_inode_folio
filemap_remove_folio
page_cache_delete
xas_store(&xas, NULL);
3. How dirty pages could be lockless searched
sync_file_range
ksys_sync_file_range
__filemap_fdatawrite_range
filemap_fdatawrite_wbc
do_writepages
writeback_use_writepage
writeback_iter
writeback_get_folio
filemap_get_folios_tag
find_get_entry
folio = xas_find_marked()
folio_try_get(folio)
Kernel will crash as following:
1.Create 2.Search 3.Delete
/* write page 2,3 */
write
...
shmem_write_begin
XA_STATE_ORDER(xas, i_pages, index = 2, order = 1)
xa_store(&xas, folio)
shmem_write_end
folio_mark_dirty()
/* sync page 2 and page 3 */
sync_file_range
...
find_get_entry
folio = xas_find_marked()
/* offset will be 2 */
offset = xas_find_chunk()
/* delete page 2 and page 3 */
ioctl
...
xas_store(&xas, NULL);
/* write page 0-3 */
write
...
shmem_write_begin
XA_STATE_ORDER(xas, i_pages, index = 0, order = 2)
xa_store(&xas, folio)
shmem_write_end
folio_mark_dirty(folio)
/* get sibling entry from offset 2 */
entry = xa_entry(.., 2)
/* use sibling entry as folio and crash kernel */
folio_try_get(folio)
Link: https://lkml.kernel.org/r/20241213122523.12764-1-shikemeng@huaweicloud.com
Link: https://lkml.kernel.org/r/20241213122523.12764-2-shikemeng@huaweicloud.com
Signed-off-by: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Mattew Wilcox <willy@infradead.org> [English fixes]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-12-13 20:25:19 +08:00
|
|
|
unsigned long find_index = (2 << RADIX_TREE_MAP_SHIFT) + 1;
|
XArray: Do not return sibling entries from xa_load()
It is possible for xa_load() to observe a sibling entry pointing to
another sibling entry. An example:
Thread A: Thread B:
xa_store_range(xa, entry, 188, 191, gfp);
xa_load(xa, 191);
entry = xa_entry(xa, node, 63);
[entry is a sibling of 188]
xa_store_range(xa, entry, 184, 191, gfp);
if (xa_is_sibling(entry))
offset = xa_to_sibling(entry);
entry = xa_entry(xas->xa, node, offset);
[entry is now a sibling of 184]
It is sufficient to go around this loop until we hit a non-sibling entry.
Sibling entries always point earlier in the node, so we are guaranteed
to terminate this search.
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Fixes: 6b24ca4a1a8d ("mm: Use multi-index entries in the page cache")
Cc: stable@vger.kernel.org
2023-07-26 22:58:17 -04:00
|
|
|
struct item *item = xa_load(ptr, index);
|
|
|
|
assert(!xa_is_internal(item));
|
Xarray: do not return sibling entries from xas_find_marked()
Patch series "Fixes and cleanups to xarray", v5.
This series contains some random fixes and cleanups to xarray. Patch 1-2
are fixes and patch 3-6 are cleanups. More details can be found in
respective patches.
This patch (of 5):
Similar to issue fixed in commit cbc02854331ed ("XArray: Do not return
sibling entries from xa_load()"), we may return sibling entries from
xas_find_marked as following:
Thread A: Thread B:
xa_store_range(xa, entry, 6, 7, gfp);
xa_set_mark(xa, 6, mark)
XA_STATE(xas, xa, 6);
xas_find_marked(&xas, 7, mark);
offset = xas_find_chunk(xas, advance, mark);
[offset is 6 which points to a valid entry]
xa_store_range(xa, entry, 4, 7, gfp);
entry = xa_entry(xa, node, 6);
[entry is a sibling of 4]
if (!xa_is_node(entry))
return entry;
Skip sibling entry like xas_find() does to protect caller from seeing
sibling entry from xas_find_marked() or caller may use sibling entry
as a valid entry and crash the kernel.
Besides, load_race() test is modified to catch mentioned issue and modified
load_race() only passes after this fix is merged.
Here is an example how this bug could be triggerred in tmpfs which
enables large folio in mapping:
Let's take a look at involved racer:
1. How pages could be created and dirtied in shmem file.
write
ksys_write
vfs_write
new_sync_write
shmem_file_write_iter
generic_perform_write
shmem_write_begin
shmem_get_folio
shmem_allowable_huge_orders
shmem_alloc_and_add_folios
shmem_alloc_folio
__folio_set_locked
shmem_add_to_page_cache
XA_STATE_ORDER(..., index, order)
xax_store()
shmem_write_end
folio_mark_dirty()
2. How dirty pages could be deleted in shmem file.
ioctl
do_vfs_ioctl
file_ioctl
ioctl_preallocate
vfs_fallocate
shmem_fallocate
shmem_truncate_range
shmem_undo_range
truncate_inode_folio
filemap_remove_folio
page_cache_delete
xas_store(&xas, NULL);
3. How dirty pages could be lockless searched
sync_file_range
ksys_sync_file_range
__filemap_fdatawrite_range
filemap_fdatawrite_wbc
do_writepages
writeback_use_writepage
writeback_iter
writeback_get_folio
filemap_get_folios_tag
find_get_entry
folio = xas_find_marked()
folio_try_get(folio)
Kernel will crash as following:
1.Create 2.Search 3.Delete
/* write page 2,3 */
write
...
shmem_write_begin
XA_STATE_ORDER(xas, i_pages, index = 2, order = 1)
xa_store(&xas, folio)
shmem_write_end
folio_mark_dirty()
/* sync page 2 and page 3 */
sync_file_range
...
find_get_entry
folio = xas_find_marked()
/* offset will be 2 */
offset = xas_find_chunk()
/* delete page 2 and page 3 */
ioctl
...
xas_store(&xas, NULL);
/* write page 0-3 */
write
...
shmem_write_begin
XA_STATE_ORDER(xas, i_pages, index = 0, order = 2)
xa_store(&xas, folio)
shmem_write_end
folio_mark_dirty(folio)
/* get sibling entry from offset 2 */
entry = xa_entry(.., 2)
/* use sibling entry as folio and crash kernel */
folio_try_get(folio)
Link: https://lkml.kernel.org/r/20241213122523.12764-1-shikemeng@huaweicloud.com
Link: https://lkml.kernel.org/r/20241213122523.12764-2-shikemeng@huaweicloud.com
Signed-off-by: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Mattew Wilcox <willy@infradead.org> [English fixes]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-12-13 20:25:19 +08:00
|
|
|
item = xa_find(ptr, &find_index, index, XA_MARK_1);
|
|
|
|
assert(!xa_is_internal(item));
|
XArray: Do not return sibling entries from xa_load()
It is possible for xa_load() to observe a sibling entry pointing to
another sibling entry. An example:
Thread A: Thread B:
xa_store_range(xa, entry, 188, 191, gfp);
xa_load(xa, 191);
entry = xa_entry(xa, node, 63);
[entry is a sibling of 188]
xa_store_range(xa, entry, 184, 191, gfp);
if (xa_is_sibling(entry))
offset = xa_to_sibling(entry);
entry = xa_entry(xas->xa, node, offset);
[entry is now a sibling of 184]
It is sufficient to go around this loop until we hit a non-sibling entry.
Sibling entries always point earlier in the node, so we are guaranteed
to terminate this search.
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Fixes: 6b24ca4a1a8d ("mm: Use multi-index entries in the page cache")
Cc: stable@vger.kernel.org
2023-07-26 22:58:17 -04:00
|
|
|
}
|
|
|
|
rcu_unregister_thread();
|
|
|
|
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void load_race(struct xarray *xa)
|
|
|
|
{
|
|
|
|
const int num_threads = sysconf(_SC_NPROCESSORS_ONLN) * 4;
|
|
|
|
pthread_t worker_thread[num_threads];
|
|
|
|
int i;
|
|
|
|
|
|
|
|
stop_iteration = false;
|
|
|
|
pthread_create(&worker_thread[0], NULL, &load_creator, xa);
|
|
|
|
for (i = 1; i < num_threads; i++)
|
|
|
|
pthread_create(&worker_thread[i], NULL, &load_worker, xa);
|
|
|
|
|
|
|
|
for (i = 0; i < num_threads; i++)
|
|
|
|
pthread_join(worker_thread[i], NULL);
|
|
|
|
|
|
|
|
item_kill_tree(xa);
|
|
|
|
}
|
|
|
|
|
2018-09-22 16:12:41 -04:00
|
|
|
static DEFINE_XARRAY(array);
|
|
|
|
|
2016-05-20 17:02:14 -07:00
|
|
|
void multiorder_checks(void)
|
|
|
|
{
|
2018-09-22 16:12:41 -04:00
|
|
|
multiorder_iteration(&array);
|
|
|
|
multiorder_tagged_iteration(&array);
|
|
|
|
multiorder_iteration_race(&array);
|
XArray: Do not return sibling entries from xa_load()
It is possible for xa_load() to observe a sibling entry pointing to
another sibling entry. An example:
Thread A: Thread B:
xa_store_range(xa, entry, 188, 191, gfp);
xa_load(xa, 191);
entry = xa_entry(xa, node, 63);
[entry is a sibling of 188]
xa_store_range(xa, entry, 184, 191, gfp);
if (xa_is_sibling(entry))
offset = xa_to_sibling(entry);
entry = xa_entry(xas->xa, node, offset);
[entry is now a sibling of 184]
It is sufficient to go around this loop until we hit a non-sibling entry.
Sibling entries always point earlier in the node, so we are guaranteed
to terminate this search.
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Fixes: 6b24ca4a1a8d ("mm: Use multi-index entries in the page cache")
Cc: stable@vger.kernel.org
2023-07-26 22:58:17 -04:00
|
|
|
load_race(&array);
|
2016-12-14 15:09:04 -08:00
|
|
|
|
|
|
|
radix_tree_cpu_dead(0);
|
2016-05-20 17:02:14 -07:00
|
|
|
}
|
2016-12-18 22:56:05 -05:00
|
|
|
|
XArray: Do not return sibling entries from xa_load()
It is possible for xa_load() to observe a sibling entry pointing to
another sibling entry. An example:
Thread A: Thread B:
xa_store_range(xa, entry, 188, 191, gfp);
xa_load(xa, 191);
entry = xa_entry(xa, node, 63);
[entry is a sibling of 188]
xa_store_range(xa, entry, 184, 191, gfp);
if (xa_is_sibling(entry))
offset = xa_to_sibling(entry);
entry = xa_entry(xas->xa, node, offset);
[entry is now a sibling of 184]
It is sufficient to go around this loop until we hit a non-sibling entry.
Sibling entries always point earlier in the node, so we are guaranteed
to terminate this search.
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Fixes: 6b24ca4a1a8d ("mm: Use multi-index entries in the page cache")
Cc: stable@vger.kernel.org
2023-07-26 22:58:17 -04:00
|
|
|
int __weak main(int argc, char **argv)
|
2016-12-18 22:56:05 -05:00
|
|
|
{
|
XArray: Do not return sibling entries from xa_load()
It is possible for xa_load() to observe a sibling entry pointing to
another sibling entry. An example:
Thread A: Thread B:
xa_store_range(xa, entry, 188, 191, gfp);
xa_load(xa, 191);
entry = xa_entry(xa, node, 63);
[entry is a sibling of 188]
xa_store_range(xa, entry, 184, 191, gfp);
if (xa_is_sibling(entry))
offset = xa_to_sibling(entry);
entry = xa_entry(xas->xa, node, offset);
[entry is now a sibling of 184]
It is sufficient to go around this loop until we hit a non-sibling entry.
Sibling entries always point earlier in the node, so we are guaranteed
to terminate this search.
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Fixes: 6b24ca4a1a8d ("mm: Use multi-index entries in the page cache")
Cc: stable@vger.kernel.org
2023-07-26 22:58:17 -04:00
|
|
|
int opt;
|
|
|
|
|
|
|
|
while ((opt = getopt(argc, argv, "ls:v")) != -1) {
|
|
|
|
if (opt == 'v')
|
|
|
|
test_verbose++;
|
|
|
|
}
|
|
|
|
|
2021-03-31 14:59:19 -04:00
|
|
|
rcu_register_thread();
|
2016-12-18 22:56:05 -05:00
|
|
|
radix_tree_init();
|
|
|
|
multiorder_checks();
|
2021-03-31 14:59:19 -04:00
|
|
|
rcu_unregister_thread();
|
2016-12-18 22:56:05 -05:00
|
|
|
return 0;
|
|
|
|
}
|