// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2023 Meta Platforms, Inc. and affiliates. */

#include <vmlinux.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_core_read.h>
#include "bpf_misc.h"
#include "bpf_experimental.h"

extern void bpf_rcu_read_lock(void) __ksym;
extern void bpf_rcu_read_unlock(void) __ksym;
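
/* node_data carries a bpf_rb_node, a bpf_list_node and a bpf_refcount, so a
 * single refcounted allocation can sit in an rbtree, in a list and in a map
 * value at the same time.
 */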
struct node_data {
	long key;
	long list_data;
	struct bpf_rb_node r;
	struct bpf_list_node l;
	struct bpf_refcount ref;
};

struct map_value {
	struct node_data __kptr *node;
};

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__type(key, int);
	__type(value, struct map_value);
	__uint(max_entries, 2);
} stashed_nodes SEC(".maps");

struct node_acquire {
	long key;
	long data;
	struct bpf_rb_node node;
	struct bpf_refcount refcount;
};

#define private(name) SEC(".bss." #name) __hidden __attribute__((aligned(8)))
private(A) struct bpf_spin_lock lock;
private(A) struct bpf_rb_root root __contains(node_data, r);
private(A) struct bpf_list_head head __contains(node_data, l);

private(B) struct bpf_spin_lock alock;
private(B) struct bpf_rb_root aroot __contains(node_acquire, node);
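
/* A second node_data rbtree protected by its own lock; used by the
 * wrong-owner tests below to add a node to one tree and then attempt to
 * remove it while holding a different tree's lock.
 */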
private(C) struct bpf_spin_lock block;
private(C) struct bpf_rb_root broot __contains(node_data, r);

static bool less(struct bpf_rb_node *node_a, const struct bpf_rb_node *node_b)
{
	struct node_data *a;
	struct node_data *b;

	a = container_of(node_a, struct node_data, r);
	b = container_of(node_b, struct node_data, r);

	return a->key < b->key;
}

static bool less_a(struct bpf_rb_node *a, const struct bpf_rb_node *b)
{
	struct node_acquire *node_a;
	struct node_acquire *node_b;

	node_a = container_of(a, struct node_acquire, node);
	node_b = container_of(b, struct node_acquire, node);

	return node_a->key < node_b->key;
}
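
/* Allocate one refcounted node_data, insert it into the rbtree, then push it
 * onto the list. On success the same node is shared by both collections, so
 * its refcount is 2.
 */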
static long __insert_in_tree_and_list(struct bpf_list_head *head,
				      struct bpf_rb_root *root,
				      struct bpf_spin_lock *lock)
{
	struct node_data *n, *m;

	n = bpf_obj_new(typeof(*n));
	if (!n)
		return -1;

	m = bpf_refcount_acquire(n);
	m->key = 123;
	m->list_data = 456;

	bpf_spin_lock(lock);
	if (bpf_rbtree_add(root, &n->r, less)) {
		/* Failure to insert - unexpected */
		bpf_spin_unlock(lock);
		bpf_obj_drop(m);
		return -2;
	}
	bpf_spin_unlock(lock);

	bpf_spin_lock(lock);
	if (bpf_list_push_front(head, &m->l)) {
		/* Failure to insert - unexpected */
		bpf_spin_unlock(lock);
		return -3;
	}
	bpf_spin_unlock(lock);
	return 0;
}
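
/* Allocate a refcounted node with the given key, stash one reference in
 * stashed_nodes[idx] via bpf_kptr_xchg and insert the other into the rbtree.
 */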
static long __stash_map_insert_tree(int idx, int val, struct bpf_rb_root *root,
				    struct bpf_spin_lock *lock)
{
	struct map_value *mapval;
	struct node_data *n, *m;

	mapval = bpf_map_lookup_elem(&stashed_nodes, &idx);
	if (!mapval)
		return -1;

	n = bpf_obj_new(typeof(*n));
	if (!n)
		return -2;

	n->key = val;
	m = bpf_refcount_acquire(n);

	n = bpf_kptr_xchg(&mapval->node, n);
	if (n) {
		bpf_obj_drop(n);
		bpf_obj_drop(m);
		return -3;
	}

	bpf_spin_lock(lock);
	if (bpf_rbtree_add(root, &m->r, less)) {
		/* Failure to insert - unexpected */
		bpf_spin_unlock(lock);
		return -4;
	}
	bpf_spin_unlock(lock);
	return 0;
}
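
/* Return the key of the leftmost tree node; if remove_from_tree is true, also
 * remove that node from the tree and drop the reference returned by
 * bpf_rbtree_remove.
 */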
static long __read_from_tree(struct bpf_rb_root *root,
			     struct bpf_spin_lock *lock,
			     bool remove_from_tree)
{
	struct bpf_rb_node *rb;
	struct node_data *n;
	long res = -99;

	bpf_spin_lock(lock);

	rb = bpf_rbtree_first(root);
	if (!rb) {
		bpf_spin_unlock(lock);
		return -1;
	}

	n = container_of(rb, struct node_data, r);
	res = n->key;

	if (!remove_from_tree) {
		bpf_spin_unlock(lock);
		return res;
	}

	rb = bpf_rbtree_remove(root, rb);
	bpf_spin_unlock(lock);
	if (!rb)
		return -2;
	n = container_of(rb, struct node_data, r);
	bpf_obj_drop(n);
	return res;
}
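
/* Pop the first list node and return its list_data; if remove_from_list is
 * false the node is pushed back onto the list, otherwise the popped reference
 * is dropped.
 */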
static long __read_from_list(struct bpf_list_head *head,
			     struct bpf_spin_lock *lock,
			     bool remove_from_list)
{
	struct bpf_list_node *l;
	struct node_data *n;
	long res = -99;

	bpf_spin_lock(lock);

	l = bpf_list_pop_front(head);
	if (!l) {
		bpf_spin_unlock(lock);
		return -1;
	}

	n = container_of(l, struct node_data, l);
	res = n->list_data;

	if (!remove_from_list) {
		if (bpf_list_push_back(head, &n->l)) {
			bpf_spin_unlock(lock);
			return -2;
		}
	}

	bpf_spin_unlock(lock);

	if (remove_from_list)
		bpf_obj_drop(n);
	return res;
}
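
/* Remove the kptr stashed at stashed_nodes[idx], return its key and drop the
 * reference.
 */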
static long __read_from_unstash(int idx)
{
	struct node_data *n = NULL;
	struct map_value *mapval;
	long val = -99;

	mapval = bpf_map_lookup_elem(&stashed_nodes, &idx);
	if (!mapval)
		return -1;

	n = bpf_kptr_xchg(&mapval->node, n);
	if (!n)
		return -2;

	val = n->key;
	bpf_obj_drop(n);
	return val;
}
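
/* Each generated program inserts one node (key 123, list_data 456) into both
 * collections, reads the tree and then the list, and returns key + list_data,
 * i.e. 579, to match __retval below.
 */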
#define INSERT_READ_BOTH(rem_tree, rem_list, desc) \
SEC("tc") \
__description(desc) \
__success __retval(579) \
long insert_and_remove_tree_##rem_tree##_list_##rem_list(void *ctx) \
{ \
	long err, tree_data, list_data; \
\
	err = __insert_in_tree_and_list(&head, &root, &lock); \
	if (err) \
		return err; \
\
	err = __read_from_tree(&root, &lock, rem_tree); \
	if (err < 0) \
		return err; \
	else \
		tree_data = err; \
\
	err = __read_from_list(&head, &lock, rem_list); \
	if (err < 0) \
		return err; \
	else \
		list_data = err; \
\
	return tree_data + list_data; \
}

/* After successful insert of struct node_data into both collections:
 * - it should have refcount = 2
 * - removing / not removing the node_data from a collection after
 *   reading should have no effect on ability to read / remove from
 *   the other collection
 */
INSERT_READ_BOTH(true, true, "insert_read_both: remove from tree + list");
INSERT_READ_BOTH(false, false, "insert_read_both: remove from neither");
INSERT_READ_BOTH(true, false, "insert_read_both: remove from tree");
INSERT_READ_BOTH(false, true, "insert_read_both: remove from list");

#undef INSERT_READ_BOTH
#define INSERT_READ_BOTH(rem_tree, rem_list, desc) \
SEC("tc") \
__description(desc) \
__success __retval(579) \
long insert_and_remove_lf_tree_##rem_tree##_list_##rem_list(void *ctx) \
{ \
	long err, tree_data, list_data; \
\
	err = __insert_in_tree_and_list(&head, &root, &lock); \
	if (err) \
		return err; \
\
	err = __read_from_list(&head, &lock, rem_list); \
	if (err < 0) \
		return err; \
	else \
		list_data = err; \
\
	err = __read_from_tree(&root, &lock, rem_tree); \
	if (err < 0) \
		return err; \
	else \
		tree_data = err; \
\
	return tree_data + list_data; \
}

/* Similar to insert_read_both, but list data is read and possibly removed
 * first
 *
 * Results should be no different than reading and possibly removing rbtree
 * node first
 */
INSERT_READ_BOTH(true, true, "insert_read_both_list_first: remove from tree + list");
INSERT_READ_BOTH(false, false, "insert_read_both_list_first: remove from neither");
INSERT_READ_BOTH(true, false, "insert_read_both_list_first: remove from tree");
INSERT_READ_BOTH(false, true, "insert_read_both_list_first: remove from list");
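
/* Each generated program inserts one node into both collections, then does
 * two read-and-remove passes on the same collection. The second pass finds
 * the collection empty and returns -1, matching __retval below.
 */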
#define INSERT_DOUBLE_READ_AND_DEL(read_fn, read_root, desc) \
SEC("tc") \
__description(desc) \
__success __retval(-1) \
long insert_double_##read_fn##_and_del_##read_root(void *ctx) \
{ \
	long err, list_data; \
\
	err = __insert_in_tree_and_list(&head, &root, &lock); \
	if (err) \
		return err; \
\
	err = read_fn(&read_root, &lock, true); \
	if (err < 0) \
		return err; \
	else \
		list_data = err; \
\
	err = read_fn(&read_root, &lock, true); \
	if (err < 0) \
		return err; \
\
	return err + list_data; \
}

/* Insert into both tree and list, then try reading-and-removing from either twice
 *
 * The second read-and-remove should fail on read step since the node has
 * already been removed
 */
INSERT_DOUBLE_READ_AND_DEL(__read_from_tree, root, "insert_double_del: 2x read-and-del from tree");
INSERT_DOUBLE_READ_AND_DEL(__read_from_list, head, "insert_double_del: 2x read-and-del from list");
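
/* Each generated program stashes a node with key 42 and inserts the same node
 * into the tree, then reads from the tree and from the unstashed kptr, so it
 * returns 42 + 42 = 84 to match __retval below.
 */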
#define INSERT_STASH_READ(rem_tree, desc) \
SEC("tc") \
__description(desc) \
__success __retval(84) \
long insert_rbtree_and_stash__del_tree_##rem_tree(void *ctx) \
{ \
	long err, tree_data, map_data; \
\
	err = __stash_map_insert_tree(0, 42, &root, &lock); \
	if (err) \
		return err; \
\
	err = __read_from_tree(&root, &lock, rem_tree); \
	if (err < 0) \
		return err; \
	else \
		tree_data = err; \
\
	err = __read_from_unstash(0); \
	if (err < 0) \
		return err; \
	else \
		map_data = err; \
\
	return tree_data + map_data; \
}

/* Stash a refcounted node in map_val, insert same node into tree, then try
 * reading data from tree then unstashed map_val, possibly removing from tree
 *
 * Removing from tree should have no effect on map_val kptr validity
 */
INSERT_STASH_READ(true, "insert_stash_read: remove from tree");
INSERT_STASH_READ(false, "insert_stash_read: don't remove from tree");
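
/* Once n is added to aroot it becomes a non-owning reference, so
 * bpf_refcount_acquire() on it can fail and its return value must be
 * NULL-checked.
 */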
SEC("tc")
__success
long rbtree_refcounted_node_ref_escapes(void *ctx)
{
	struct node_acquire *n, *m;

	n = bpf_obj_new(typeof(*n));
	if (!n)
		return 1;

	bpf_spin_lock(&alock);
	bpf_rbtree_add(&aroot, &n->node, less_a);
	m = bpf_refcount_acquire(n);
	bpf_spin_unlock(&alock);
	if (!m)
		return 2;

	m->key = 2;
	bpf_obj_drop(m);
	return 0;
}

SEC("tc")
__success
long rbtree_refcounted_node_ref_escapes_owning_input(void *ctx)
{
	struct node_acquire *n, *m;

	n = bpf_obj_new(typeof(*n));
	if (!n)
		return 1;

	m = bpf_refcount_acquire(n);
	m->key = 2;

	bpf_spin_lock(&alock);
	bpf_rbtree_add(&aroot, &n->node, less_a);
	bpf_spin_unlock(&alock);

	bpf_obj_drop(m);

	return 0;
}
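
/* The rbtree_wrong_owner_remove_fail_{a1,b,a2} programs below recreate the
 * race that the bpf_rb_node owner field prevents: A1 stashes two references
 * to the same node, B adds the node to broot under block, and A2 then pushes
 * it onto head and attempts bpf_rbtree_remove(&root, ...) while holding lock.
 * The remove must fail because the node is owned by broot, not root.
 */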
static long __stash_map_empty_xchg(struct node_data *n, int idx)
{
	struct map_value *mapval = bpf_map_lookup_elem(&stashed_nodes, &idx);

	if (!mapval) {
		bpf_obj_drop(n);
		return 1;
	}
	n = bpf_kptr_xchg(&mapval->node, n);
	if (n) {
		bpf_obj_drop(n);
		return 2;
	}
	return 0;
}

SEC("tc")
long rbtree_wrong_owner_remove_fail_a1(void *ctx)
{
	struct node_data *n, *m;

	n = bpf_obj_new(typeof(*n));
	if (!n)
		return 1;
	m = bpf_refcount_acquire(n);

	if (__stash_map_empty_xchg(n, 0)) {
		bpf_obj_drop(m);
		return 2;
	}

	if (__stash_map_empty_xchg(m, 1))
		return 3;

	return 0;
}
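
/* Prog B: take the node stashed at index 0 and add it to broot under block. */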
SEC("tc")
long rbtree_wrong_owner_remove_fail_b(void *ctx)
{
	struct map_value *mapval;
	struct node_data *n;
	int idx = 0;

	mapval = bpf_map_lookup_elem(&stashed_nodes, &idx);
	if (!mapval)
		return 1;

	n = bpf_kptr_xchg(&mapval->node, NULL);
	if (!n)
		return 2;

	bpf_spin_lock(&block);

	bpf_rbtree_add(&broot, &n->r, less);

	bpf_spin_unlock(&block);
	return 0;
}
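
/* Prog A2: take the node stashed at index 1, turn it into a non-owning
 * reference by pushing it onto head, then attempt to remove it from root.
 * bpf_rbtree_remove() should return NULL since the node is in broot.
 */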
SEC("tc")
long rbtree_wrong_owner_remove_fail_a2(void *ctx)
{
	struct map_value *mapval;
	struct bpf_rb_node *res;
	struct node_data *m;
	int idx = 1;

	mapval = bpf_map_lookup_elem(&stashed_nodes, &idx);
	if (!mapval)
		return 1;

	m = bpf_kptr_xchg(&mapval->node, NULL);
	if (!m)
		return 2;
	bpf_spin_lock(&lock);

	/* make m non-owning ref */
	bpf_list_push_back(&head, &m->l);
	res = bpf_rbtree_remove(&root, &m->r);

	bpf_spin_unlock(&lock);
	if (res) {
		bpf_obj_drop(container_of(res, struct node_data, r));
		return 3;
	}
	return 0;
}
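
/* Sleepable program: the rbtree kfuncs are called inside an explicit
 * bpf_rcu_read_lock() section in addition to the spin lock.
 */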
SEC("?fentry.s/bpf_testmod_test_read")
__success
int BPF_PROG(rbtree_sleepable_rcu,
	     struct file *file, struct kobject *kobj,
	     struct bin_attribute *bin_attr, char *buf, loff_t off, size_t len)
{
	struct bpf_rb_node *rb;
	struct node_data *n, *m = NULL;

	n = bpf_obj_new(typeof(*n));
	if (!n)
		return 0;

	bpf_rcu_read_lock();
	bpf_spin_lock(&lock);
	bpf_rbtree_add(&root, &n->r, less);
	rb = bpf_rbtree_first(&root);
	if (!rb)
		goto err_out;

	rb = bpf_rbtree_remove(&root, rb);
	if (!rb)
		goto err_out;

	m = container_of(rb, struct node_data, r);

err_out:
	bpf_spin_unlock(&lock);
	bpf_rcu_read_unlock();
	if (m)
		bpf_obj_drop(m);
	return 0;
}
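
/* Same as rbtree_sleepable_rcu, but without the explicit bpf_rcu_read_lock()
 * and bpf_rcu_read_unlock() calls; the program is still expected to load.
 */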
SEC("?fentry.s/bpf_testmod_test_read")
__success
int BPF_PROG(rbtree_sleepable_rcu_no_explicit_rcu_lock,
	     struct file *file, struct kobject *kobj,
	     struct bin_attribute *bin_attr, char *buf, loff_t off, size_t len)
{
	struct bpf_rb_node *rb;
	struct node_data *n, *m = NULL;

	n = bpf_obj_new(typeof(*n));
	if (!n)
		return 0;

	/* No explicit bpf_rcu_read_lock */
	bpf_spin_lock(&lock);
	bpf_rbtree_add(&root, &n->r, less);
	rb = bpf_rbtree_first(&root);
	if (!rb)
		goto err_out;

	rb = bpf_rbtree_remove(&root, rb);
	if (!rb)
		goto err_out;

	m = container_of(rb, struct node_data, r);

err_out:
	bpf_spin_unlock(&lock);
	/* No explicit bpf_rcu_read_unlock */
	if (m)
		bpf_obj_drop(m);
	return 0;
}

char _license[] SEC("license") = "GPL";