mirror of
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-08-05 16:54:27 +00:00

BPF_MAP_TYPE_LRU_HASH can recycle most recent elements well before the map is full, due to percpu reservations and force shrink before neighbor stealing. Once a CPU is unable to borrow from the global map, it will once steal one elem from a neighbor and after that each time flush this one element to the global list and immediately recycle it. Batch value LOCAL_FREE_TARGET (128) will exhaust a 10K element map with 79 CPUs. CPU 79 will observe this behavior even while its neighbors hold 78 * 127 + 1 * 15 == 9921 free elements (99%). CPUs need not be active concurrently. The issue can appear with affinity migration, e.g., irqbalance. Each CPU can reserve and then hold onto its 128 elements indefinitely. Avoid global list exhaustion by limiting aggregate percpu caches to half of map size, by adjusting LOCAL_FREE_TARGET based on cpu count. This change has no effect on sufficiently large tables. Similar to LOCAL_NR_SCANS and lru->nr_scans, introduce a map variable lru->free_target. The extra field fits in a hole in struct bpf_lru. The cacheline is already warm where read in the hot path. The field is only accessed with the lru lock held. Tested-by: Anton Protopopov <a.s.protopopov@gmail.com> Signed-off-by: Willem de Bruijn <willemb@google.com> Acked-by: Stanislav Fomichev <sdf@fomichev.me> Link: https://lore.kernel.org/r/20250618215803.3587312-1-willemdebruijn.kernel@gmail.com Signed-off-by: Alexei Starovoitov <ast@kernel.org>
80 lines
1.9 KiB
C
80 lines
1.9 KiB
C
/* SPDX-License-Identifier: GPL-2.0-only */
|
|
/* Copyright (c) 2016 Facebook
|
|
*/
|
|
#ifndef __BPF_LRU_LIST_H_
|
|
#define __BPF_LRU_LIST_H_
|
|
|
|
#include <linux/cache.h>
|
|
#include <linux/list.h>
|
|
#include <linux/spinlock_types.h>
|
|
|
|
#define NR_BPF_LRU_LIST_T (3)
|
|
#define NR_BPF_LRU_LIST_COUNT (2)
|
|
#define NR_BPF_LRU_LOCAL_LIST_T (2)
|
|
#define BPF_LOCAL_LIST_T_OFFSET NR_BPF_LRU_LIST_T
|
|
|
|
enum bpf_lru_list_type {
|
|
BPF_LRU_LIST_T_ACTIVE,
|
|
BPF_LRU_LIST_T_INACTIVE,
|
|
BPF_LRU_LIST_T_FREE,
|
|
BPF_LRU_LOCAL_LIST_T_FREE,
|
|
BPF_LRU_LOCAL_LIST_T_PENDING,
|
|
};
|
|
|
|
struct bpf_lru_node {
|
|
struct list_head list;
|
|
u16 cpu;
|
|
u8 type;
|
|
u8 ref;
|
|
};
|
|
|
|
struct bpf_lru_list {
|
|
struct list_head lists[NR_BPF_LRU_LIST_T];
|
|
unsigned int counts[NR_BPF_LRU_LIST_COUNT];
|
|
/* The next inactive list rotation starts from here */
|
|
struct list_head *next_inactive_rotation;
|
|
|
|
raw_spinlock_t lock ____cacheline_aligned_in_smp;
|
|
};
|
|
|
|
struct bpf_lru_locallist {
|
|
struct list_head lists[NR_BPF_LRU_LOCAL_LIST_T];
|
|
u16 next_steal;
|
|
raw_spinlock_t lock;
|
|
};
|
|
|
|
struct bpf_common_lru {
|
|
struct bpf_lru_list lru_list;
|
|
struct bpf_lru_locallist __percpu *local_list;
|
|
};
|
|
|
|
typedef bool (*del_from_htab_func)(void *arg, struct bpf_lru_node *node);
|
|
|
|
struct bpf_lru {
|
|
union {
|
|
struct bpf_common_lru common_lru;
|
|
struct bpf_lru_list __percpu *percpu_lru;
|
|
};
|
|
del_from_htab_func del_from_htab;
|
|
void *del_arg;
|
|
unsigned int hash_offset;
|
|
unsigned int target_free;
|
|
unsigned int nr_scans;
|
|
bool percpu;
|
|
};
|
|
|
|
static inline void bpf_lru_node_set_ref(struct bpf_lru_node *node)
|
|
{
|
|
if (!READ_ONCE(node->ref))
|
|
WRITE_ONCE(node->ref, 1);
|
|
}
|
|
|
|
int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset,
|
|
del_from_htab_func del_from_htab, void *delete_arg);
|
|
void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset,
|
|
u32 elem_size, u32 nr_elems);
|
|
void bpf_lru_destroy(struct bpf_lru *lru);
|
|
struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru, u32 hash);
|
|
void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node);
|
|
|
|
#endif
|