mirror of
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-08-05 16:54:27 +00:00

BPF_MAP_TYPE_LRU_HASH can recycle most recent elements well before the map is full, due to percpu reservations and force shrink before neighbor stealing. Once a CPU is unable to borrow from the global map, it will once steal one elem from a neighbor and after that each time flush this one element to the global list and immediately recycle it. Batch value LOCAL_FREE_TARGET (128) will exhaust a 10K element map with 79 CPUs. CPU 79 will observe this behavior even while its neighbors hold 78 * 127 + 1 * 15 == 9921 free elements (99%). CPUs need not be active concurrently. The issue can appear with affinity migration, e.g., irqbalance. Each CPU can reserve and then hold onto its 128 elements indefinitely. Avoid global list exhaustion by limiting aggregate percpu caches to half of map size, by adjusting LOCAL_FREE_TARGET based on cpu count. This change has no effect on sufficiently large tables. Similar to LOCAL_NR_SCANS and lru->nr_scans, introduce a map variable lru->free_target. The extra field fits in a hole in struct bpf_lru. The cacheline is already warm where read in the hot path. The field is only accessed with the lru lock held. Tested-by: Anton Protopopov <a.s.protopopov@gmail.com> Signed-off-by: Willem de Bruijn <willemb@google.com> Acked-by: Stanislav Fomichev <sdf@fomichev.me> Link: https://lore.kernel.org/r/20250618215803.3587312-1-willemdebruijn.kernel@gmail.com Signed-off-by: Alexei Starovoitov <ast@kernel.org>
172 lines
7.3 KiB
Text
172 lines
7.3 KiB
Text
// SPDX-License-Identifier: GPL-2.0-only
|
|
// Copyright (C) 2022-2023 Isovalent, Inc.
|
|
digraph {
|
|
node [colorscheme=accent4,style=filled] # Apply colorscheme to all nodes
|
|
graph [splines=ortho, nodesep=1]
|
|
|
|
subgraph cluster_key {
|
|
label = "Key\n(locks held during operation)";
|
|
rankdir = TB;
|
|
|
|
remote_lock [shape=rectangle,fillcolor=4,label="remote CPU LRU lock"]
|
|
hash_lock [shape=rectangle,fillcolor=3,label="hashtab lock"]
|
|
lru_lock [shape=rectangle,fillcolor=2,label="LRU lock"]
|
|
local_lock [shape=rectangle,fillcolor=1,label="local CPU LRU lock"]
|
|
no_lock [shape=rectangle,label="no locks held"]
|
|
}
|
|
|
|
begin [shape=oval,label="begin\nbpf_map_update()"]
|
|
|
|
// Nodes below with an 'fn_' prefix are roughly labeled by the C function
|
|
// names that initiate the corresponding logic in kernel/bpf/bpf_lru_list.c.
|
|
// Number suffixes and errno suffixes handle subsections of the corresponding
|
|
// logic in the function as of the writing of this dot.
|
|
|
|
// cf. __local_list_pop_free() / bpf_percpu_lru_pop_free()
|
|
local_freelist_check [shape=diamond,fillcolor=1,
|
|
label="Local freelist\nnode available?"];
|
|
use_local_node [shape=rectangle,
|
|
label="Use node owned\nby this CPU"]
|
|
|
|
// cf. bpf_lru_pop_free()
|
|
common_lru_check [shape=diamond,
|
|
label="Map created with\ncommon LRU?\n(!BPF_F_NO_COMMON_LRU)"];
|
|
|
|
fn_bpf_lru_list_pop_free_to_local [shape=rectangle,fillcolor=2,
|
|
label="Flush local pending,
|
|
Rotate Global list, move
|
|
target_free
|
|
from global -> local"]
|
|
// Also corresponds to:
|
|
// fn__local_list_flush()
|
|
// fn_bpf_lru_list_rotate()
|
|
fn___bpf_lru_node_move_to_free[shape=diamond,fillcolor=2,
|
|
label="Able to free\ntarget_free\nnodes?"]
|
|
|
|
fn___bpf_lru_list_shrink_inactive [shape=rectangle,fillcolor=3,
|
|
label="Shrink inactive list
|
|
up to remaining
|
|
target_free
|
|
(global LRU -> local)"]
|
|
fn___bpf_lru_list_shrink [shape=diamond,fillcolor=2,
|
|
label="> 0 entries in\nlocal free list?"]
|
|
fn___bpf_lru_list_shrink2 [shape=rectangle,fillcolor=2,
|
|
label="Steal one node from
|
|
inactive, or if empty,
|
|
from active global list"]
|
|
fn___bpf_lru_list_shrink3 [shape=rectangle,fillcolor=3,
|
|
label="Try to remove\nnode from hashtab"]
|
|
|
|
local_freelist_check2 [shape=diamond,label="Htab removal\nsuccessful?"]
|
|
common_lru_check2 [shape=diamond,
|
|
label="Map created with\ncommon LRU?\n(!BPF_F_NO_COMMON_LRU)"];
|
|
|
|
subgraph cluster_remote_lock {
|
|
label = "Iterate through CPUs\n(start from current)";
|
|
style = dashed;
|
|
rankdir=LR;
|
|
|
|
local_freelist_check5 [shape=diamond,fillcolor=4,
|
|
label="Steal a node from\nper-cpu freelist?"]
|
|
local_freelist_check6 [shape=rectangle,fillcolor=4,
|
|
label="Steal a node from
|
|
(1) Unreferenced pending, or
|
|
(2) Any pending node"]
|
|
local_freelist_check7 [shape=rectangle,fillcolor=3,
|
|
label="Try to remove\nnode from hashtab"]
|
|
fn_htab_lru_map_update_elem [shape=diamond,
|
|
label="Stole node\nfrom remote\nCPU?"]
|
|
fn_htab_lru_map_update_elem2 [shape=diamond,label="Iterated\nall CPUs?"]
|
|
// Also corresponds to:
|
|
// use_local_node()
|
|
// fn__local_list_pop_pending()
|
|
}
|
|
|
|
fn_bpf_lru_list_pop_free_to_local2 [shape=rectangle,
|
|
label="Use node that was\nnot recently referenced"]
|
|
local_freelist_check4 [shape=rectangle,
|
|
label="Use node that was\nactively referenced\nin global list"]
|
|
fn_htab_lru_map_update_elem_ENOMEM [shape=oval,label="return -ENOMEM"]
|
|
fn_htab_lru_map_update_elem3 [shape=rectangle,
|
|
label="Use node that was\nactively referenced\nin (another?) CPU's cache"]
|
|
fn_htab_lru_map_update_elem4 [shape=rectangle,fillcolor=3,
|
|
label="Update hashmap\nwith new element"]
|
|
fn_htab_lru_map_update_elem5 [shape=oval,label="return 0"]
|
|
fn_htab_lru_map_update_elem_EBUSY [shape=oval,label="return -EBUSY"]
|
|
fn_htab_lru_map_update_elem_EEXIST [shape=oval,label="return -EEXIST"]
|
|
fn_htab_lru_map_update_elem_ENOENT [shape=oval,label="return -ENOENT"]
|
|
|
|
begin -> local_freelist_check
|
|
local_freelist_check -> use_local_node [xlabel="Y"]
|
|
local_freelist_check -> common_lru_check [xlabel="N"]
|
|
common_lru_check -> fn_bpf_lru_list_pop_free_to_local [xlabel="Y"]
|
|
common_lru_check -> fn___bpf_lru_list_shrink_inactive [xlabel="N"]
|
|
fn_bpf_lru_list_pop_free_to_local -> fn___bpf_lru_node_move_to_free
|
|
fn___bpf_lru_node_move_to_free ->
|
|
fn_bpf_lru_list_pop_free_to_local2 [xlabel="Y"]
|
|
fn___bpf_lru_node_move_to_free ->
|
|
fn___bpf_lru_list_shrink_inactive [xlabel="N"]
|
|
fn___bpf_lru_list_shrink_inactive -> fn___bpf_lru_list_shrink
|
|
fn___bpf_lru_list_shrink -> fn_bpf_lru_list_pop_free_to_local2 [xlabel = "Y"]
|
|
fn___bpf_lru_list_shrink -> fn___bpf_lru_list_shrink2 [xlabel="N"]
|
|
fn___bpf_lru_list_shrink2 -> fn___bpf_lru_list_shrink3
|
|
fn___bpf_lru_list_shrink3 -> local_freelist_check2
|
|
local_freelist_check2 -> local_freelist_check4 [xlabel = "Y"]
|
|
local_freelist_check2 -> common_lru_check2 [xlabel = "N"]
|
|
common_lru_check2 -> local_freelist_check5 [xlabel = "Y"]
|
|
common_lru_check2 -> fn_htab_lru_map_update_elem_ENOMEM [xlabel = "N"]
|
|
local_freelist_check5 -> fn_htab_lru_map_update_elem [xlabel = "Y"]
|
|
local_freelist_check5 -> local_freelist_check6 [xlabel = "N"]
|
|
local_freelist_check6 -> local_freelist_check7
|
|
local_freelist_check7 -> fn_htab_lru_map_update_elem
|
|
|
|
fn_htab_lru_map_update_elem -> fn_htab_lru_map_update_elem3 [xlabel = "Y"]
|
|
fn_htab_lru_map_update_elem -> fn_htab_lru_map_update_elem2 [xlabel = "N"]
|
|
fn_htab_lru_map_update_elem2 ->
|
|
fn_htab_lru_map_update_elem_ENOMEM [xlabel = "Y"]
|
|
fn_htab_lru_map_update_elem2 -> local_freelist_check5 [xlabel = "N"]
|
|
fn_htab_lru_map_update_elem3 -> fn_htab_lru_map_update_elem4
|
|
|
|
use_local_node -> fn_htab_lru_map_update_elem4
|
|
fn_bpf_lru_list_pop_free_to_local2 -> fn_htab_lru_map_update_elem4
|
|
local_freelist_check4 -> fn_htab_lru_map_update_elem4
|
|
|
|
fn_htab_lru_map_update_elem4 -> fn_htab_lru_map_update_elem5 [headlabel="Success"]
|
|
fn_htab_lru_map_update_elem4 ->
|
|
fn_htab_lru_map_update_elem_EBUSY [xlabel="Hashtab lock failed"]
|
|
fn_htab_lru_map_update_elem4 ->
|
|
fn_htab_lru_map_update_elem_EEXIST [xlabel="BPF_EXIST set and\nkey already exists"]
|
|
fn_htab_lru_map_update_elem4 ->
|
|
fn_htab_lru_map_update_elem_ENOENT [headlabel="BPF_NOEXIST set\nand no such entry"]
|
|
|
|
// Create invisible pad nodes to line up various nodes
|
|
pad0 [style=invis]
|
|
pad1 [style=invis]
|
|
pad2 [style=invis]
|
|
pad3 [style=invis]
|
|
pad4 [style=invis]
|
|
|
|
// Line up the key with the top of the graph
|
|
no_lock -> local_lock [style=invis]
|
|
local_lock -> lru_lock [style=invis]
|
|
lru_lock -> hash_lock [style=invis]
|
|
hash_lock -> remote_lock [style=invis]
|
|
remote_lock -> local_freelist_check5 [style=invis]
|
|
remote_lock -> fn___bpf_lru_list_shrink [style=invis]
|
|
|
|
// Line up return code nodes at the bottom of the graph
|
|
fn_htab_lru_map_update_elem -> pad0 [style=invis]
|
|
pad0 -> pad1 [style=invis]
|
|
pad1 -> pad2 [style=invis]
|
|
//pad2-> fn_htab_lru_map_update_elem_ENOMEM [style=invis]
|
|
fn_htab_lru_map_update_elem4 -> pad3 [style=invis]
|
|
pad3 -> fn_htab_lru_map_update_elem5 [style=invis]
|
|
pad3 -> fn_htab_lru_map_update_elem_EBUSY [style=invis]
|
|
pad3 -> fn_htab_lru_map_update_elem_EEXIST [style=invis]
|
|
pad3 -> fn_htab_lru_map_update_elem_ENOENT [style=invis]
|
|
|
|
// Reduce diagram width by forcing some nodes to appear above others
|
|
local_freelist_check4 -> fn_htab_lru_map_update_elem3 [style=invis]
|
|
common_lru_check2 -> pad4 [style=invis]
|
|
pad4 -> local_freelist_check5 [style=invis]
|
|
}
|