linux/Documentation/bpf/map_lru_hash_update.dot
Willem de Bruijn d4adf1c9ee bpf: Adjust free target to avoid global starvation of LRU map
BPF_MAP_TYPE_LRU_HASH can recycle most recent elements well before the
map is full, due to percpu reservations and force shrink before
neighbor stealing. Once a CPU is unable to borrow from the global map,
it will once steal one elem from a neighbor and after that each time
flush this one element to the global list and immediately recycle it.

Batch value LOCAL_FREE_TARGET (128) will exhaust a 10K element map
with 79 CPUs. CPU 79 will observe this behavior even while its
neighbors hold 78 * 127 + 1 * 15 == 9921 free elements (99%).

CPUs need not be active concurrently. The issue can appear with
affinity migration, e.g., irqbalance. Each CPU can reserve and then
hold onto its 128 elements indefinitely.

Avoid global list exhaustion by limiting aggregate percpu caches to
half of map size, by adjusting LOCAL_FREE_TARGET based on cpu count.
This change has no effect on sufficiently large tables.

Similar to LOCAL_NR_SCANS and lru->nr_scans, introduce a map variable
lru->free_target. The extra field fits in a hole in struct bpf_lru.
The cacheline is already warm where read in the hot path. The field is
only accessed with the lru lock held.

Tested-by: Anton Protopopov <a.s.protopopov@gmail.com>
Signed-off-by: Willem de Bruijn <willemb@google.com>
Acked-by: Stanislav Fomichev <sdf@fomichev.me>
Link: https://lore.kernel.org/r/20250618215803.3587312-1-willemdebruijn.kernel@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2025-06-18 18:50:14 -07:00

172 lines
7.3 KiB
Text

// SPDX-License-Identifier: GPL-2.0-only
// Copyright (C) 2022-2023 Isovalent, Inc.
digraph {
node [colorscheme=accent4,style=filled] # Apply colorscheme to all nodes
graph [splines=ortho, nodesep=1]
subgraph cluster_key {
label = "Key\n(locks held during operation)";
rankdir = TB;
remote_lock [shape=rectangle,fillcolor=4,label="remote CPU LRU lock"]
hash_lock [shape=rectangle,fillcolor=3,label="hashtab lock"]
lru_lock [shape=rectangle,fillcolor=2,label="LRU lock"]
local_lock [shape=rectangle,fillcolor=1,label="local CPU LRU lock"]
no_lock [shape=rectangle,label="no locks held"]
}
begin [shape=oval,label="begin\nbpf_map_update()"]
// Nodes below with an 'fn_' prefix are roughly labeled by the C function
// names that initiate the corresponding logic in kernel/bpf/bpf_lru_list.c.
// Number suffixes and errno suffixes handle subsections of the corresponding
// logic in the function as of the writing of this dot.
// cf. __local_list_pop_free() / bpf_percpu_lru_pop_free()
local_freelist_check [shape=diamond,fillcolor=1,
label="Local freelist\nnode available?"];
use_local_node [shape=rectangle,
label="Use node owned\nby this CPU"]
// cf. bpf_lru_pop_free()
common_lru_check [shape=diamond,
label="Map created with\ncommon LRU?\n(!BPF_F_NO_COMMON_LRU)"];
fn_bpf_lru_list_pop_free_to_local [shape=rectangle,fillcolor=2,
label="Flush local pending,
Rotate Global list, move
target_free
from global -> local"]
// Also corresponds to:
// fn__local_list_flush()
// fn_bpf_lru_list_rotate()
fn___bpf_lru_node_move_to_free[shape=diamond,fillcolor=2,
label="Able to free\ntarget_free\nnodes?"]
fn___bpf_lru_list_shrink_inactive [shape=rectangle,fillcolor=3,
label="Shrink inactive list
up to remaining
target_free
(global LRU -> local)"]
fn___bpf_lru_list_shrink [shape=diamond,fillcolor=2,
label="> 0 entries in\nlocal free list?"]
fn___bpf_lru_list_shrink2 [shape=rectangle,fillcolor=2,
label="Steal one node from
inactive, or if empty,
from active global list"]
fn___bpf_lru_list_shrink3 [shape=rectangle,fillcolor=3,
label="Try to remove\nnode from hashtab"]
local_freelist_check2 [shape=diamond,label="Htab removal\nsuccessful?"]
common_lru_check2 [shape=diamond,
label="Map created with\ncommon LRU?\n(!BPF_F_NO_COMMON_LRU)"];
subgraph cluster_remote_lock {
label = "Iterate through CPUs\n(start from current)";
style = dashed;
rankdir=LR;
local_freelist_check5 [shape=diamond,fillcolor=4,
label="Steal a node from\nper-cpu freelist?"]
local_freelist_check6 [shape=rectangle,fillcolor=4,
label="Steal a node from
(1) Unreferenced pending, or
(2) Any pending node"]
local_freelist_check7 [shape=rectangle,fillcolor=3,
label="Try to remove\nnode from hashtab"]
fn_htab_lru_map_update_elem [shape=diamond,
label="Stole node\nfrom remote\nCPU?"]
fn_htab_lru_map_update_elem2 [shape=diamond,label="Iterated\nall CPUs?"]
// Also corresponds to:
// use_local_node()
// fn__local_list_pop_pending()
}
fn_bpf_lru_list_pop_free_to_local2 [shape=rectangle,
label="Use node that was\nnot recently referenced"]
local_freelist_check4 [shape=rectangle,
label="Use node that was\nactively referenced\nin global list"]
fn_htab_lru_map_update_elem_ENOMEM [shape=oval,label="return -ENOMEM"]
fn_htab_lru_map_update_elem3 [shape=rectangle,
label="Use node that was\nactively referenced\nin (another?) CPU's cache"]
fn_htab_lru_map_update_elem4 [shape=rectangle,fillcolor=3,
label="Update hashmap\nwith new element"]
fn_htab_lru_map_update_elem5 [shape=oval,label="return 0"]
fn_htab_lru_map_update_elem_EBUSY [shape=oval,label="return -EBUSY"]
fn_htab_lru_map_update_elem_EEXIST [shape=oval,label="return -EEXIST"]
fn_htab_lru_map_update_elem_ENOENT [shape=oval,label="return -ENOENT"]
begin -> local_freelist_check
local_freelist_check -> use_local_node [xlabel="Y"]
local_freelist_check -> common_lru_check [xlabel="N"]
common_lru_check -> fn_bpf_lru_list_pop_free_to_local [xlabel="Y"]
common_lru_check -> fn___bpf_lru_list_shrink_inactive [xlabel="N"]
fn_bpf_lru_list_pop_free_to_local -> fn___bpf_lru_node_move_to_free
fn___bpf_lru_node_move_to_free ->
fn_bpf_lru_list_pop_free_to_local2 [xlabel="Y"]
fn___bpf_lru_node_move_to_free ->
fn___bpf_lru_list_shrink_inactive [xlabel="N"]
fn___bpf_lru_list_shrink_inactive -> fn___bpf_lru_list_shrink
fn___bpf_lru_list_shrink -> fn_bpf_lru_list_pop_free_to_local2 [xlabel = "Y"]
fn___bpf_lru_list_shrink -> fn___bpf_lru_list_shrink2 [xlabel="N"]
fn___bpf_lru_list_shrink2 -> fn___bpf_lru_list_shrink3
fn___bpf_lru_list_shrink3 -> local_freelist_check2
local_freelist_check2 -> local_freelist_check4 [xlabel = "Y"]
local_freelist_check2 -> common_lru_check2 [xlabel = "N"]
common_lru_check2 -> local_freelist_check5 [xlabel = "Y"]
common_lru_check2 -> fn_htab_lru_map_update_elem_ENOMEM [xlabel = "N"]
local_freelist_check5 -> fn_htab_lru_map_update_elem [xlabel = "Y"]
local_freelist_check5 -> local_freelist_check6 [xlabel = "N"]
local_freelist_check6 -> local_freelist_check7
local_freelist_check7 -> fn_htab_lru_map_update_elem
fn_htab_lru_map_update_elem -> fn_htab_lru_map_update_elem3 [xlabel = "Y"]
fn_htab_lru_map_update_elem -> fn_htab_lru_map_update_elem2 [xlabel = "N"]
fn_htab_lru_map_update_elem2 ->
fn_htab_lru_map_update_elem_ENOMEM [xlabel = "Y"]
fn_htab_lru_map_update_elem2 -> local_freelist_check5 [xlabel = "N"]
fn_htab_lru_map_update_elem3 -> fn_htab_lru_map_update_elem4
use_local_node -> fn_htab_lru_map_update_elem4
fn_bpf_lru_list_pop_free_to_local2 -> fn_htab_lru_map_update_elem4
local_freelist_check4 -> fn_htab_lru_map_update_elem4
fn_htab_lru_map_update_elem4 -> fn_htab_lru_map_update_elem5 [headlabel="Success"]
fn_htab_lru_map_update_elem4 ->
fn_htab_lru_map_update_elem_EBUSY [xlabel="Hashtab lock failed"]
fn_htab_lru_map_update_elem4 ->
fn_htab_lru_map_update_elem_EEXIST [xlabel="BPF_EXIST set and\nkey already exists"]
fn_htab_lru_map_update_elem4 ->
fn_htab_lru_map_update_elem_ENOENT [headlabel="BPF_NOEXIST set\nand no such entry"]
// Create invisible pad nodes to line up various nodes
pad0 [style=invis]
pad1 [style=invis]
pad2 [style=invis]
pad3 [style=invis]
pad4 [style=invis]
// Line up the key with the top of the graph
no_lock -> local_lock [style=invis]
local_lock -> lru_lock [style=invis]
lru_lock -> hash_lock [style=invis]
hash_lock -> remote_lock [style=invis]
remote_lock -> local_freelist_check5 [style=invis]
remote_lock -> fn___bpf_lru_list_shrink [style=invis]
// Line up return code nodes at the bottom of the graph
fn_htab_lru_map_update_elem -> pad0 [style=invis]
pad0 -> pad1 [style=invis]
pad1 -> pad2 [style=invis]
//pad2-> fn_htab_lru_map_update_elem_ENOMEM [style=invis]
fn_htab_lru_map_update_elem4 -> pad3 [style=invis]
pad3 -> fn_htab_lru_map_update_elem5 [style=invis]
pad3 -> fn_htab_lru_map_update_elem_EBUSY [style=invis]
pad3 -> fn_htab_lru_map_update_elem_EEXIST [style=invis]
pad3 -> fn_htab_lru_map_update_elem_ENOENT [style=invis]
// Reduce diagram width by forcing some nodes to appear above others
local_freelist_check4 -> fn_htab_lru_map_update_elem3 [style=invis]
common_lru_check2 -> pad4 [style=invis]
pad4 -> local_freelist_check5 [style=invis]
}