// SPDX-License-Identifier: GPL-2.0

/* bpf_fq is intended for testing the bpf qdisc infrastructure and not a direct
 * copy of sch_fq. bpf_fq implements the scheduling algorithm of sch_fq before
 * 29f834aa326e ("net_sched: sch_fq: add 3 bands and WRR scheduling") was
 * introduced. It gives each flow a fair chance to transmit packets in a
 * round-robin fashion. Note that for flow pacing, bpf_fq currently only
 * respects skb->tstamp but not skb->sk->sk_pacing_rate. In addition, if there
 * are multiple bpf_fq instances, they will have a shared view of flows and
 * configuration since some key data structures such as fq_prio_flows,
 * fq_nonprio_flows, and fq_bpf_data are global.
 *
 * To use bpf_fq alone without running selftests, use the following commands.
 *
 * 1. Register bpf_fq to the kernel
 *    bpftool struct_ops register bpf_qdisc_fq.bpf.o /sys/fs/bpf
 * 2. Add bpf_fq to an interface
 *    tc qdisc add dev <interface name> root handle <handle> bpf_fq
 * 3. Delete bpf_fq attached to the interface
 *    tc qdisc delete dev <interface name> root
 * 4. Unregister bpf_fq
 *    bpftool struct_ops unregister name fq
 *
 * The qdisc name, bpf_fq, used in tc commands is defined by Qdisc_ops.id.
 * The struct_ops map name, fq, used in the bpftool command is the name of the
 * Qdisc_ops.
 *
 * SEC(".struct_ops")
 * struct Qdisc_ops fq = {
 *	...
 *	.id = "bpf_fq",
 * };
 */

#include <vmlinux.h>
#include <errno.h>
#include <bpf/bpf_helpers.h>
#include "bpf_experimental.h"
#include "bpf_qdisc_common.h"

char _license[] SEC("license") = "GPL";

#define NSEC_PER_USEC 1000L
#define NSEC_PER_SEC 1000000000L

#define NUM_QUEUE (1 << 20)

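/* Global configuration and state shared by all bpf_fq instances. */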
struct fq_bpf_data {
	u32 quantum;
	u32 initial_quantum;
	u32 flow_refill_delay;
	u32 flow_plimit;
	u64 horizon;
	u32 orphan_mask;
	u32 timer_slack;
	u64 time_next_delayed_flow;
	u64 unthrottle_latency_ns;
	u8 horizon_drop;
	u32 new_flow_cnt;
	u32 old_flow_cnt;
	u64 ktime_cache;
};

enum {
	CLS_RET_PRIO	= 0,
	CLS_RET_NONPRIO	= 1,
	CLS_RET_ERR	= 2,
};

struct skb_node {
	u64 tstamp;
	struct sk_buff __kptr * skb;
	struct bpf_rb_node node;
};

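/* Per-flow state: an rbtree of queued packets plus credit and pacing info. */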
struct fq_flow_node {
	int credit;
	u32 qlen;
	u64 age;
	u64 time_next_packet;
	struct bpf_list_node list_node;
	struct bpf_rb_node rb_node;
	struct bpf_rb_root queue __contains(skb_node, node);
	struct bpf_spin_lock lock;
	struct bpf_refcount refcount;
};

struct dequeue_nonprio_ctx {
	bool stop_iter;
	u64 expire;
	u64 now;
};

struct remove_flows_ctx {
	bool gc_only;
	u32 reset_cnt;
	u32 reset_max;
};

struct unset_throttled_flows_ctx {
	bool unset_all;
	u64 now;
};

struct fq_stashed_flow {
	struct fq_flow_node __kptr * flow;
};

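/* Hash maps that stash one fq_flow_node kptr per flow, keyed by flow hash.
 * All priority (TC_PRIO_CONTROL) packets share the single entry in
 * fq_prio_flows.
 */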
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__type(key, __u64);
	__type(value, struct fq_stashed_flow);
	__uint(max_entries, NUM_QUEUE);
} fq_nonprio_flows SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__type(key, __u64);
	__type(value, struct fq_stashed_flow);
	__uint(max_entries, 1);
} fq_prio_flows SEC(".maps");

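/* Throttled flows sorted by wake-up time (fq_delayed), the new/old flow
 * round-robin lists, and the shared qdisc data.
 */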
private(A) struct bpf_spin_lock fq_delayed_lock;
private(A) struct bpf_rb_root fq_delayed __contains(fq_flow_node, rb_node);

private(B) struct bpf_spin_lock fq_new_flows_lock;
private(B) struct bpf_list_head fq_new_flows __contains(fq_flow_node, list_node);

private(C) struct bpf_spin_lock fq_old_flows_lock;
private(C) struct bpf_list_head fq_old_flows __contains(fq_flow_node, list_node);

private(D) struct fq_bpf_data q;

/* Wrapper for bpf_kptr_xchg that expects NULL dst */
static void bpf_kptr_xchg_back(void *map_val, void *ptr)
{
	void *ret;

	ret = bpf_kptr_xchg(map_val, ptr);
	if (ret)
		bpf_obj_drop(ret);
}

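/* rbtree less() callbacks: order queued packets by their time to send and
 * throttled flows by their next transmission time.
 */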
static bool skbn_tstamp_less(struct bpf_rb_node *a, const struct bpf_rb_node *b)
{
	struct skb_node *skbn_a;
	struct skb_node *skbn_b;

	skbn_a = container_of(a, struct skb_node, node);
	skbn_b = container_of(b, struct skb_node, node);

	return skbn_a->tstamp < skbn_b->tstamp;
}

static bool fn_time_next_packet_less(struct bpf_rb_node *a, const struct bpf_rb_node *b)
{
	struct fq_flow_node *flow_a;
	struct fq_flow_node *flow_b;

	flow_a = container_of(a, struct fq_flow_node, rb_node);
	flow_b = container_of(b, struct fq_flow_node, rb_node);

	return flow_a->time_next_packet < flow_b->time_next_packet;
}

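/* Push/pop helpers for the new/old flow lists: the list is manipulated under
 * its spin lock and the corresponding flow counter is updated.
 */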
static void
fq_flows_add_head(struct bpf_list_head *head, struct bpf_spin_lock *lock,
		  struct fq_flow_node *flow, u32 *flow_cnt)
{
	bpf_spin_lock(lock);
	bpf_list_push_front(head, &flow->list_node);
	bpf_spin_unlock(lock);
	*flow_cnt += 1;
}

static void
fq_flows_add_tail(struct bpf_list_head *head, struct bpf_spin_lock *lock,
		  struct fq_flow_node *flow, u32 *flow_cnt)
{
	bpf_spin_lock(lock);
	bpf_list_push_back(head, &flow->list_node);
	bpf_spin_unlock(lock);
	*flow_cnt += 1;
}

static void
fq_flows_remove_front(struct bpf_list_head *head, struct bpf_spin_lock *lock,
		      struct bpf_list_node **node, u32 *flow_cnt)
{
	bpf_spin_lock(lock);
	*node = bpf_list_pop_front(head);
	bpf_spin_unlock(lock);
	*flow_cnt -= 1;
}

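/* Check whether a flow list is empty by popping the front node and, if one
 * exists, pushing it right back.
 */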
static bool
fq_flows_is_empty(struct bpf_list_head *head, struct bpf_spin_lock *lock)
{
	struct bpf_list_node *node;

	bpf_spin_lock(lock);
	node = bpf_list_pop_front(head);
	if (node) {
		bpf_list_push_front(head, node);
		bpf_spin_unlock(lock);
		return false;
	}
	bpf_spin_unlock(lock);

	return true;
}

/* flow->age is used to denote the state of the flow (not-detached, detached, throttled)
 * as well as the timestamp when the flow is detached.
 *
 * 0: not-detached
 * 1 - (~0ULL-1): detached
 * ~0ULL: throttled
 */
static void fq_flow_set_detached(struct fq_flow_node *flow)
{
	flow->age = bpf_jiffies64();
}

static bool fq_flow_is_detached(struct fq_flow_node *flow)
{
	return flow->age != 0 && flow->age != ~0ULL;
}

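/* True for sockets in LISTEN or NEW_SYN_RECV state; their packets are hashed
 * by skb instead of by socket in fq_classify().
 */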
static bool sk_listener(struct sock *sk)
{
	return (1 << sk->__sk_common.skc_state) & (TCPF_LISTEN | TCPF_NEW_SYN_RECV);
}

static void fq_gc(void);

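/* Allocate a new flow node and stash it into flow_map at the given hash. If
 * the map is full, run the garbage collector and retry the insert into
 * fq_nonprio_flows.
 */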
static int fq_new_flow(void *flow_map, struct fq_stashed_flow **sflow, u64 hash)
{
	struct fq_stashed_flow tmp = {};
	struct fq_flow_node *flow;
	int ret;

	flow = bpf_obj_new(typeof(*flow));
	if (!flow)
		return -ENOMEM;

	flow->credit = q.initial_quantum;
	flow->qlen = 0;
	flow->age = 1;
	flow->time_next_packet = 0;

	ret = bpf_map_update_elem(flow_map, &hash, &tmp, 0);
	if (ret == -ENOMEM || ret == -E2BIG) {
		fq_gc();
		bpf_map_update_elem(&fq_nonprio_flows, &hash, &tmp, 0);
	}

	*sflow = bpf_map_lookup_elem(flow_map, &hash);
	if (!*sflow) {
		bpf_obj_drop(flow);
		return -ENOMEM;
	}

	bpf_kptr_xchg_back(&(*sflow)->flow, flow);
	return 0;
}

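/* Map an skb to its stashed flow: TC_PRIO_CONTROL packets share the single
 * priority flow, other packets are keyed by the socket hash, or by the skb
 * hash (with bit 32 set to avoid clashing with socket hashes) when there is
 * no usable socket. A new flow is created on demand.
 */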
static int
fq_classify(struct sk_buff *skb, struct fq_stashed_flow **sflow)
{
	struct sock *sk = skb->sk;
	int ret = CLS_RET_NONPRIO;
	u64 hash = 0;

	if ((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL) {
		*sflow = bpf_map_lookup_elem(&fq_prio_flows, &hash);
		ret = CLS_RET_PRIO;
	} else {
		if (!sk || sk_listener(sk)) {
			hash = bpf_skb_get_hash(skb) & q.orphan_mask;
			/* Avoid collision with an existing flow hash, which
			 * only uses the lower 32 bits of hash, by setting the
			 * upper half of hash to 1.
			 */
			hash |= (1ULL << 32);
		} else if (sk->__sk_common.skc_state == TCP_CLOSE) {
			hash = bpf_skb_get_hash(skb) & q.orphan_mask;
			hash |= (1ULL << 32);
		} else {
			hash = sk->__sk_common.skc_hash;
		}
		*sflow = bpf_map_lookup_elem(&fq_nonprio_flows, &hash);
	}

	if (!*sflow)
		ret = fq_new_flow(&fq_nonprio_flows, sflow, hash) < 0 ?
		      CLS_RET_ERR : CLS_RET_NONPRIO;

	return ret;
}

static bool fq_packet_beyond_horizon(struct sk_buff *skb)
{
	return (s64)skb->tstamp > (s64)(q.ktime_cache + q.horizon);
}

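/* Enqueue: validate the packet's timestamp against the pacing horizon,
 * classify it to a flow, re-attach detached flows to the new-flow list, and
 * insert the packet into the flow's rbtree ordered by time to send.
 */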
SEC("struct_ops/bpf_fq_enqueue")
int BPF_PROG(bpf_fq_enqueue, struct sk_buff *skb, struct Qdisc *sch,
	     struct bpf_sk_buff_ptr *to_free)
{
	struct fq_flow_node *flow = NULL, *flow_copy;
	struct fq_stashed_flow *sflow;
	u64 time_to_send, jiffies;
	struct skb_node *skbn;
	int ret;

	if (sch->q.qlen >= sch->limit)
		goto drop;

	if (!skb->tstamp) {
		time_to_send = q.ktime_cache = bpf_ktime_get_ns();
	} else {
		if (fq_packet_beyond_horizon(skb)) {
			q.ktime_cache = bpf_ktime_get_ns();
			if (fq_packet_beyond_horizon(skb)) {
				if (q.horizon_drop)
					goto drop;

				skb->tstamp = q.ktime_cache + q.horizon;
			}
		}
		time_to_send = skb->tstamp;
	}

	ret = fq_classify(skb, &sflow);
	if (ret == CLS_RET_ERR)
		goto drop;

	flow = bpf_kptr_xchg(&sflow->flow, flow);
	if (!flow)
		goto drop;

	if (ret == CLS_RET_NONPRIO) {
		if (flow->qlen >= q.flow_plimit) {
			bpf_kptr_xchg_back(&sflow->flow, flow);
			goto drop;
		}

		if (fq_flow_is_detached(flow)) {
			flow_copy = bpf_refcount_acquire(flow);

			jiffies = bpf_jiffies64();
			if ((s64)(jiffies - (flow_copy->age + q.flow_refill_delay)) > 0) {
				if (flow_copy->credit < q.quantum)
					flow_copy->credit = q.quantum;
			}
			flow_copy->age = 0;
			fq_flows_add_tail(&fq_new_flows, &fq_new_flows_lock, flow_copy,
					  &q.new_flow_cnt);
		}
	}

	skbn = bpf_obj_new(typeof(*skbn));
	if (!skbn) {
		bpf_kptr_xchg_back(&sflow->flow, flow);
		goto drop;
	}

	skbn->tstamp = skb->tstamp = time_to_send;

	sch->qstats.backlog += qdisc_pkt_len(skb);

	skb = bpf_kptr_xchg(&skbn->skb, skb);
	if (skb)
		bpf_qdisc_skb_drop(skb, to_free);

	bpf_spin_lock(&flow->lock);
	bpf_rbtree_add(&flow->queue, &skbn->node, skbn_tstamp_less);
	bpf_spin_unlock(&flow->lock);

	flow->qlen++;
	bpf_kptr_xchg_back(&sflow->flow, flow);

	sch->q.qlen++;
	return NET_XMIT_SUCCESS;

drop:
	bpf_qdisc_skb_drop(skb, to_free);
	sch->qstats.drops++;
	return NET_XMIT_DROP;
}

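/* bpf_loop() callback: move one flow from the throttled rbtree back to the
 * old-flow list, either unconditionally (unset_all) or only if its wake-up
 * time has passed. Returns 1 to stop the loop.
 */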
static int fq_unset_throttled_flows(u32 index, struct unset_throttled_flows_ctx *ctx)
{
	struct bpf_rb_node *node = NULL;
	struct fq_flow_node *flow;

	bpf_spin_lock(&fq_delayed_lock);

	node = bpf_rbtree_first(&fq_delayed);
	if (!node) {
		bpf_spin_unlock(&fq_delayed_lock);
		return 1;
	}

	flow = container_of(node, struct fq_flow_node, rb_node);
	if (!ctx->unset_all && flow->time_next_packet > ctx->now) {
		q.time_next_delayed_flow = flow->time_next_packet;
		bpf_spin_unlock(&fq_delayed_lock);
		return 1;
	}

	node = bpf_rbtree_remove(&fq_delayed, &flow->rb_node);

	bpf_spin_unlock(&fq_delayed_lock);

	if (!node)
		return 1;

	flow = container_of(node, struct fq_flow_node, rb_node);
	flow->age = 0;
	fq_flows_add_tail(&fq_old_flows, &fq_old_flows_lock, flow, &q.old_flow_cnt);

	return 0;
}

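/* Mark a flow as throttled and park it in the fq_delayed rbtree, keyed by its
 * next transmission time.
 */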
static void fq_flow_set_throttled(struct fq_flow_node *flow)
{
	flow->age = ~0ULL;

	if (q.time_next_delayed_flow > flow->time_next_packet)
		q.time_next_delayed_flow = flow->time_next_packet;

	bpf_spin_lock(&fq_delayed_lock);
	bpf_rbtree_add(&fq_delayed, &flow->rb_node, fn_time_next_packet_less);
	bpf_spin_unlock(&fq_delayed_lock);
}

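/* If the earliest delayed flow is due, update the unthrottle latency estimate
 * and move all expired flows back to the old-flow list.
 */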
static void fq_check_throttled(u64 now)
{
	struct unset_throttled_flows_ctx ctx = {
		.unset_all = false,
		.now = now,
	};
	unsigned long sample;

	if (q.time_next_delayed_flow > now)
		return;

	sample = (unsigned long)(now - q.time_next_delayed_flow);
	q.unthrottle_latency_ns -= q.unthrottle_latency_ns >> 3;
	q.unthrottle_latency_ns += sample >> 3;

	q.time_next_delayed_flow = ~0ULL;
	bpf_loop(NUM_QUEUE, fq_unset_throttled_flows, &ctx, 0);
}

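/* bpf_for() body: service the new-flow list first, then the old-flow list, in
 * round-robin fashion. A flow with exhausted credit is refilled and rotated to
 * the tail of the old list; a flow whose next packet is not yet due is moved
 * to the throttled rbtree; otherwise the earliest packet is dequeued and the
 * flow is charged its length.
 */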
static struct sk_buff*
fq_dequeue_nonprio_flows(u32 index, struct dequeue_nonprio_ctx *ctx)
{
	u64 time_next_packet, time_to_send;
	struct bpf_rb_node *rb_node;
	struct sk_buff *skb = NULL;
	struct bpf_list_head *head;
	struct bpf_list_node *node;
	struct bpf_spin_lock *lock;
	struct fq_flow_node *flow;
	struct skb_node *skbn;
	bool is_empty;
	u32 *cnt;

	if (q.new_flow_cnt) {
		head = &fq_new_flows;
		lock = &fq_new_flows_lock;
		cnt = &q.new_flow_cnt;
	} else if (q.old_flow_cnt) {
		head = &fq_old_flows;
		lock = &fq_old_flows_lock;
		cnt = &q.old_flow_cnt;
	} else {
		if (q.time_next_delayed_flow != ~0ULL)
			ctx->expire = q.time_next_delayed_flow;
		goto break_loop;
	}

	fq_flows_remove_front(head, lock, &node, cnt);
	if (!node)
		goto break_loop;

	flow = container_of(node, struct fq_flow_node, list_node);
	if (flow->credit <= 0) {
		flow->credit += q.quantum;
		fq_flows_add_tail(&fq_old_flows, &fq_old_flows_lock, flow, &q.old_flow_cnt);
		return NULL;
	}

	bpf_spin_lock(&flow->lock);
	rb_node = bpf_rbtree_first(&flow->queue);
	if (!rb_node) {
		bpf_spin_unlock(&flow->lock);
		is_empty = fq_flows_is_empty(&fq_old_flows, &fq_old_flows_lock);
		if (head == &fq_new_flows && !is_empty) {
			fq_flows_add_tail(&fq_old_flows, &fq_old_flows_lock, flow, &q.old_flow_cnt);
		} else {
			fq_flow_set_detached(flow);
			bpf_obj_drop(flow);
		}
		return NULL;
	}

	skbn = container_of(rb_node, struct skb_node, node);
	time_to_send = skbn->tstamp;

	time_next_packet = (time_to_send > flow->time_next_packet) ?
			   time_to_send : flow->time_next_packet;
	if (ctx->now < time_next_packet) {
		bpf_spin_unlock(&flow->lock);
		flow->time_next_packet = time_next_packet;
		fq_flow_set_throttled(flow);
		return NULL;
	}

	rb_node = bpf_rbtree_remove(&flow->queue, rb_node);
	bpf_spin_unlock(&flow->lock);

	if (!rb_node)
		goto add_flow_and_break;

	skbn = container_of(rb_node, struct skb_node, node);
	skb = bpf_kptr_xchg(&skbn->skb, skb);
	bpf_obj_drop(skbn);

	if (!skb)
		goto add_flow_and_break;

	flow->credit -= qdisc_skb_cb(skb)->pkt_len;
	flow->qlen--;

add_flow_and_break:
	fq_flows_add_head(head, lock, flow, cnt);

break_loop:
	ctx->stop_iter = true;
	return skb;
}

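/* Dequeue the earliest packet of the single priority flow, if any. */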
static struct sk_buff *fq_dequeue_prio(void)
{
	struct fq_flow_node *flow = NULL;
	struct fq_stashed_flow *sflow;
	struct bpf_rb_node *rb_node;
	struct sk_buff *skb = NULL;
	struct skb_node *skbn;
	u64 hash = 0;

	sflow = bpf_map_lookup_elem(&fq_prio_flows, &hash);
	if (!sflow)
		return NULL;

	flow = bpf_kptr_xchg(&sflow->flow, flow);
	if (!flow)
		return NULL;

	bpf_spin_lock(&flow->lock);
	rb_node = bpf_rbtree_first(&flow->queue);
	if (!rb_node) {
		bpf_spin_unlock(&flow->lock);
		goto out;
	}

	skbn = container_of(rb_node, struct skb_node, node);
	rb_node = bpf_rbtree_remove(&flow->queue, &skbn->node);
	bpf_spin_unlock(&flow->lock);

	if (!rb_node)
		goto out;

	skbn = container_of(rb_node, struct skb_node, node);
	skb = bpf_kptr_xchg(&skbn->skb, skb);
	bpf_obj_drop(skbn);

out:
	bpf_kptr_xchg_back(&sflow->flow, flow);

	return skb;
}

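/* Dequeue: serve the priority flow first, then walk the non-priority flows.
 * If nothing is ready, arm the qdisc watchdog for the next delayed flow.
 */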
SEC("struct_ops/bpf_fq_dequeue")
struct sk_buff *BPF_PROG(bpf_fq_dequeue, struct Qdisc *sch)
{
	struct dequeue_nonprio_ctx cb_ctx = {};
	struct sk_buff *skb = NULL;
	int i;

	if (!sch->q.qlen)
		goto out;

	skb = fq_dequeue_prio();
	if (skb)
		goto dequeue;

	q.ktime_cache = cb_ctx.now = bpf_ktime_get_ns();
	fq_check_throttled(q.ktime_cache);
	bpf_for(i, 0, sch->limit) {
		skb = fq_dequeue_nonprio_flows(i, &cb_ctx);
		if (cb_ctx.stop_iter)
			break;
	}

	if (skb) {
dequeue:
		sch->q.qlen--;
		sch->qstats.backlog -= qdisc_pkt_len(skb);
		bpf_qdisc_bstats_update(sch, skb);
		return skb;
	}

	if (cb_ctx.expire)
		bpf_qdisc_watchdog_schedule(sch, cb_ctx.expire, q.timer_slack);
out:
	return NULL;
}

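/* bpf_loop() callback: drop one flow node from the new-flow list, falling
 * back to the old-flow list once the former is empty. Returns 1 when both
 * lists are empty.
 */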
static int fq_remove_flows_in_list(u32 index, void *ctx)
{
	struct bpf_list_node *node;
	struct fq_flow_node *flow;

	bpf_spin_lock(&fq_new_flows_lock);
	node = bpf_list_pop_front(&fq_new_flows);
	bpf_spin_unlock(&fq_new_flows_lock);
	if (!node) {
		bpf_spin_lock(&fq_old_flows_lock);
		node = bpf_list_pop_front(&fq_old_flows);
		bpf_spin_unlock(&fq_old_flows_lock);
		if (!node)
			return 1;
	}

	flow = container_of(node, struct fq_flow_node, list_node);
	bpf_obj_drop(flow);

	return 0;
}

extern unsigned CONFIG_HZ __kconfig;

/* limit number of collected flows per round */
#define FQ_GC_MAX 8
#define FQ_GC_AGE (3*CONFIG_HZ)

static bool fq_gc_candidate(struct fq_flow_node *flow)
{
	u64 jiffies = bpf_jiffies64();

	return fq_flow_is_detached(flow) &&
	       ((s64)(jiffies - (flow->age + FQ_GC_AGE)) > 0);
}

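/* Map iterator callback: delete a stashed flow, either unconditionally or
 * only when it is a garbage-collection candidate, until reset_max entries
 * have been removed.
 */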
static int
fq_remove_flows(struct bpf_map *flow_map, u64 *hash,
		struct fq_stashed_flow *sflow, struct remove_flows_ctx *ctx)
{
	if (sflow->flow &&
	    (!ctx->gc_only || fq_gc_candidate(sflow->flow))) {
		bpf_map_delete_elem(flow_map, hash);
		ctx->reset_cnt++;
	}

	return ctx->reset_cnt < ctx->reset_max ? 0 : 1;
}

static void fq_gc(void)
{
	struct remove_flows_ctx cb_ctx = {
		.gc_only = true,
		.reset_cnt = 0,
		.reset_max = FQ_GC_MAX,
	};

	bpf_for_each_map_elem(&fq_nonprio_flows, fq_remove_flows, &cb_ctx, 0);
}

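/* Reset: drop every stashed flow and queued packet, recreate the priority
 * flow, clear the flow counters, and unthrottle all delayed flows.
 */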
SEC("struct_ops/bpf_fq_reset")
void BPF_PROG(bpf_fq_reset, struct Qdisc *sch)
{
	struct unset_throttled_flows_ctx utf_ctx = {
		.unset_all = true,
	};
	struct remove_flows_ctx rf_ctx = {
		.gc_only = false,
		.reset_cnt = 0,
		.reset_max = NUM_QUEUE,
	};
	struct fq_stashed_flow *sflow;
	u64 hash = 0;

	sch->q.qlen = 0;
	sch->qstats.backlog = 0;

	bpf_for_each_map_elem(&fq_nonprio_flows, fq_remove_flows, &rf_ctx, 0);

	rf_ctx.reset_cnt = 0;
	bpf_for_each_map_elem(&fq_prio_flows, fq_remove_flows, &rf_ctx, 0);
	fq_new_flow(&fq_prio_flows, &sflow, hash);

	bpf_loop(NUM_QUEUE, fq_remove_flows_in_list, NULL, 0);
	q.new_flow_cnt = 0;
	q.old_flow_cnt = 0;

	bpf_loop(NUM_QUEUE, fq_unset_throttled_flows, &utf_ctx, 0);
}

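/* Init: create the priority flow and set default parameters, roughly matching
 * the sch_fq defaults.
 */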
SEC("struct_ops/bpf_fq_init")
int BPF_PROG(bpf_fq_init, struct Qdisc *sch, struct nlattr *opt,
	     struct netlink_ext_ack *extack)
{
	struct net_device *dev = sch->dev_queue->dev;
	u32 psched_mtu = dev->mtu + dev->hard_header_len;
	struct fq_stashed_flow *sflow;
	u64 hash = 0;

	if (fq_new_flow(&fq_prio_flows, &sflow, hash) < 0)
		return -ENOMEM;

	sch->limit = 10000;
	q.initial_quantum = 10 * psched_mtu;
	q.quantum = 2 * psched_mtu;
	q.flow_refill_delay = 40;
	q.flow_plimit = 100;
	q.horizon = 10ULL * NSEC_PER_SEC;
	q.horizon_drop = 1;
	q.orphan_mask = 1024 - 1;
	q.timer_slack = 10 * NSEC_PER_USEC;
	q.time_next_delayed_flow = ~0ULL;
	q.unthrottle_latency_ns = 0ULL;
	q.new_flow_cnt = 0;
	q.old_flow_cnt = 0;

	return 0;
}

SEC("struct_ops")
void BPF_PROG(bpf_fq_destroy, struct Qdisc *sch)
{
}

SEC(".struct_ops")
struct Qdisc_ops fq = {
	.enqueue = (void *)bpf_fq_enqueue,
	.dequeue = (void *)bpf_fq_dequeue,
	.reset = (void *)bpf_fq_reset,
	.init = (void *)bpf_fq_init,
	.destroy = (void *)bpf_fq_destroy,
	.id = "bpf_fq",
};