mirror of
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-08-05 16:54:27 +00:00

This patch provides a setsockopt method that lets applications adjust the maximum number of descs handled in one send syscall. It mitigates the situation where the default value (32) is too small and leads to the send syscall being triggered more frequently. Considering the variety/complexity of applications, there is no single ideal value that fits all cases, so 32 is kept as the default value, as before. The patch does the following things: - Add the XDP_MAX_TX_SKB_BUDGET socket option. - Set max_tx_budget to 32 by default in the initialization phase, as a per-socket granular control. - Set the range of max_tx_budget to [32, xs->tx->nentries]. The idea behind this comes from real workloads in production. We use a user-level stack with xsk support to accelerate sending packets and minimize the number of syscalls triggered. When the packets are aggregated, it's not hard to hit the upper bound (namely, 32). The moment the user-space stack receives the -EAGAIN error number returned from sendto(), it loops and tries again until all the expected descs from the tx ring are sent out to the driver. Enlarging the XDP_MAX_TX_SKB_BUDGET value results in fewer sendto() calls and higher throughput/PPS. Here is what I did in production, along with some numbers: For one application I saw lately, I suggested using 128 as max_tx_budget because I saw two limitations when no default configuration was changed: 1) XDP_MAX_TX_SKB_BUDGET, 2) the socket sndbuf, which is 212992 as decided by net.core.wmem_default. As to XDP_MAX_TX_SKB_BUDGET, I counted how many descs are transmitted to the driver per sendto() call, based on patch [1], and then calculated the probability of hitting the upper bound. Finally I chose 128 as a suitable value because 1) it covers most of the cases, 2) a higher number would not bring evident results. 
After tuning the parameters, a stable improvement of around 4% in both PPS and throughput, along with lower resource consumption, was observed with strace -c -p xxx: 1) %time decreased by 7.8%, 2) the error counter decreased from 18367 to 572. [1]: https://lore.kernel.org/all/20250619093641.70700-1-kerneljasonxing@gmail.com/ Signed-off-by: Jason Xing <kernelxing@tencent.com> Acked-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com> Link: https://patch.msgid.link/20250704160138.48677-1-kerneljasonxing@gmail.com Signed-off-by: Paolo Abeni <pabeni@redhat.com>
239 lines
6.7 KiB
C
239 lines
6.7 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
/* AF_XDP internal functions
|
|
* Copyright(c) 2018 Intel Corporation.
|
|
*/
|
|
|
|
#ifndef _LINUX_XDP_SOCK_H
|
|
#define _LINUX_XDP_SOCK_H
|
|
|
|
#include <linux/bpf.h>
|
|
#include <linux/workqueue.h>
|
|
#include <linux/if_xdp.h>
|
|
#include <linux/mutex.h>
|
|
#include <linux/spinlock.h>
|
|
#include <linux/mm.h>
|
|
#include <net/sock.h>
|
|
|
|
/* Flag occupying bit 1; presumably stored in xdp_umem::flags to mark scatter-gather support - TODO confirm. */
#define XDP_UMEM_SG_FLAG (1 << 1)
|
|
|
|
struct net_device;
|
|
struct xsk_queue;
|
|
struct xdp_buff;
|
|
|
|
struct xdp_umem {
|
|
void *addrs;
|
|
u64 size;
|
|
u32 headroom;
|
|
u32 chunk_size;
|
|
u32 chunks;
|
|
u32 npgs;
|
|
struct user_struct *user;
|
|
refcount_t users;
|
|
u8 flags;
|
|
u8 tx_metadata_len;
|
|
bool zc;
|
|
struct page **pgs;
|
|
int id;
|
|
struct list_head xsk_dma_list;
|
|
struct work_struct work;
|
|
};
|
|
|
|
struct xsk_map {
|
|
struct bpf_map map;
|
|
spinlock_t lock; /* Synchronize map updates */
|
|
atomic_t count;
|
|
struct xdp_sock __rcu *xsk_map[];
|
|
};
|
|
|
|
struct xdp_sock {
|
|
/* struct sock must be the first member of struct xdp_sock */
|
|
struct sock sk;
|
|
struct xsk_queue *rx ____cacheline_aligned_in_smp;
|
|
struct net_device *dev;
|
|
struct xdp_umem *umem;
|
|
struct list_head flush_node;
|
|
struct xsk_buff_pool *pool;
|
|
u16 queue_id;
|
|
bool zc;
|
|
bool sg;
|
|
enum {
|
|
XSK_READY = 0,
|
|
XSK_BOUND,
|
|
XSK_UNBOUND,
|
|
} state;
|
|
|
|
struct xsk_queue *tx ____cacheline_aligned_in_smp;
|
|
struct list_head tx_list;
|
|
/* record the number of tx descriptors sent by this xsk and
|
|
* when it exceeds MAX_PER_SOCKET_BUDGET, an opportunity needs
|
|
* to be given to other xsks for sending tx descriptors, thereby
|
|
* preventing other XSKs from being starved.
|
|
*/
|
|
u32 tx_budget_spent;
|
|
|
|
/* Statistics */
|
|
u64 rx_dropped;
|
|
u64 rx_queue_full;
|
|
|
|
/* When __xsk_generic_xmit() must return before it sees the EOP descriptor for the current
|
|
* packet, the partially built skb is saved here so that packet building can resume in next
|
|
* call of __xsk_generic_xmit().
|
|
*/
|
|
struct sk_buff *skb;
|
|
|
|
struct list_head map_list;
|
|
/* Protects map_list */
|
|
spinlock_t map_list_lock;
|
|
u32 max_tx_budget;
|
|
/* Protects multiple processes in the control path */
|
|
struct mutex mutex;
|
|
struct xsk_queue *fq_tmp; /* Only as tmp storage before bind */
|
|
struct xsk_queue *cq_tmp; /* Only as tmp storage before bind */
|
|
};
|
|
|
|
/*
|
|
* AF_XDP TX metadata hooks for network devices.
|
|
* The following hooks can be defined; unless noted otherwise, they are
|
|
* optional and can be filled with a null pointer.
|
|
*
|
|
* void (*tmo_request_timestamp)(void *priv)
|
|
* Called when AF_XDP frame requested egress timestamp.
|
|
*
|
|
* u64 (*tmo_fill_timestamp)(void *priv)
|
|
* Called when AF_XDP frame, that had requested egress timestamp,
|
|
* received a completion. The hook needs to return the actual HW timestamp.
|
|
*
|
|
* void (*tmo_request_checksum)(u16 csum_start, u16 csum_offset, void *priv)
|
|
* Called when AF_XDP frame requested HW checksum offload. csum_start
|
|
* indicates position where checksumming should start.
|
|
* csum_offset indicates position where checksum should be stored.
|
|
*
|
|
* void (*tmo_request_launch_time)(u64 launch_time, void *priv)
|
|
* Called when AF_XDP frame requested launch time HW offload support.
|
|
* launch_time indicates the PTP time at which the device can schedule the
|
|
* packet for transmission.
|
|
*/
|
|
struct xsk_tx_metadata_ops {
|
|
void (*tmo_request_timestamp)(void *priv);
|
|
u64 (*tmo_fill_timestamp)(void *priv);
|
|
void (*tmo_request_checksum)(u16 csum_start, u16 csum_offset, void *priv);
|
|
void (*tmo_request_launch_time)(u64 launch_time, void *priv);
|
|
};
|
|
|
|
#ifdef CONFIG_XDP_SOCKETS
|
|
|
|
/*
 * Defined outside this header. These prototypes are only visible when
 * CONFIG_XDP_SOCKETS is set; the #else branch below provides inline stubs
 * for the same three functions.
 */
int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp);
int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp);
void __xsk_map_flush(struct list_head *flush_list);
|
|
|
|
/**
|
|
* xsk_tx_metadata_to_compl - Save enough relevant metadata information
|
|
* to perform tx completion in the future.
|
|
* @meta: pointer to AF_XDP metadata area
|
|
* @compl: pointer to output struct xsk_tx_metadata_to_compl
|
|
*
|
|
* This function should be called by the networking device when
|
|
* it prepares AF_XDP egress packet. The value of @compl should be stored
|
|
* and passed to xsk_tx_metadata_complete upon TX completion.
|
|
*/
|
|
static inline void xsk_tx_metadata_to_compl(struct xsk_tx_metadata *meta,
|
|
struct xsk_tx_metadata_compl *compl)
|
|
{
|
|
if (!meta)
|
|
return;
|
|
|
|
if (meta->flags & XDP_TXMD_FLAGS_TIMESTAMP)
|
|
compl->tx_timestamp = &meta->completion.tx_timestamp;
|
|
else
|
|
compl->tx_timestamp = NULL;
|
|
}
|
|
|
|
/**
|
|
* xsk_tx_metadata_request - Evaluate AF_XDP TX metadata at submission
|
|
* and call appropriate xsk_tx_metadata_ops operation.
|
|
* @meta: pointer to AF_XDP metadata area
|
|
* @ops: pointer to struct xsk_tx_metadata_ops
|
|
* @priv: pointer to driver-private aread
|
|
*
|
|
* This function should be called by the networking device when
|
|
* it prepares AF_XDP egress packet.
|
|
*/
|
|
static inline void xsk_tx_metadata_request(const struct xsk_tx_metadata *meta,
|
|
const struct xsk_tx_metadata_ops *ops,
|
|
void *priv)
|
|
{
|
|
if (!meta)
|
|
return;
|
|
|
|
if (ops->tmo_request_launch_time)
|
|
if (meta->flags & XDP_TXMD_FLAGS_LAUNCH_TIME)
|
|
ops->tmo_request_launch_time(meta->request.launch_time,
|
|
priv);
|
|
|
|
if (ops->tmo_request_timestamp)
|
|
if (meta->flags & XDP_TXMD_FLAGS_TIMESTAMP)
|
|
ops->tmo_request_timestamp(priv);
|
|
|
|
if (ops->tmo_request_checksum)
|
|
if (meta->flags & XDP_TXMD_FLAGS_CHECKSUM)
|
|
ops->tmo_request_checksum(meta->request.csum_start,
|
|
meta->request.csum_offset, priv);
|
|
}
|
|
|
|
/**
|
|
* xsk_tx_metadata_complete - Evaluate AF_XDP TX metadata at completion
|
|
* and call appropriate xsk_tx_metadata_ops operation.
|
|
* @compl: pointer to completion metadata produced from xsk_tx_metadata_to_compl
|
|
* @ops: pointer to struct xsk_tx_metadata_ops
|
|
* @priv: pointer to driver-private aread
|
|
*
|
|
* This function should be called by the networking device upon
|
|
* AF_XDP egress completion.
|
|
*/
|
|
static inline void xsk_tx_metadata_complete(struct xsk_tx_metadata_compl *compl,
|
|
const struct xsk_tx_metadata_ops *ops,
|
|
void *priv)
|
|
{
|
|
if (!compl)
|
|
return;
|
|
if (!compl->tx_timestamp)
|
|
return;
|
|
|
|
*compl->tx_timestamp = ops->tmo_fill_timestamp(priv);
|
|
}
|
|
|
|
#else
|
|
|
|
/*
 * Stub for builds without CONFIG_XDP_SOCKETS: receiving into an XDP socket
 * is not available. Returns -EOPNOTSUPP, the same error as the
 * __xsk_map_redirect() stub; ENOTSUPP is a kernel-internal code that should
 * not be used here.
 */
static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	return -EOPNOTSUPP;
}
|
|
|
|
/*
 * Stub for builds without CONFIG_XDP_SOCKETS: redirecting into an XDP
 * socket is not available, so always fail with -EOPNOTSUPP.
 */
static inline int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	return -EOPNOTSUPP;
}
|
|
|
|
/* Stub for builds without CONFIG_XDP_SOCKETS: does nothing. */
static inline void __xsk_map_flush(struct list_head *flush_list)
{
	/* Intentionally empty. */
}
|
|
|
|
/*
 * Stub for builds without CONFIG_XDP_SOCKETS: no completion information is
 * recorded and @compl is left untouched.
 */
static inline void xsk_tx_metadata_to_compl(struct xsk_tx_metadata *meta,
					    struct xsk_tx_metadata_compl *compl)
{
	/* Intentionally empty. */
}
|
|
|
|
/*
 * Stub for builds without CONFIG_XDP_SOCKETS: no hook is ever invoked.
 *
 * @meta is const-qualified to match the CONFIG_XDP_SOCKETS=y definition, so
 * callers that hold a const pointer build cleanly in both configurations.
 */
static inline void xsk_tx_metadata_request(const struct xsk_tx_metadata *meta,
					   const struct xsk_tx_metadata_ops *ops,
					   void *priv)
{
	/* Intentionally empty. */
}
|
|
|
|
/*
 * Stub for builds without CONFIG_XDP_SOCKETS: completion metadata is never
 * filled in and no hook is invoked.
 */
static inline void xsk_tx_metadata_complete(struct xsk_tx_metadata_compl *compl,
					    const struct xsk_tx_metadata_ops *ops,
					    void *priv)
{
	/* Intentionally empty. */
}
|
|
|
|
#endif /* CONFIG_XDP_SOCKETS */
|
|
#endif /* _LINUX_XDP_SOCK_H */
|