mirror of
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-08-05 16:54:27 +00:00

In CLOS networks, as link failures occur at various points in the network, ECMP weights of the involved nodes are adjusted to compensate. With high fan-out of the involved nodes, and an overall high number of nodes, a (non-)ECMP weight ratio that we would like to configure does not fit into 8 bits. Instead of, say, 255:254, we might like to configure something like 1000:999. For these deployments, the 8-bit weight may not be enough. To that end, this patch increases the next hop weight from u8 to u16. Increasing the width of an integral type can be tricky, because while the code still compiles, the types may not check out anymore, and numerical errors come up. To prevent this, the conversion was done in two steps. First the type was changed from u8 to a single-member structure, which invalidated all uses of the field. This allowed going through them one by one and auditing each for type correctness. Then the structure was replaced with a vanilla u16 again. This should ensure that no place was missed. The UAPI for configuring nexthop group members is that an attribute NHA_GROUP carries an array of struct nexthop_grp entries: struct nexthop_grp { __u32 id; /* nexthop id - must exist */ __u8 weight; /* weight of this nexthop */ __u8 resvd1; __u16 resvd2; }; The field resvd1 is currently validated and required to be zero. We can lift this requirement and carry high-order bits of the weight in the reserved field: struct nexthop_grp { __u32 id; /* nexthop id - must exist */ __u8 weight; /* weight of this nexthop */ __u8 weight_high; __u16 resvd2; }; Keeping the fields split this way was chosen in case an existing userspace makes assumptions about the width of the weight field, and to sidestep any endianness issues. The weight field is currently encoded as the weight value minus one, because a weight of 0 is invalid. This same trick is impossible for the new weight_high field, because zero must mean actual zero.
With this in place: - Old userspace is guaranteed to carry weight_high of 0, therefore configuring 8-bit weights as appropriate. When dumping nexthops with 16-bit weight, it would only show the lower 8 bits. But configuring such nexthops implies existence of userspace aware of the extension in the first place. - New userspace talking to an old kernel will work as long as it only attempts to configure 8-bit weights, where the high-order bits are zero. An old kernel will bounce attempts at configuring >8-bit weights. Renaming reserved fields as they are allocated for some purpose is commonly done in Linux. Whoever touches a reserved field is doing so at their own risk. nexthop_grp::resvd1 in particular is currently used by at least strace; however, it carries its own copy of UAPI headers, and the conversion should be trivial. A helper is provided for decoding the weight out of the two fields. Forcing a conversion seems preferable to bending over backwards and introducing anonymous unions or whatever. Signed-off-by: Petr Machata <petrm@nvidia.com> Reviewed-by: Ido Schimmel <idosch@nvidia.com> Reviewed-by: David Ahern <dsahern@kernel.org> Reviewed-by: Przemek Kitszel <przemyslaw.kitszel@intel.com> Link: https://patch.msgid.link/483e2fcf4beb0d9135d62e7d27b46fa2685479d4.1723036486.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
157 lines
4 KiB
C
157 lines
4 KiB
C
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
#ifndef _UAPI_LINUX_NEXTHOP_H
#define _UAPI_LINUX_NEXTHOP_H

#include <linux/types.h>

/*
 * struct nhmsg - fixed-layout message header: four single-byte fields
 * followed by one flags word.
 *
 * NOTE(review): presumably the header that precedes the NHA_* attributes in
 * nexthop netlink messages; confirm against the kernel's nexthop handlers.
 */
struct nhmsg {
	unsigned char	nh_family;
	unsigned char	nh_scope;	/* return only */
	unsigned char	nh_protocol;	/* Routing protocol that installed nh */
	unsigned char	resvd;
	unsigned int	nh_flags;	/* RTNH_F flags */
};

/*
 * entry in a nexthop group
 *
 * The weight of the entry is stored minus one (a weight of 0 is invalid) as
 * a 16-bit quantity split across two single-byte fields: @weight carries
 * the low-order byte and @weight_high the high-order byte. Keeping the two
 * bytes separate makes the encoding independent of host endianness.
 */
struct nexthop_grp {
	__u32	id;		/* nexthop id - must exist */
	__u8	weight;		/* weight of this nexthop */
	__u8	weight_high;	/* high order bits of weight */
	__u16	resvd2;
};

static inline __u16 nexthop_grp_weight(const struct nexthop_grp *entry)
|
|
{
|
|
return ((entry->weight_high << 8) | entry->weight) + 1;
|
|
}
|
|
|
|
/*
 * Nexthop group types. Values are assigned implicitly starting at 0, so new
 * types have to be added immediately before __NEXTHOP_GRP_TYPE_MAX to keep
 * the existing values unchanged.
 */
enum {
	NEXTHOP_GRP_TYPE_MPATH,  /* hash-threshold nexthop group
				  * default type if not specified
				  */
	NEXTHOP_GRP_TYPE_RES,    /* resilient nexthop group */
	__NEXTHOP_GRP_TYPE_MAX,
};

/* Highest defined NEXTHOP_GRP_TYPE_* value. */
#define NEXTHOP_GRP_TYPE_MAX (__NEXTHOP_GRP_TYPE_MAX - 1)

/*
 * Operation flag bits (NHA_OP_FLAG_*).
 *
 * The bits are written as explicit shifts instead of BIT(): BIT() is a
 * kernel-internal helper that is not defined for userspace consumers of
 * this header. (1UL << n) has the same value and type as BIT(n).
 */
#define NHA_OP_FLAG_DUMP_STATS		(1UL << 0)
#define NHA_OP_FLAG_DUMP_HW_STATS	(1UL << 1)

/* Response OP_FLAGS. */
#define NHA_OP_FLAG_RESP_GRP_RESVD_0	(1UL << 31)	/* Dump clears resvd fields. */

/*
 * Nexthop attribute types (NHA_*). Values are assigned implicitly starting
 * at 0, so new attributes have to be added immediately before __NHA_MAX to
 * keep the existing values unchanged.
 */
enum {
	NHA_UNSPEC,
	NHA_ID,		/* u32; id for nexthop. id == 0 means auto-assign */

	NHA_GROUP,	/* array of nexthop_grp */
	NHA_GROUP_TYPE,	/* u16 one of NEXTHOP_GRP_TYPE */
	/* if NHA_GROUP attribute is added, no other attributes can be set */

	NHA_BLACKHOLE,	/* flag; nexthop used to blackhole packets */
	/* if NHA_BLACKHOLE is added, OIF, GATEWAY, ENCAP can not be set */

	NHA_OIF,	/* u32; nexthop device */
	NHA_GATEWAY,	/* be32 (IPv4) or in6_addr (IPv6) gw address */
	NHA_ENCAP_TYPE, /* u16; lwt encap type */
	NHA_ENCAP,	/* lwt encap data */

	/* NHA_OIF can be appended to dump request to return only
	 * nexthops using given device
	 */
	NHA_GROUPS,	/* flag; only return nexthop groups in dump */
	NHA_MASTER,	/* u32; only return nexthops with given master dev */

	NHA_FDB,	/* flag; nexthop belongs to a bridge fdb */
	/* if NHA_FDB is added, OIF, BLACKHOLE, ENCAP cannot be set */

	/* nested; resilient nexthop group attributes */
	NHA_RES_GROUP,
	/* nested; nexthop bucket attributes */
	NHA_RES_BUCKET,

	/* u32; operation-specific flags */
	NHA_OP_FLAGS,

	/* nested; nexthop group stats */
	NHA_GROUP_STATS,

	/* u32; nexthop hardware stats enable */
	NHA_HW_STATS_ENABLE,

	/* u32; read-only; whether any driver collects HW stats */
	NHA_HW_STATS_USED,

	__NHA_MAX,
};

/* Highest defined NHA_* attribute type. */
#define NHA_MAX (__NHA_MAX - 1)

/*
 * Attribute types nested inside NHA_RES_GROUP. Values are assigned
 * implicitly starting at 0; NHA_RES_GROUP_PAD deliberately shares the value
 * of NHA_RES_GROUP_UNSPEC.
 */
enum {
	NHA_RES_GROUP_UNSPEC,
	/* Pad attribute for 64-bit alignment. */
	NHA_RES_GROUP_PAD = NHA_RES_GROUP_UNSPEC,

	/* u16; number of nexthop buckets in a resilient nexthop group */
	NHA_RES_GROUP_BUCKETS,
	/* clock_t as u32; nexthop bucket idle timer (per-group) */
	NHA_RES_GROUP_IDLE_TIMER,
	/* clock_t as u32; nexthop unbalanced timer */
	NHA_RES_GROUP_UNBALANCED_TIMER,
	/* clock_t as u64; nexthop unbalanced time */
	NHA_RES_GROUP_UNBALANCED_TIME,

	__NHA_RES_GROUP_MAX,
};

/* Highest defined NHA_RES_GROUP_* attribute type. */
#define NHA_RES_GROUP_MAX	(__NHA_RES_GROUP_MAX - 1)

/*
 * Attribute types nested inside NHA_RES_BUCKET. Values are assigned
 * implicitly starting at 0; NHA_RES_BUCKET_PAD deliberately shares the
 * value of NHA_RES_BUCKET_UNSPEC.
 */
enum {
	NHA_RES_BUCKET_UNSPEC,
	/* Pad attribute for 64-bit alignment. */
	NHA_RES_BUCKET_PAD = NHA_RES_BUCKET_UNSPEC,

	/* u16; nexthop bucket index */
	NHA_RES_BUCKET_INDEX,
	/* clock_t as u64; nexthop bucket idle time */
	NHA_RES_BUCKET_IDLE_TIME,
	/* u32; nexthop id assigned to the nexthop bucket */
	NHA_RES_BUCKET_NH_ID,

	__NHA_RES_BUCKET_MAX,
};

/* Highest defined NHA_RES_BUCKET_* attribute type. */
#define NHA_RES_BUCKET_MAX	(__NHA_RES_BUCKET_MAX - 1)

/*
 * Attribute types nested inside NHA_GROUP_STATS. Values are assigned
 * implicitly starting at 0.
 */
enum {
	NHA_GROUP_STATS_UNSPEC,

	/* nested; nexthop group entry stats */
	NHA_GROUP_STATS_ENTRY,

	__NHA_GROUP_STATS_MAX,
};

/* Highest defined NHA_GROUP_STATS_* attribute type. */
#define NHA_GROUP_STATS_MAX	(__NHA_GROUP_STATS_MAX - 1)

/*
 * Attribute types nested inside NHA_GROUP_STATS_ENTRY. Values are assigned
 * implicitly starting at 0.
 */
enum {
	NHA_GROUP_STATS_ENTRY_UNSPEC,

	/* u32; nexthop id of the nexthop group entry */
	NHA_GROUP_STATS_ENTRY_ID,

	/* uint; number of packets forwarded via the nexthop group entry */
	NHA_GROUP_STATS_ENTRY_PACKETS,

	/* uint; number of packets forwarded via the nexthop group entry in
	 * hardware
	 */
	NHA_GROUP_STATS_ENTRY_PACKETS_HW,

	__NHA_GROUP_STATS_ENTRY_MAX,
};

/* Highest defined NHA_GROUP_STATS_ENTRY_* attribute type. */
#define NHA_GROUP_STATS_ENTRY_MAX	(__NHA_GROUP_STATS_ENTRY_MAX - 1)

#endif