devlink: Extend devlink rate API with traffic classes bandwidth management

Introduce support for specifying relative bandwidth shares between
traffic classes (TC) in the devlink-rate API. This new option allows
users to allocate bandwidth across multiple traffic classes in a
single command.

This feature provides more granular control over traffic management,
especially for scenarios requiring Enhanced Transmission Selection (ETS).

Users can now define a relative bandwidth share for each traffic class.
For example, assigning share values of 20 to TC0 (TCP/UDP) and 80 to TC5
(RoCE) will result in TC0 receiving 20% and TC5 receiving 80% of the
total bandwidth. The actual percentage each class receives depends on
the ratio of its share value to the sum of all shares.

Example:
DEV=pci/0000:08:00.0

$ devlink port function rate add $DEV/vfs_group tx_share 10Gbit \
  tx_max 50Gbit tc-bw 0:20 1:0 2:0 3:0 4:0 5:80 6:0 7:0

$ devlink port function rate set $DEV/vfs_group \
  tc-bw 0:20 1:0 2:0 3:0 4:0 5:20 6:60 7:0

Example usage with ynl:

./tools/net/ynl/cli.py --spec Documentation/netlink/specs/devlink.yaml \
  --do rate-set --json '{
  "bus-name": "pci",
  "dev-name": "0000:08:00.0",
  "port-index": 1,
  "rate-tc-bws": [
    {"rate-tc-index": 0, "rate-tc-bw": 50},
    {"rate-tc-index": 1, "rate-tc-bw": 50},
    {"rate-tc-index": 2, "rate-tc-bw": 0},
    {"rate-tc-index": 3, "rate-tc-bw": 0},
    {"rate-tc-index": 4, "rate-tc-bw": 0},
    {"rate-tc-index": 5, "rate-tc-bw": 0},
    {"rate-tc-index": 6, "rate-tc-bw": 0},
    {"rate-tc-index": 7, "rate-tc-bw": 0}
  ]
}'

./tools/net/ynl/cli.py --spec Documentation/netlink/specs/devlink.yaml \
  --do rate-get --json '{
  "bus-name": "pci",
  "dev-name": "0000:08:00.0",
  "port-index": 1
}'

output for rate-get:
{'bus-name': 'pci',
 'dev-name': '0000:08:00.0',
 'port-index': 1,
 'rate-tc-bws': [{'rate-tc-bw': 50, 'rate-tc-index': 0},
                 {'rate-tc-bw': 50, 'rate-tc-index': 1},
                 {'rate-tc-bw': 0, 'rate-tc-index': 2},
                 {'rate-tc-bw': 0, 'rate-tc-index': 3},
                 {'rate-tc-bw': 0, 'rate-tc-index': 4},
                 {'rate-tc-bw': 0, 'rate-tc-index': 5},
                 {'rate-tc-bw': 0, 'rate-tc-index': 6},
                 {'rate-tc-bw': 0, 'rate-tc-index': 7}],
 'rate-tx-max': 0,
 'rate-tx-priority': 0,
 'rate-tx-share': 0,
 'rate-tx-weight': 0,
 'rate-type': 'leaf'}

Signed-off-by: Carolina Jubran <cjubran@nvidia.com>
Reviewed-by: Cosmin Ratiu <cratiu@nvidia.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Link: https://patch.msgid.link/20250629142138.361537-3-mbloch@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
This commit is contained in:
Carolina Jubran 2025-06-29 17:21:32 +03:00 committed by Jakub Kicinski
parent 42401c4238
commit 566e8f108f
7 changed files with 195 additions and 5 deletions

View file

@ -224,6 +224,10 @@ definitions:
value: 10
-
name: binary
-
name: rate-tc-index-max
type: const
value: 7
attribute-sets:
-
@ -844,7 +848,23 @@ attribute-sets:
-
name: region-direct
type: flag
-
name: rate-tc-bws
type: nest
multi-attr: true
nested-attributes: dl-rate-tc-bws
-
name: rate-tc-index
type: u8
checks:
max: rate-tc-index-max
-
name: rate-tc-bw
type: u32
doc: |
Specifies the bandwidth share assigned to the Traffic Class.
The bandwidth for the traffic class is determined
in proportion to the sum of the shares of all configured classes.
-
name: dl-dev-stats
subset-of: devlink
@ -1249,6 +1269,14 @@ attribute-sets:
-
name: flash
type: flag
-
name: dl-rate-tc-bws
subset-of: devlink
attributes:
-
name: rate-tc-index
-
name: rate-tc-bw
operations:
enum-model: directional
@ -2176,6 +2204,7 @@ operations:
- rate-tx-priority
- rate-tx-weight
- rate-parent-node-name
- rate-tc-bws
-
name: rate-new
@ -2196,6 +2225,7 @@ operations:
- rate-tx-priority
- rate-tx-weight
- rate-parent-node-name
- rate-tc-bws
-
name: rate-del

View file

@ -418,6 +418,14 @@ API allows to configure following rate object's parameters:
to all node children limits. ``tx_max`` is an upper limit for children.
``tx_share`` is a total bandwidth distributed among children.
``tc_bw``
Allows users to set the bandwidth allocation per traffic class on rate
objects. This enables fine-grained QoS configurations by assigning a relative
share value to each traffic class. The bandwidth is distributed in proportion
to the share value for each class, relative to the sum of all shares.
When applied to a non-leaf node, tc_bw determines how bandwidth is shared
among its child elements.
``tx_priority`` and ``tx_weight`` can be used simultaneously. In that case
nodes with the same priority form a WFQ subgroup in the sibling group
and arbitration among them is based on assigned weights.

View file

@ -118,6 +118,8 @@ struct devlink_rate {
u32 tx_priority;
u32 tx_weight;
u32 tc_bw[DEVLINK_RATE_TCS_MAX];
};
struct devlink_port {
@ -1486,6 +1488,9 @@ struct devlink_ops {
u32 tx_priority, struct netlink_ext_ack *extack);
int (*rate_leaf_tx_weight_set)(struct devlink_rate *devlink_rate, void *priv,
u32 tx_weight, struct netlink_ext_ack *extack);
int (*rate_leaf_tc_bw_set)(struct devlink_rate *devlink_rate,
void *priv, u32 *tc_bw,
struct netlink_ext_ack *extack);
int (*rate_node_tx_share_set)(struct devlink_rate *devlink_rate, void *priv,
u64 tx_share, struct netlink_ext_ack *extack);
int (*rate_node_tx_max_set)(struct devlink_rate *devlink_rate, void *priv,
@ -1494,6 +1499,9 @@ struct devlink_ops {
u32 tx_priority, struct netlink_ext_ack *extack);
int (*rate_node_tx_weight_set)(struct devlink_rate *devlink_rate, void *priv,
u32 tx_weight, struct netlink_ext_ack *extack);
int (*rate_node_tc_bw_set)(struct devlink_rate *devlink_rate,
void *priv, u32 *tc_bw,
struct netlink_ext_ack *extack);
int (*rate_node_new)(struct devlink_rate *rate_node, void **priv,
struct netlink_ext_ack *extack);
int (*rate_node_del)(struct devlink_rate *rate_node, void *priv,

View file

@ -221,6 +221,11 @@ enum devlink_port_flavour {
*/
};
/* IEEE 802.1Qaz standard supported values.
 *
 * DEVLINK_RATE_TCS_MAX is the number of traffic classes a rate object keeps
 * a bandwidth share for; DEVLINK_RATE_TC_INDEX_MAX is the highest traffic
 * class index that follows from it (indices start at 0).
 */
#define DEVLINK_RATE_TCS_MAX 8
#define DEVLINK_RATE_TC_INDEX_MAX (DEVLINK_RATE_TCS_MAX - 1)
enum devlink_rate_type {
DEVLINK_RATE_TYPE_LEAF,
DEVLINK_RATE_TYPE_NODE,
@ -629,6 +634,10 @@ enum devlink_attr {
DEVLINK_ATTR_REGION_DIRECT, /* flag */
DEVLINK_ATTR_RATE_TC_BWS, /* nested */
DEVLINK_ATTR_RATE_TC_INDEX, /* u8 */
DEVLINK_ATTR_RATE_TC_BW, /* u32 */
/* Add new attributes above here, update the spec in
* Documentation/netlink/specs/devlink.yaml and re-generate
* net/devlink/netlink_gen.c.

View file

@ -45,6 +45,11 @@ const struct nla_policy devlink_dl_port_function_nl_policy[DEVLINK_PORT_FN_ATTR_
[DEVLINK_PORT_FN_ATTR_CAPS] = NLA_POLICY_BITFIELD32(15),
};
/* Policy for the attributes carried inside one DEVLINK_ATTR_RATE_TC_BWS nest:
 * a u8 traffic class index capped at DEVLINK_RATE_TC_INDEX_MAX and a u32
 * bandwidth share. The table is sized up to DEVLINK_ATTR_RATE_TC_BW only.
 */
const struct nla_policy devlink_dl_rate_tc_bws_nl_policy[DEVLINK_ATTR_RATE_TC_BW + 1] = {
[DEVLINK_ATTR_RATE_TC_INDEX] = NLA_POLICY_MAX(NLA_U8, DEVLINK_RATE_TC_INDEX_MAX),
[DEVLINK_ATTR_RATE_TC_BW] = { .type = NLA_U32, },
};
const struct nla_policy devlink_dl_selftest_id_nl_policy[DEVLINK_ATTR_SELFTEST_ID_FLASH + 1] = {
[DEVLINK_ATTR_SELFTEST_ID_FLASH] = { .type = NLA_FLAG, },
};
@ -523,7 +528,7 @@ static const struct nla_policy devlink_rate_get_dump_nl_policy[DEVLINK_ATTR_DEV_
};
/* DEVLINK_CMD_RATE_SET - do */
static const struct nla_policy devlink_rate_set_nl_policy[DEVLINK_ATTR_RATE_TX_WEIGHT + 1] = {
static const struct nla_policy devlink_rate_set_nl_policy[DEVLINK_ATTR_RATE_TC_BWS + 1] = {
[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
[DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
[DEVLINK_ATTR_RATE_NODE_NAME] = { .type = NLA_NUL_STRING, },
@ -532,10 +537,11 @@ static const struct nla_policy devlink_rate_set_nl_policy[DEVLINK_ATTR_RATE_TX_W
[DEVLINK_ATTR_RATE_TX_PRIORITY] = { .type = NLA_U32, },
[DEVLINK_ATTR_RATE_TX_WEIGHT] = { .type = NLA_U32, },
[DEVLINK_ATTR_RATE_PARENT_NODE_NAME] = { .type = NLA_NUL_STRING, },
[DEVLINK_ATTR_RATE_TC_BWS] = NLA_POLICY_NESTED(devlink_dl_rate_tc_bws_nl_policy),
};
/* DEVLINK_CMD_RATE_NEW - do */
static const struct nla_policy devlink_rate_new_nl_policy[DEVLINK_ATTR_RATE_TX_WEIGHT + 1] = {
static const struct nla_policy devlink_rate_new_nl_policy[DEVLINK_ATTR_RATE_TC_BWS + 1] = {
[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
[DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
[DEVLINK_ATTR_RATE_NODE_NAME] = { .type = NLA_NUL_STRING, },
@ -544,6 +550,7 @@ static const struct nla_policy devlink_rate_new_nl_policy[DEVLINK_ATTR_RATE_TX_W
[DEVLINK_ATTR_RATE_TX_PRIORITY] = { .type = NLA_U32, },
[DEVLINK_ATTR_RATE_TX_WEIGHT] = { .type = NLA_U32, },
[DEVLINK_ATTR_RATE_PARENT_NODE_NAME] = { .type = NLA_NUL_STRING, },
[DEVLINK_ATTR_RATE_TC_BWS] = NLA_POLICY_NESTED(devlink_dl_rate_tc_bws_nl_policy),
};
/* DEVLINK_CMD_RATE_DEL - do */
@ -1191,7 +1198,7 @@ const struct genl_split_ops devlink_nl_ops[74] = {
.doit = devlink_nl_rate_set_doit,
.post_doit = devlink_nl_post_doit,
.policy = devlink_rate_set_nl_policy,
.maxattr = DEVLINK_ATTR_RATE_TX_WEIGHT,
.maxattr = DEVLINK_ATTR_RATE_TC_BWS,
.flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
},
{
@ -1201,7 +1208,7 @@ const struct genl_split_ops devlink_nl_ops[74] = {
.doit = devlink_nl_rate_new_doit,
.post_doit = devlink_nl_post_doit,
.policy = devlink_rate_new_nl_policy,
.maxattr = DEVLINK_ATTR_RATE_TX_WEIGHT,
.maxattr = DEVLINK_ATTR_RATE_TC_BWS,
.flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
},
{

View file

@ -13,6 +13,7 @@
/* Common nested types */
extern const struct nla_policy devlink_dl_port_function_nl_policy[DEVLINK_PORT_FN_ATTR_CAPS + 1];
extern const struct nla_policy devlink_dl_rate_tc_bws_nl_policy[DEVLINK_ATTR_RATE_TC_BW + 1];
extern const struct nla_policy devlink_dl_selftest_id_nl_policy[DEVLINK_ATTR_SELFTEST_ID_FLASH + 1];
/* Ops table for devlink */

View file

@ -80,6 +80,29 @@ devlink_rate_get_from_info(struct devlink *devlink, struct genl_info *info)
return ERR_PTR(-EINVAL);
}
/* Append one DEVLINK_ATTR_RATE_TC_BWS nest per traffic class to @msg. Each
 * nest carries the class index and the matching entry of @tc_bw.
 *
 * Returns 0 on success, or -EMSGSIZE when a nest cannot be started or one of
 * its attributes cannot be added (the nest in progress is cancelled first).
 */
static int devlink_rate_put_tc_bws(struct sk_buff *msg, u32 *tc_bw)
{
	int tc;

	for (tc = 0; tc < DEVLINK_RATE_TCS_MAX; tc++) {
		struct nlattr *nest;

		nest = nla_nest_start(msg, DEVLINK_ATTR_RATE_TC_BWS);
		if (!nest)
			return -EMSGSIZE;

		if (nla_put_u8(msg, DEVLINK_ATTR_RATE_TC_INDEX, tc) ||
		    nla_put_u32(msg, DEVLINK_ATTR_RATE_TC_BW, tc_bw[tc])) {
			/* Drop the partially filled nest before bailing out. */
			nla_nest_cancel(msg, nest);
			return -EMSGSIZE;
		}

		nla_nest_end(msg, nest);
	}

	return 0;
}
static int devlink_nl_rate_fill(struct sk_buff *msg,
struct devlink_rate *devlink_rate,
enum devlink_command cmd, u32 portid, u32 seq,
@ -129,6 +152,9 @@ static int devlink_nl_rate_fill(struct sk_buff *msg,
devlink_rate->parent->name))
goto nla_put_failure;
if (devlink_rate_put_tc_bws(msg, devlink_rate->tc_bw))
goto nla_put_failure;
genlmsg_end(msg, hdr);
return 0;
@ -316,6 +342,87 @@ devlink_nl_rate_parent_node_set(struct devlink_rate *devlink_rate,
return 0;
}
/* Parse a single DEVLINK_ATTR_RATE_TC_BWS nest.
 *
 * @parent_nest: the DEVLINK_ATTR_RATE_TC_BWS attribute to parse
 * @tc_bw: array indexed by traffic class; the parsed share is stored here
 * @bitmap: traffic class indexes seen so far, used to reject duplicates
 * @extack: extended ack for error reporting
 *
 * Both the index and the bandwidth attribute must be present in the nest.
 *
 * Returns 0 on success, the nla_parse_nested() error if validation fails, or
 * -EINVAL when an attribute is missing or the index was already given.
 */
static int devlink_nl_rate_tc_bw_parse(struct nlattr *parent_nest, u32 *tc_bw,
				       unsigned long *bitmap,
				       struct netlink_ext_ack *extack)
{
	/* Bound the parse by the last attribute the nested policy describes.
	 * The policy table has DEVLINK_ATTR_RATE_TC_BW + 1 entries, so a
	 * larger maxtype (e.g. DEVLINK_ATTR_MAX once new attributes are
	 * added) would make validation index past its end.
	 */
	struct nlattr *tb[DEVLINK_ATTR_RATE_TC_BW + 1];
	u8 tc_index;
	int err;

	err = nla_parse_nested(tb, DEVLINK_ATTR_RATE_TC_BW, parent_nest,
			       devlink_dl_rate_tc_bws_nl_policy, extack);
	if (err)
		return err;

	if (!tb[DEVLINK_ATTR_RATE_TC_INDEX]) {
		NL_SET_ERR_ATTR_MISS(extack, parent_nest,
				     DEVLINK_ATTR_RATE_TC_INDEX);
		return -EINVAL;
	}

	/* The policy caps the index at DEVLINK_RATE_TC_INDEX_MAX. */
	tc_index = nla_get_u8(tb[DEVLINK_ATTR_RATE_TC_INDEX]);

	if (!tb[DEVLINK_ATTR_RATE_TC_BW]) {
		NL_SET_ERR_ATTR_MISS(extack, parent_nest,
				     DEVLINK_ATTR_RATE_TC_BW);
		return -EINVAL;
	}

	if (test_and_set_bit(tc_index, bitmap)) {
		NL_SET_ERR_MSG_FMT(extack,
				   "Duplicate traffic class index specified (%u)",
				   tc_index);
		return -EINVAL;
	}

	tc_bw[tc_index] = nla_get_u32(tb[DEVLINK_ATTR_RATE_TC_BW]);

	return 0;
}
static int devlink_nl_rate_tc_bw_set(struct devlink_rate *devlink_rate,
struct genl_info *info)
{
DECLARE_BITMAP(bitmap, DEVLINK_RATE_TCS_MAX) = {};
struct devlink *devlink = devlink_rate->devlink;
const struct devlink_ops *ops = devlink->ops;
u32 tc_bw[DEVLINK_RATE_TCS_MAX] = {};
int rem, err = -EOPNOTSUPP, i;
struct nlattr *attr;
nlmsg_for_each_attr_type(attr, DEVLINK_ATTR_RATE_TC_BWS, info->nlhdr,
GENL_HDRLEN, rem) {
err = devlink_nl_rate_tc_bw_parse(attr, tc_bw, bitmap,
info->extack);
if (err)
return err;
}
for (i = 0; i < DEVLINK_RATE_TCS_MAX; i++) {
if (!test_bit(i, bitmap)) {
NL_SET_ERR_MSG_FMT(info->extack,
"Bandwidth values must be specified for all %u traffic classes",
DEVLINK_RATE_TCS_MAX);
return -EINVAL;
}
}
if (devlink_rate_is_leaf(devlink_rate))
err = ops->rate_leaf_tc_bw_set(devlink_rate, devlink_rate->priv,
tc_bw, info->extack);
else if (devlink_rate_is_node(devlink_rate))
err = ops->rate_node_tc_bw_set(devlink_rate, devlink_rate->priv,
tc_bw, info->extack);
if (err)
return err;
memcpy(devlink_rate->tc_bw, tc_bw, sizeof(tc_bw));
return 0;
}
static int devlink_nl_rate_set(struct devlink_rate *devlink_rate,
const struct devlink_ops *ops,
struct genl_info *info)
@ -388,6 +495,12 @@ static int devlink_nl_rate_set(struct devlink_rate *devlink_rate,
return err;
}
if (attrs[DEVLINK_ATTR_RATE_TC_BWS]) {
err = devlink_nl_rate_tc_bw_set(devlink_rate, info);
if (err)
return err;
}
return 0;
}
@ -423,6 +536,13 @@ static bool devlink_rate_set_ops_supported(const struct devlink_ops *ops,
"TX weight set isn't supported for the leafs");
return false;
}
if (attrs[DEVLINK_ATTR_RATE_TC_BWS] &&
!ops->rate_leaf_tc_bw_set) {
NL_SET_ERR_MSG_ATTR(info->extack,
attrs[DEVLINK_ATTR_RATE_TC_BWS],
"TC bandwidth set isn't supported for the leafs");
return false;
}
} else if (type == DEVLINK_RATE_TYPE_NODE) {
if (attrs[DEVLINK_ATTR_RATE_TX_SHARE] && !ops->rate_node_tx_share_set) {
NL_SET_ERR_MSG(info->extack, "TX share set isn't supported for the nodes");
@ -449,6 +569,13 @@ static bool devlink_rate_set_ops_supported(const struct devlink_ops *ops,
"TX weight set isn't supported for the nodes");
return false;
}
if (attrs[DEVLINK_ATTR_RATE_TC_BWS] &&
!ops->rate_node_tc_bw_set) {
NL_SET_ERR_MSG_ATTR(info->extack,
attrs[DEVLINK_ATTR_RATE_TC_BWS],
"TC bandwidth set isn't supported for the nodes");
return false;
}
} else {
WARN(1, "Unknown type of rate object");
return false;