From b05d42eefac737ce3cd80114d3579111023941b8 Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Thu, 19 Jun 2025 12:48:51 +0300 Subject: [PATCH 1/3] xfrm: hold device only for the asynchronous decryption The dev_hold() on skb->dev during packet reception was originally added to prevent the device from being released prematurely during asynchronous decryption operations. As current hardware can offload decryption, this asynchronous path is not always utilized. This often results in a pattern of dev_hold() immediately followed by dev_put() for each packet, creating unnecessary reference counting overhead detrimental to performance. This patch optimizes this by skipping the dev_hold() and subsequent dev_put() when asynchronous decryption is not being performed. Signed-off-by: Jianbo Liu Reviewed-by: Cosmin Ratiu Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_input.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 7e6a71b9d6a3..c9ddef869aa5 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -503,6 +503,7 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) /* An encap_type of -1 indicates async resumption. */ if (encap_type == -1) { async = 1; + dev_put(skb->dev); seq = XFRM_SKB_CB(skb)->seq.input.low; goto resume; } @@ -649,18 +650,18 @@ lock: XFRM_SKB_CB(skb)->seq.input.low = seq; XFRM_SKB_CB(skb)->seq.input.hi = seq_hi; - dev_hold(skb->dev); - - if (crypto_done) + if (crypto_done) { nexthdr = x->type_offload->input_tail(x, skb); - else + } else { + dev_hold(skb->dev); + nexthdr = x->type->input(x, skb); + if (nexthdr == -EINPROGRESS) + return 0; - if (nexthdr == -EINPROGRESS) - return 0; + dev_put(skb->dev); + } resume: - dev_put(skb->dev); - spin_lock(&x->lock); if (nexthdr < 0) { if (nexthdr == -EBADMSG) { From 94f39804d891cffe4ce17737d295f3b195bc7299 Mon Sep 17 00:00:00 2001 From: Aakash Kumar S Date: Mon, 30 Jun 2025 18:08:56 +0530 Subject: [PATCH 2/3] xfrm: Duplicate SPI Handling The issue originates when Strongswan initiates an XFRM_MSG_ALLOCSPI Netlink message, which triggers the kernel function xfrm_alloc_spi(). This function is expected to ensure uniqueness of the Security Parameter Index (SPI) for inbound Security Associations (SAs). However, it can return success even when the requested SPI is already in use, leading to duplicate SPIs assigned to multiple inbound SAs, differentiated only by their destination addresses. This behavior causes inconsistencies during SPI lookups for inbound packets. Since the lookup may return an arbitrary SA among those with the same SPI, packet processing can fail, resulting in packet drops. According to RFC 4301 section 4.4.2 , for inbound processing a unicast SA is uniquely identified by the SPI and optionally protocol. Reproducing the Issue Reliably: To consistently reproduce the problem, restrict the available SPI range in charon.conf : spi_min = 0x10000000 spi_max = 0x10000002 This limits the system to only 2 usable SPI values. Next, create more than 2 Child SA. each using unique pair of src/dst address. As soon as the 3rd Child SA is initiated, it will be assigned a duplicate SPI, since the SPI pool is already exhausted. With a narrow SPI range, the issue is consistently reproducible. With a broader/default range, it becomes rare and unpredictable. Current implementation: xfrm_spi_hash() lookup function computes hash using daddr, proto, and family. So if two SAs have the same SPI but different destination addresses, then they will: a. Hash into different buckets b. Be stored in different linked lists (byspi + h) c. Not be seen in the same hlist_for_each_entry_rcu() iteration. As a result, the lookup will result in NULL and kernel allows that Duplicate SPI Proposed Change: xfrm_state_lookup_spi_proto() does a truly global search - across all states, regardless of hash bucket and matches SPI and proto. Signed-off-by: Aakash Kumar S Acked-by: Herbert Xu Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 74 +++++++++++++++++++++++++------------------ 1 file changed, 44 insertions(+), 30 deletions(-) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 77cc418ad69e..b3950234b150 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1711,6 +1711,26 @@ struct xfrm_state *xfrm_state_lookup_byspi(struct net *net, __be32 spi, } EXPORT_SYMBOL(xfrm_state_lookup_byspi); +static struct xfrm_state *xfrm_state_lookup_spi_proto(struct net *net, __be32 spi, u8 proto) +{ + struct xfrm_state *x; + unsigned int i; + + rcu_read_lock(); + for (i = 0; i <= net->xfrm.state_hmask; i++) { + hlist_for_each_entry_rcu(x, &net->xfrm.state_byspi[i], byspi) { + if (x->id.spi == spi && x->id.proto == proto) { + if (!xfrm_state_hold_rcu(x)) + continue; + rcu_read_unlock(); + return x; + } + } + } + rcu_read_unlock(); + return NULL; +} + static void __xfrm_state_insert(struct xfrm_state *x) { struct net *net = xs_net(x); @@ -2555,10 +2575,8 @@ int xfrm_alloc_spi(struct xfrm_state *x, u32 low, u32 high, unsigned int h; struct xfrm_state *x0; int err = -ENOENT; - __be32 minspi = htonl(low); - __be32 maxspi = htonl(high); + u32 range = high - low + 1; __be32 newspi = 0; - u32 mark = x->mark.v & x->mark.m; spin_lock_bh(&x->lock); if (x->km.state == XFRM_STATE_DEAD) { @@ -2572,39 +2590,35 @@ int xfrm_alloc_spi(struct xfrm_state *x, u32 low, u32 high, err = -ENOENT; - if (minspi == maxspi) { - x0 = xfrm_state_lookup(net, mark, &x->id.daddr, minspi, x->id.proto, x->props.family); - if (x0) { - NL_SET_ERR_MSG(extack, "Requested SPI is already in use"); - xfrm_state_put(x0); + for (h = 0; h < range; h++) { + u32 spi = (low == high) ? low : get_random_u32_inclusive(low, high); + newspi = htonl(spi); + + spin_lock_bh(&net->xfrm.xfrm_state_lock); + x0 = xfrm_state_lookup_spi_proto(net, newspi, x->id.proto); + if (!x0) { + x->id.spi = newspi; + h = xfrm_spi_hash(net, &x->id.daddr, newspi, x->id.proto, x->props.family); + XFRM_STATE_INSERT(byspi, &x->byspi, net->xfrm.state_byspi + h, x->xso.type); + spin_unlock_bh(&net->xfrm.xfrm_state_lock); + err = 0; goto unlock; } - newspi = minspi; - } else { - u32 spi = 0; - for (h = 0; h < high-low+1; h++) { - spi = get_random_u32_inclusive(low, high); - x0 = xfrm_state_lookup(net, mark, &x->id.daddr, htonl(spi), x->id.proto, x->props.family); - if (x0 == NULL) { - newspi = htonl(spi); - break; - } - xfrm_state_put(x0); - } - } - if (newspi) { - spin_lock_bh(&net->xfrm.xfrm_state_lock); - x->id.spi = newspi; - h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, x->props.family); - XFRM_STATE_INSERT(byspi, &x->byspi, net->xfrm.state_byspi + h, - x->xso.type); + xfrm_state_put(x0); spin_unlock_bh(&net->xfrm.xfrm_state_lock); - err = 0; - } else { - NL_SET_ERR_MSG(extack, "No SPI available in the requested range"); + if (signal_pending(current)) { + err = -ERESTARTSYS; + goto unlock; + } + + if (low == high) + break; } + if (err) + NL_SET_ERR_MSG(extack, "No SPI available in the requested range"); + unlock: spin_unlock_bh(&x->lock); From 95cfe23285a6de17f11715378c93e6aee6d0ca75 Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Thu, 3 Jul 2025 11:45:27 +0300 Subject: [PATCH 3/3] xfrm: Skip redundant statistics update for crypto offload In the crypto offload path, every packet is still processed by the software stack. The state's statistics required for the expiration check are being updated in software. However, the code also calls xfrm_dev_state_update_stats(), which triggers a query to the hardware device to fetch statistics. This hardware query is redundant and introduces unnecessary performance overhead. Skip this call when it's crypto offload (not packet offload) to avoid the unnecessary hardware access, thereby improving performance. Signed-off-by: Jianbo Liu Reviewed-by: Leon Romanovsky Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index b3950234b150..f0f66405b39d 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -2282,7 +2282,12 @@ EXPORT_SYMBOL(xfrm_state_update); int xfrm_state_check_expire(struct xfrm_state *x) { - xfrm_dev_state_update_stats(x); + /* All counters which are needed to decide if state is expired + * are handled by SW for non-packet offload modes. Simply skip + * the following update and save extra boilerplate in drivers. + */ + if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET) + xfrm_dev_state_update_stats(x); if (!READ_ONCE(x->curlft.use_time)) WRITE_ONCE(x->curlft.use_time, ktime_get_real_seconds());