linux/drivers/s390/net/ism_drv.c

881 lines
19 KiB
C
Raw Permalink Normal View History

// SPDX-License-Identifier: GPL-2.0
/*
* ISM driver for s390.
*
* Copyright IBM Corp. 2018
*/
#define KMSG_COMPONENT "ism"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
#include <linux/export.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/interrupt.h>
#include <linux/device.h>
#include <linux/err.h>
#include <linux/ctype.h>
#include <linux/processor.h>
#include "ism.h"
MODULE_DESCRIPTION("ISM driver for s390");
MODULE_LICENSE("GPL");
/* Resource name passed to pci_request_mem_regions() in ism_probe(). */
#define DRV_NAME "ism"
/* PCI IDs this driver binds to; the all-zero entry terminates the table. */
static const struct pci_device_id ism_device_table[] = {
{ PCI_VDEVICE(IBM, PCI_DEVICE_ID_IBM_ISM), 0 },
{ 0, }
};
MODULE_DEVICE_TABLE(pci, ism_device_table);
/* Handle passed to debug_event()/debug_text_event() throughout this file. */
static debug_info_t *ism_debug_info;
/* Value stored in sba_client_arr[] for DMB slots that have no owning client. */
#define NO_CLIENT 0xff /* must be >= MAX_CLIENTS */
static struct ism_client *clients[MAX_CLIENTS]; /* use an array rather than */
/* a list for fast mapping */
/*
 * Exclusive upper bound used when iterating over client slots: it grows when
 * the slot equal to it is handed out and shrinks when the top slot is
 * released.
 */
static u8 max_client;
s390/ism: Fix and simplify add()/remove() callback handling Previously the clients_lock was protecting the clients array against concurrent addition/removal of clients but was also accessed from IRQ context. This meant that it had to be a spinlock and that the add() and remove() callbacks in which clients need to do allocation and take mutexes can't be called under the clients_lock. To work around this these callbacks were moved to workqueues. This not only introduced significant complexity but is also subtly broken in at least one way. In ism_dev_init() and ism_dev_exit() clients[i]->tgt_ism is used to communicate the added/removed ISM device to the work function. While write access to client[i]->tgt_ism is protected by the clients_lock and the code waits that there is no pending add/remove work before and after setting clients[i]->tgt_ism this is not enough. The problem is that the wait happens based on per ISM device counters. Thus a concurrent ism_dev_init()/ism_dev_exit() for a different ISM device may overwrite a clients[i]->tgt_ism between unlocking the clients_lock and the subsequent wait for the work to finish. Thankfully with the clients_lock no longer held in IRQ context it can be turned into a mutex which can be held during the calls to add()/remove() completely removing the need for the workqueues and the associated broken housekeeping including the per ISM device counters and the clients[i]->tgt_ism. Fixes: 89e7d2ba61b7 ("net/ism: Add new API for client registration") Signed-off-by: Niklas Schnelle <schnelle@linux.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2023-07-07 12:56:21 +02:00
/* Serializes updates of clients[] and max_client; never taken in IRQ context. */
static DEFINE_MUTEX(clients_lock);
/* Set in ism_dev_init(): true when adding ISM_RESERVED_VLANID succeeded. */
static bool ism_v2_capable;
/* List of ISM devices together with the mutex guarding it. */
struct ism_dev_list {
struct list_head list;
struct mutex mutex; /* protects ism device list */
};
/* The single, statically initialized list of ISM devices added so far. */
static struct ism_dev_list ism_dev_list = {
.list = LIST_HEAD_INIT(ism_dev_list.list),
.mutex = __MUTEX_INITIALIZER(ism_dev_list.mutex),
};
2023-07-07 12:56:20 +02:00
/*
 * Start forwarding interrupts and events of @ism to @client by entering the
 * client into the device's subscriber table. The update is done under
 * ism->lock, the same lock the interrupt handler holds while it reads
 * ism->subs[].
 */
static void ism_setup_forwarding(struct ism_client *client, struct ism_dev *ism)
{
	unsigned long irq_flags;

	spin_lock_irqsave(&ism->lock, irq_flags);
	ism->subs[client->id] = client;
	spin_unlock_irqrestore(&ism->lock, irq_flags);
}
int ism_register_client(struct ism_client *client)
{
struct ism_dev *ism;
int i, rc = -ENOSPC;
mutex_lock(&ism_dev_list.mutex);
s390/ism: Fix and simplify add()/remove() callback handling Previously the clients_lock was protecting the clients array against concurrent addition/removal of clients but was also accessed from IRQ context. This meant that it had to be a spinlock and that the add() and remove() callbacks in which clients need to do allocation and take mutexes can't be called under the clients_lock. To work around this these callbacks were moved to workqueues. This not only introduced significant complexity but is also subtly broken in at least one way. In ism_dev_init() and ism_dev_exit() clients[i]->tgt_ism is used to communicate the added/removed ISM device to the work function. While write access to client[i]->tgt_ism is protected by the clients_lock and the code waits that there is no pending add/remove work before and after setting clients[i]->tgt_ism this is not enough. The problem is that the wait happens based on per ISM device counters. Thus a concurrent ism_dev_init()/ism_dev_exit() for a different ISM device may overwrite a clients[i]->tgt_ism between unlocking the clients_lock and the subsequent wait for the work to finnish. Thankfully with the clients_lock no longer held in IRQ context it can be turned into a mutex which can be held during the calls to add()/remove() completely removing the need for the workqueues and the associated broken housekeeping including the per ISM device counters and the clients[i]->tgt_ism. Fixes: 89e7d2ba61b7 ("net/ism: Add new API for client registration") Signed-off-by: Niklas Schnelle <schnelle@linux.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2023-07-07 12:56:21 +02:00
mutex_lock(&clients_lock);
for (i = 0; i < MAX_CLIENTS; ++i) {
if (!clients[i]) {
clients[i] = client;
client->id = i;
if (i == max_client)
max_client++;
rc = 0;
break;
}
}
s390/ism: Fix and simplify add()/remove() callback handling Previously the clients_lock was protecting the clients array against concurrent addition/removal of clients but was also accessed from IRQ context. This meant that it had to be a spinlock and that the add() and remove() callbacks in which clients need to do allocation and take mutexes can't be called under the clients_lock. To work around this these callbacks were moved to workqueues. This not only introduced significant complexity but is also subtly broken in at least one way. In ism_dev_init() and ism_dev_exit() clients[i]->tgt_ism is used to communicate the added/removed ISM device to the work function. While write access to client[i]->tgt_ism is protected by the clients_lock and the code waits that there is no pending add/remove work before and after setting clients[i]->tgt_ism this is not enough. The problem is that the wait happens based on per ISM device counters. Thus a concurrent ism_dev_init()/ism_dev_exit() for a different ISM device may overwrite a clients[i]->tgt_ism between unlocking the clients_lock and the subsequent wait for the work to finnish. Thankfully with the clients_lock no longer held in IRQ context it can be turned into a mutex which can be held during the calls to add()/remove() completely removing the need for the workqueues and the associated broken housekeeping including the per ISM device counters and the clients[i]->tgt_ism. Fixes: 89e7d2ba61b7 ("net/ism: Add new API for client registration") Signed-off-by: Niklas Schnelle <schnelle@linux.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2023-07-07 12:56:21 +02:00
mutex_unlock(&clients_lock);
if (i < MAX_CLIENTS) {
/* initialize with all devices that we got so far */
list_for_each_entry(ism, &ism_dev_list.list, list) {
ism->priv[i] = NULL;
client->add(ism);
2023-07-07 12:56:20 +02:00
ism_setup_forwarding(client, ism);
}
}
mutex_unlock(&ism_dev_list.mutex);
return rc;
}
EXPORT_SYMBOL_GPL(ism_register_client);
/*
 * Unregister @client from the ISM driver.
 *
 * For every known device, forwarding to the client is stopped and the
 * device's DMB ownership table is searched for entries still owned by the
 * client. If one is found a warning is emitted and the function bails out
 * without releasing the client's slot; forwarding stays disabled on the
 * devices visited up to that point.
 *
 * Return: 0 on success, -EBUSY if the client still has registered DMBs.
 */
int ism_unregister_client(struct ism_client *client)
{
	struct ism_dev *dev;
	unsigned long irq_flags;
	int idx, rc = 0;

	mutex_lock(&ism_dev_list.mutex);
	list_for_each_entry(dev, &ism_dev_list.list, list) {
		spin_lock_irqsave(&dev->lock, irq_flags);
		/* Stop forwarding IRQs and events */
		dev->subs[client->id] = NULL;
		for (idx = 0; idx < ISM_NR_DMBS; ++idx) {
			if (dev->sba_client_arr[idx] != client->id)
				continue;

			WARN(1, "%s: attempt to unregister '%s' with registered dmb(s)\n",
			     __func__, client->name);
			rc = -EBUSY;
			break;
		}
		spin_unlock_irqrestore(&dev->lock, irq_flags);
		if (rc)
			break;
	}
	mutex_unlock(&ism_dev_list.mutex);

	if (rc)
		return rc;

	mutex_lock(&clients_lock);
	clients[client->id] = NULL;
	if (client->id + 1 == max_client)
		max_client--;
	mutex_unlock(&clients_lock);

	return 0;
}
EXPORT_SYMBOL_GPL(ism_unregister_client);
static int ism_cmd(struct ism_dev *ism, void *cmd)
{
struct ism_req_hdr *req = cmd;
struct ism_resp_hdr *resp = cmd;
s390/ism: fix concurrency management in ism_cmd() The s390x ISM device data sheet clearly states that only one request-response sequence is allowable per ISM function at any point in time. Unfortunately as of today the s390/ism driver in Linux does not honor that requirement. This patch aims to rectify that. This problem was discovered based on Aliaksei's bug report which states that for certain workloads the ISM functions end up entering error state (with PEC 2 as seen from the logs) after a while and as a consequence connections handled by the respective function break, and for future connection requests the ISM device is not considered -- given it is in a dysfunctional state. During further debugging PEC 3A was observed as well. A kernel message like [ 1211.244319] zpci: 061a:00:00.0: Event 0x2 reports an error for PCI function 0x61a is a reliable indicator of the stated function entering error state with PEC 2. Let me also point out that a kernel message like [ 1211.244325] zpci: 061a:00:00.0: The ism driver bound to the device does not support error recovery is a reliable indicator that the ISM function won't be auto-recovered because the ISM driver currently lacks support for it. On a technical level, without this synchronization, commands (inputs to the FW) may be partially or fully overwritten (corrupted) by another CPU trying to issue commands on the same function. There is hard evidence that this can lead to DMB token values being used as DMB IOVAs, leading to PEC 2 PCI events indicating invalid DMA. But this is only one of the failure modes imaginable. In theory even completely losing one command and executing another one twice and then trying to interpret the outputs as if the command we intended to execute was actually executed and not the other one is also possible. Frankly, I don't feel confident about providing an exhaustive list of possible consequences. 
Fixes: 684b89bc39ce ("s390/ism: add device driver for internal shared memory") Reported-by: Aliaksei Makarau <Aliaksei.Makarau@ibm.com> Tested-by: Mahanta Jambigi <mjambigi@linux.ibm.com> Tested-by: Aliaksei Makarau <Aliaksei.Makarau@ibm.com> Signed-off-by: Halil Pasic <pasic@linux.ibm.com> Reviewed-by: Alexandra Winter <wintera@linux.ibm.com> Signed-off-by: Alexandra Winter <wintera@linux.ibm.com> Reviewed-by: Simon Horman <horms@kernel.org> Link: https://patch.msgid.link/20250722161817.1298473-1-wintera@linux.ibm.com Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-07-22 18:18:17 +02:00
spin_lock(&ism->cmd_lock);
__ism_write_cmd(ism, req + 1, sizeof(*req), req->len - sizeof(*req));
__ism_write_cmd(ism, req, 0, sizeof(*req));
WRITE_ONCE(resp->ret, ISM_ERROR);
__ism_read_cmd(ism, resp, 0, sizeof(*resp));
if (resp->ret) {
debug_text_event(ism_debug_info, 0, "cmd failure");
debug_event(ism_debug_info, 0, resp, sizeof(*resp));
goto out;
}
__ism_read_cmd(ism, resp + 1, sizeof(*resp), resp->len - sizeof(*resp));
out:
s390/ism: fix concurrency management in ism_cmd() The s390x ISM device data sheet clearly states that only one request-response sequence is allowable per ISM function at any point in time. Unfortunately as of today the s390/ism driver in Linux does not honor that requirement. This patch aims to rectify that. This problem was discovered based on Aliaksei's bug report which states that for certain workloads the ISM functions end up entering error state (with PEC 2 as seen from the logs) after a while and as a consequence connections handled by the respective function break, and for future connection requests the ISM device is not considered -- given it is in a dysfunctional state. During further debugging PEC 3A was observed as well. A kernel message like [ 1211.244319] zpci: 061a:00:00.0: Event 0x2 reports an error for PCI function 0x61a is a reliable indicator of the stated function entering error state with PEC 2. Let me also point out that a kernel message like [ 1211.244325] zpci: 061a:00:00.0: The ism driver bound to the device does not support error recovery is a reliable indicator that the ISM function won't be auto-recovered because the ISM driver currently lacks support for it. On a technical level, without this synchronization, commands (inputs to the FW) may be partially or fully overwritten (corrupted) by another CPU trying to issue commands on the same function. There is hard evidence that this can lead to DMB token values being used as DMB IOVAs, leading to PEC 2 PCI events indicating invalid DMA. But this is only one of the failure modes imaginable. In theory even completely losing one command and executing another one twice and then trying to interpret the outputs as if the command we intended to execute was actually executed and not the other one is also possible. Frankly, I don't feel confident about providing an exhaustive list of possible consequences. 
Fixes: 684b89bc39ce ("s390/ism: add device driver for internal shared memory") Reported-by: Aliaksei Makarau <Aliaksei.Makarau@ibm.com> Tested-by: Mahanta Jambigi <mjambigi@linux.ibm.com> Tested-by: Aliaksei Makarau <Aliaksei.Makarau@ibm.com> Signed-off-by: Halil Pasic <pasic@linux.ibm.com> Reviewed-by: Alexandra Winter <wintera@linux.ibm.com> Signed-off-by: Alexandra Winter <wintera@linux.ibm.com> Reviewed-by: Simon Horman <horms@kernel.org> Link: https://patch.msgid.link/20250722161817.1298473-1-wintera@linux.ibm.com Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-07-22 18:18:17 +02:00
spin_unlock(&ism->cmd_lock);
return resp->ret;
}
/*
 * Issue a command that carries nothing but the request header.
 *
 * Return: the result of ism_cmd() for the command @cmd_code.
 */
static int ism_cmd_simple(struct ism_dev *ism, u32 cmd_code)
{
	union ism_cmd_simple simple;
	int rc;

	memset(&simple, 0, sizeof(simple));
	simple.request.hdr.len = sizeof(simple.request);
	simple.request.hdr.cmd = cmd_code;

	rc = ism_cmd(ism, &simple);
	return rc;
}
static int query_info(struct ism_dev *ism)
{
union ism_qi cmd;
memset(&cmd, 0, sizeof(cmd));
cmd.request.hdr.cmd = ISM_QUERY_INFO;
cmd.request.hdr.len = sizeof(cmd.request);
if (ism_cmd(ism, &cmd))
goto out;
debug_text_event(ism_debug_info, 3, "query info");
debug_event(ism_debug_info, 3, &cmd.response, sizeof(cmd.response));
out:
return 0;
}
/*
 * Allocate one page of coherent DMA memory for the SBA and register its DMA
 * address with the device via ISM_REG_SBA. On success the buffer and its DMA
 * address are stored in @ism.
 *
 * Return: 0 on success, -ENOMEM if the allocation fails, -EIO if the command
 * fails (the buffer is freed again in that case).
 */
static int register_sba(struct ism_dev *ism)
{
	struct device *dev = &ism->pdev->dev;
	union ism_reg_sba cmd;
	dma_addr_t sba_dma;
	struct ism_sba *sba;

	sba = dma_alloc_coherent(dev, PAGE_SIZE, &sba_dma, GFP_KERNEL);
	if (!sba)
		return -ENOMEM;

	memset(&cmd, 0, sizeof(cmd));
	cmd.request.hdr.cmd = ISM_REG_SBA;
	cmd.request.hdr.len = sizeof(cmd.request);
	cmd.request.sba = sba_dma;

	if (ism_cmd(ism, &cmd)) {
		dma_free_coherent(dev, PAGE_SIZE, sba, sba_dma);
		return -EIO;
	}

	ism->sba = sba;
	ism->sba_dma_addr = sba_dma;
	return 0;
}
/*
 * Allocate one page of coherent DMA memory for the event queue and register
 * it with the device via ISM_REG_IEQ. On success the queue, its DMA address
 * and the initial read index (-1) are stored in @ism.
 *
 * Return: 0 on success, -ENOMEM if the allocation fails, -EIO if the command
 * fails (the buffer is freed again in that case).
 */
static int register_ieq(struct ism_dev *ism)
{
	struct device *dev = &ism->pdev->dev;
	union ism_reg_ieq cmd;
	dma_addr_t ieq_dma;
	struct ism_eq *ieq;

	ieq = dma_alloc_coherent(dev, PAGE_SIZE, &ieq_dma, GFP_KERNEL);
	if (!ieq)
		return -ENOMEM;

	memset(&cmd, 0, sizeof(cmd));
	cmd.request.hdr.cmd = ISM_REG_IEQ;
	cmd.request.hdr.len = sizeof(cmd.request);
	cmd.request.ieq = ieq_dma;
	cmd.request.len = sizeof(*ieq);

	if (ism_cmd(ism, &cmd)) {
		dma_free_coherent(dev, PAGE_SIZE, ieq, ieq_dma);
		return -EIO;
	}

	ism->ieq = ieq;
	ism->ieq_idx = -1;
	ism->ieq_dma_addr = ieq_dma;
	return 0;
}
/*
 * Unregister the SBA from the device and free its DMA buffer. Does nothing
 * if no SBA is registered.
 *
 * A command result of ISM_ERROR is treated like success and the buffer is
 * freed anyway. NOTE(review): presumably because the device is unusable in
 * that case - confirm.
 *
 * Return: 0 on success, -EIO if the command fails with any other result
 * (the buffer is then kept).
 */
static int unregister_sba(struct ism_dev *ism)
{
	int rc;

	if (!ism->sba)
		return 0;

	rc = ism_cmd_simple(ism, ISM_UNREG_SBA);
	if (rc != 0 && rc != ISM_ERROR)
		return -EIO;

	dma_free_coherent(&ism->pdev->dev, PAGE_SIZE, ism->sba,
			  ism->sba_dma_addr);
	ism->sba_dma_addr = 0;
	ism->sba = NULL;

	return 0;
}
/*
 * Unregister the event queue from the device and free its DMA buffer. Does
 * nothing if no event queue is registered.
 *
 * A command result of ISM_ERROR is treated like success and the buffer is
 * freed anyway. NOTE(review): presumably because the device is unusable in
 * that case - confirm.
 *
 * Return: 0 on success, -EIO if the command fails with any other result
 * (the buffer is then kept).
 */
static int unregister_ieq(struct ism_dev *ism)
{
	int rc;

	if (!ism->ieq)
		return 0;

	rc = ism_cmd_simple(ism, ISM_UNREG_IEQ);
	if (rc != 0 && rc != ISM_ERROR)
		return -EIO;

	dma_free_coherent(&ism->pdev->dev, PAGE_SIZE, ism->ieq,
			  ism->ieq_dma_addr);
	ism->ieq_dma_addr = 0;
	ism->ieq = NULL;

	return 0;
}
static int ism_read_local_gid(struct ism_dev *ism)
{
union ism_read_gid cmd;
int ret;
memset(&cmd, 0, sizeof(cmd));
cmd.request.hdr.cmd = ISM_READ_GID;
cmd.request.hdr.len = sizeof(cmd.request);
ret = ism_cmd(ism, &cmd);
if (ret)
goto out;
ism->local_gid = cmd.response.gid;
out:
return ret;
}
/*
 * Undo ism_alloc_dmb(): release the DMB's slot in the SBA bitmap, remove the
 * DMA mapping and drop the reference on the folio backing the buffer.
 */
static void ism_free_dmb(struct ism_dev *ism, struct ism_dmb *dmb)
{
	struct device *dev = &ism->pdev->dev;

	clear_bit(dmb->sba_idx, ism->sba_bitmap);
	dma_unmap_page(dev, dmb->dma_addr, dmb->dmb_len, DMA_FROM_DEVICE);
	folio_put(virt_to_folio(dmb->cpu_addr));
}
/*
 * Reserve an SBA slot for @dmb, allocate its backing memory and map it for
 * DMA from the device.
 *
 * If dmb->sba_idx is 0 the first free slot at or above ISM_DMB_BIT_OFFSET is
 * picked; otherwise the requested slot is used. On success dmb->sba_idx,
 * dmb->cpu_addr and dmb->dma_addr are valid.
 *
 * Return: 0 on success,
 *	   -EINVAL if the page-aligned length exceeds the device's maximum
 *		   DMA segment size, or the slot is below ISM_DMB_BIT_OFFSET
 *		   or already taken,
 *	   -ENOSPC if no free slot is left,
 *	   -ENOMEM if the allocation or the DMA mapping fails.
 */
static int ism_alloc_dmb(struct ism_dev *ism, struct ism_dmb *dmb)
{
	struct folio *folio;
	unsigned long bit;
	int rc;

	if (PAGE_ALIGN(dmb->dmb_len) > dma_get_max_seg_size(&ism->pdev->dev))
		return -EINVAL;

	if (!dmb->sba_idx) {
		bit = find_next_zero_bit(ism->sba_bitmap, ISM_NR_DMBS,
					 ISM_DMB_BIT_OFFSET);
		if (bit == ISM_NR_DMBS)
			return -ENOSPC;

		dmb->sba_idx = bit;
	}
	if (dmb->sba_idx < ISM_DMB_BIT_OFFSET ||
	    test_and_set_bit(dmb->sba_idx, ism->sba_bitmap))
		return -EINVAL;

	folio = folio_alloc(GFP_KERNEL | __GFP_NOWARN | __GFP_NOMEMALLOC |
			    __GFP_NORETRY, get_order(dmb->dmb_len));
	if (!folio) {
		rc = -ENOMEM;
		goto out_bit;
	}

	dmb->cpu_addr = folio_address(folio);
	dmb->dma_addr = dma_map_page(&ism->pdev->dev,
				     virt_to_page(dmb->cpu_addr), 0,
				     dmb->dmb_len, DMA_FROM_DEVICE);
	if (dma_mapping_error(&ism->pdev->dev, dmb->dma_addr)) {
		rc = -ENOMEM;
		goto out_free;
	}

	return 0;

out_free:
	/*
	 * The buffer comes from folio_alloc(), not from the slab allocator,
	 * so it has to be released with folio_put() (as ism_free_dmb() does)
	 * rather than kfree().
	 */
	folio_put(folio);
out_bit:
	clear_bit(dmb->sba_idx, ism->sba_bitmap);
	return rc;
}
/*
 * Allocate a DMB for @client and register it with the device.
 *
 * The buffer is set up by ism_alloc_dmb() and announced to the device with
 * ISM_REG_DMB. On success the token returned by the device is stored in
 * dmb->dmb_tok and @client is recorded as owner of the DMB's slot so that
 * the interrupt handler can route signals for it.
 *
 * Return: 0 on success, otherwise the non-zero result of ism_alloc_dmb() or
 * ism_cmd(); if the command fails the buffer is freed again.
 */
int ism_register_dmb(struct ism_dev *ism, struct ism_dmb *dmb,
		     struct ism_client *client)
{
	union ism_reg_dmb cmd;
	unsigned long irq_flags;
	int rc;

	rc = ism_alloc_dmb(ism, dmb);
	if (rc)
		return rc;

	memset(&cmd, 0, sizeof(cmd));
	cmd.request.hdr.cmd = ISM_REG_DMB;
	cmd.request.hdr.len = sizeof(cmd.request);

	cmd.request.dmb = dmb->dma_addr;
	cmd.request.dmb_len = dmb->dmb_len;
	cmd.request.sba_idx = dmb->sba_idx;
	cmd.request.vlan_valid = dmb->vlan_valid;
	cmd.request.vlan_id = dmb->vlan_id;
	cmd.request.rgid = dmb->rgid;

	rc = ism_cmd(ism, &cmd);
	if (rc) {
		ism_free_dmb(ism, dmb);
		return rc;
	}

	dmb->dmb_tok = cmd.response.dmb_tok;

	spin_lock_irqsave(&ism->lock, irq_flags);
	ism->sba_client_arr[dmb->sba_idx - ISM_DMB_BIT_OFFSET] = client->id;
	spin_unlock_irqrestore(&ism->lock, irq_flags);

	return 0;
}
EXPORT_SYMBOL_GPL(ism_register_dmb);
/*
 * Unregister @dmb from the device and free its memory.
 *
 * The owner entry of the DMB's slot is reset to NO_CLIENT before the
 * ISM_UNREG_DMB command is issued. The buffer is freed when the command
 * succeeds or reports ISM_ERROR; for any other result it is kept.
 *
 * Return: the result of ism_cmd() (0 on success; may be ISM_ERROR even
 * though the buffer has been freed).
 */
int ism_unregister_dmb(struct ism_dev *ism, struct ism_dmb *dmb)
{
	union ism_unreg_dmb cmd;
	unsigned long irq_flags;
	int rc;

	memset(&cmd, 0, sizeof(cmd));
	cmd.request.hdr.cmd = ISM_UNREG_DMB;
	cmd.request.hdr.len = sizeof(cmd.request);
	cmd.request.dmb_tok = dmb->dmb_tok;

	spin_lock_irqsave(&ism->lock, irq_flags);
	ism->sba_client_arr[dmb->sba_idx - ISM_DMB_BIT_OFFSET] = NO_CLIENT;
	spin_unlock_irqrestore(&ism->lock, irq_flags);

	rc = ism_cmd(ism, &cmd);
	if (!rc || rc == ISM_ERROR)
		ism_free_dmb(ism, dmb);

	return rc;
}
EXPORT_SYMBOL_GPL(ism_unregister_dmb);
/*
 * Ask the device to add @vlan_id (command ISM_ADD_VLAN_ID).
 *
 * Return: the result of ism_cmd(), 0 on success.
 */
static int ism_add_vlan_id(struct ism_dev *ism, u64 vlan_id)
{
	union ism_set_vlan_id vlan_cmd;

	memset(&vlan_cmd, 0, sizeof(vlan_cmd));
	vlan_cmd.request.hdr.len = sizeof(vlan_cmd.request);
	vlan_cmd.request.hdr.cmd = ISM_ADD_VLAN_ID;
	vlan_cmd.request.vlan_id = vlan_id;

	return ism_cmd(ism, &vlan_cmd);
}
/*
 * Ask the device to remove @vlan_id (command ISM_DEL_VLAN_ID).
 *
 * Return: the result of ism_cmd(), 0 on success.
 */
static int ism_del_vlan_id(struct ism_dev *ism, u64 vlan_id)
{
	union ism_set_vlan_id vlan_cmd;

	memset(&vlan_cmd, 0, sizeof(vlan_cmd));
	vlan_cmd.request.hdr.len = sizeof(vlan_cmd.request);
	vlan_cmd.request.hdr.cmd = ISM_DEL_VLAN_ID;
	vlan_cmd.request.vlan_id = vlan_id;

	return ism_cmd(ism, &vlan_cmd);
}
/*
 * Number of bytes that can be handled starting at offset @start without
 * crossing the next multiple of @boundary, limited to @len. The mask
 * arithmetic assumes that @boundary is a power of two.
 */
static unsigned int max_bytes(unsigned int start, unsigned int len,
			      unsigned int boundary)
{
	unsigned int room = boundary - (start & (boundary - 1));

	return room < len ? room : len;
}
/*
 * Move @size bytes from @data into the DMB identified by @dmb_tok, starting
 * at @offset.
 *
 * The transfer is split into chunks that never cross a PAGE_SIZE boundary of
 * the target offset. @sf is only passed on with the final chunk; all earlier
 * chunks are issued with 0 in its place.
 *
 * Return: 0 on success, otherwise the first non-zero result of __ism_move().
 */
int ism_move(struct ism_dev *ism, u64 dmb_tok, unsigned int idx, bool sf,
	     unsigned int offset, void *data, unsigned int size)
{
	while (size) {
		unsigned int chunk = max_bytes(offset, size, PAGE_SIZE);
		u64 request;
		int rc;

		request = ISM_CREATE_REQ(dmb_tok, idx, size == chunk ? sf : 0,
					 offset);
		rc = __ism_move(ism, request, data, chunk);
		if (rc)
			return rc;

		offset += chunk;
		data += chunk;
		size -= chunk;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(ism_move);
static void ism_handle_event(struct ism_dev *ism)
{
struct ism_event *entry;
2023-07-07 12:56:20 +02:00
struct ism_client *clt;
int i;
while ((ism->ieq_idx + 1) != READ_ONCE(ism->ieq->header.idx)) {
if (++(ism->ieq_idx) == ARRAY_SIZE(ism->ieq->entry))
ism->ieq_idx = 0;
entry = &ism->ieq->entry[ism->ieq_idx];
debug_event(ism_debug_info, 2, entry, sizeof(*entry));
2023-07-07 12:56:20 +02:00
for (i = 0; i < max_client; ++i) {
clt = ism->subs[i];
if (clt)
clt->handle_event(ism, entry);
}
}
}
static irqreturn_t ism_handle_irq(int irq, void *data)
{
struct ism_dev *ism = data;
unsigned long bit, end;
unsigned long *bv;
u16 dmbemask;
2023-07-07 12:56:20 +02:00
u8 client_id;
bv = (void *) &ism->sba->dmb_bits[ISM_DMB_WORD_OFFSET];
end = sizeof(ism->sba->dmb_bits) * BITS_PER_BYTE - ISM_DMB_BIT_OFFSET;
spin_lock(&ism->lock);
ism->sba->s = 0;
barrier();
for (bit = 0;;) {
bit = find_next_bit_inv(bv, end, bit);
if (bit >= end)
break;
clear_bit_inv(bit, bv);
dmbemask = ism->sba->dmbe_mask[bit + ISM_DMB_BIT_OFFSET];
ism->sba->dmbe_mask[bit + ISM_DMB_BIT_OFFSET] = 0;
barrier();
2023-07-07 12:56:20 +02:00
client_id = ism->sba_client_arr[bit];
if (unlikely(client_id == NO_CLIENT || !ism->subs[client_id]))
continue;
ism->subs[client_id]->handle_irq(ism, bit + ISM_DMB_BIT_OFFSET, dmbemask);
}
if (ism->sba->e) {
ism->sba->e = 0;
barrier();
ism_handle_event(ism);
}
spin_unlock(&ism->lock);
return IRQ_HANDLED;
}
static int ism_dev_init(struct ism_dev *ism)
{
struct pci_dev *pdev = ism->pdev;
int i, ret;
ret = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_MSI);
if (ret <= 0)
goto out;
ism->sba_client_arr = kzalloc(ISM_NR_DMBS, GFP_KERNEL);
if (!ism->sba_client_arr)
goto free_vectors;
memset(ism->sba_client_arr, NO_CLIENT, ISM_NR_DMBS);
ret = request_irq(pci_irq_vector(pdev, 0), ism_handle_irq, 0,
pci_name(pdev), ism);
if (ret)
goto free_client_arr;
ret = register_sba(ism);
if (ret)
goto free_irq;
ret = register_ieq(ism);
if (ret)
goto unreg_sba;
ret = ism_read_local_gid(ism);
if (ret)
goto unreg_ieq;
if (!ism_add_vlan_id(ism, ISM_RESERVED_VLANID))
/* hardware is V2 capable */
ism_v2_capable = true;
else
ism_v2_capable = false;
s390/ism: Fix and simplify add()/remove() callback handling Previously the clients_lock was protecting the clients array against concurrent addition/removal of clients but was also accessed from IRQ context. This meant that it had to be a spinlock and that the add() and remove() callbacks in which clients need to do allocation and take mutexes can't be called under the clients_lock. To work around this these callbacks were moved to workqueues. This not only introduced significant complexity but is also subtly broken in at least one way. In ism_dev_init() and ism_dev_exit() clients[i]->tgt_ism is used to communicate the added/removed ISM device to the work function. While write access to client[i]->tgt_ism is protected by the clients_lock and the code waits that there is no pending add/remove work before and after setting clients[i]->tgt_ism this is not enough. The problem is that the wait happens based on per ISM device counters. Thus a concurrent ism_dev_init()/ism_dev_exit() for a different ISM device may overwrite a clients[i]->tgt_ism between unlocking the clients_lock and the subsequent wait for the work to finnish. Thankfully with the clients_lock no longer held in IRQ context it can be turned into a mutex which can be held during the calls to add()/remove() completely removing the need for the workqueues and the associated broken housekeeping including the per ISM device counters and the clients[i]->tgt_ism. Fixes: 89e7d2ba61b7 ("net/ism: Add new API for client registration") Signed-off-by: Niklas Schnelle <schnelle@linux.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2023-07-07 12:56:21 +02:00
mutex_lock(&ism_dev_list.mutex);
mutex_lock(&clients_lock);
for (i = 0; i < max_client; ++i) {
if (clients[i]) {
s390/ism: Fix and simplify add()/remove() callback handling Previously the clients_lock was protecting the clients array against concurrent addition/removal of clients but was also accessed from IRQ context. This meant that it had to be a spinlock and that the add() and remove() callbacks in which clients need to do allocation and take mutexes can't be called under the clients_lock. To work around this these callbacks were moved to workqueues. This not only introduced significant complexity but is also subtly broken in at least one way. In ism_dev_init() and ism_dev_exit() clients[i]->tgt_ism is used to communicate the added/removed ISM device to the work function. While write access to client[i]->tgt_ism is protected by the clients_lock and the code waits that there is no pending add/remove work before and after setting clients[i]->tgt_ism this is not enough. The problem is that the wait happens based on per ISM device counters. Thus a concurrent ism_dev_init()/ism_dev_exit() for a different ISM device may overwrite a clients[i]->tgt_ism between unlocking the clients_lock and the subsequent wait for the work to finnish. Thankfully with the clients_lock no longer held in IRQ context it can be turned into a mutex which can be held during the calls to add()/remove() completely removing the need for the workqueues and the associated broken housekeeping including the per ISM device counters and the clients[i]->tgt_ism. Fixes: 89e7d2ba61b7 ("net/ism: Add new API for client registration") Signed-off-by: Niklas Schnelle <schnelle@linux.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2023-07-07 12:56:21 +02:00
clients[i]->add(ism);
ism_setup_forwarding(clients[i], ism);
}
s390/ism: Fix and simplify add()/remove() callback handling Previously the clients_lock was protecting the clients array against concurrent addition/removal of clients but was also accessed from IRQ context. This meant that it had to be a spinlock and that the add() and remove() callbacks in which clients need to do allocation and take mutexes can't be called under the clients_lock. To work around this these callbacks were moved to workqueues. This not only introduced significant complexity but is also subtly broken in at least one way. In ism_dev_init() and ism_dev_exit() clients[i]->tgt_ism is used to communicate the added/removed ISM device to the work function. While write access to client[i]->tgt_ism is protected by the clients_lock and the code waits that there is no pending add/remove work before and after setting clients[i]->tgt_ism this is not enough. The problem is that the wait happens based on per ISM device counters. Thus a concurrent ism_dev_init()/ism_dev_exit() for a different ISM device may overwrite a clients[i]->tgt_ism between unlocking the clients_lock and the subsequent wait for the work to finnish. Thankfully with the clients_lock no longer held in IRQ context it can be turned into a mutex which can be held during the calls to add()/remove() completely removing the need for the workqueues and the associated broken housekeeping including the per ISM device counters and the clients[i]->tgt_ism. Fixes: 89e7d2ba61b7 ("net/ism: Add new API for client registration") Signed-off-by: Niklas Schnelle <schnelle@linux.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2023-07-07 12:56:21 +02:00
}
mutex_unlock(&clients_lock);
list_add(&ism->list, &ism_dev_list.list);
mutex_unlock(&ism_dev_list.mutex);
query_info(ism);
return 0;
unreg_ieq:
unregister_ieq(ism);
unreg_sba:
unregister_sba(ism);
free_irq:
free_irq(pci_irq_vector(pdev, 0), ism);
free_client_arr:
kfree(ism->sba_client_arr);
free_vectors:
pci_free_irq_vectors(pdev);
out:
return ret;
}
/*
 * Release callback of the embedded struct device: frees the struct ism_dev
 * that contains @dev.
 */
static void ism_dev_release(struct device *dev)
{
	kfree(container_of(dev, struct ism_dev, dev));
}
static int ism_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
struct ism_dev *ism;
int ret;
ism = kzalloc(sizeof(*ism), GFP_KERNEL);
if (!ism)
return -ENOMEM;
spin_lock_init(&ism->lock);
s390/ism: fix concurrency management in ism_cmd() The s390x ISM device data sheet clearly states that only one request-response sequence is allowable per ISM function at any point in time. Unfortunately as of today the s390/ism driver in Linux does not honor that requirement. This patch aims to rectify that. This problem was discovered based on Aliaksei's bug report which states that for certain workloads the ISM functions end up entering error state (with PEC 2 as seen from the logs) after a while and as a consequence connections handled by the respective function break, and for future connection requests the ISM device is not considered -- given it is in a dysfunctional state. During further debugging PEC 3A was observed as well. A kernel message like [ 1211.244319] zpci: 061a:00:00.0: Event 0x2 reports an error for PCI function 0x61a is a reliable indicator of the stated function entering error state with PEC 2. Let me also point out that a kernel message like [ 1211.244325] zpci: 061a:00:00.0: The ism driver bound to the device does not support error recovery is a reliable indicator that the ISM function won't be auto-recovered because the ISM driver currently lacks support for it. On a technical level, without this synchronization, commands (inputs to the FW) may be partially or fully overwritten (corrupted) by another CPU trying to issue commands on the same function. There is hard evidence that this can lead to DMB token values being used as DMB IOVAs, leading to PEC 2 PCI events indicating invalid DMA. But this is only one of the failure modes imaginable. In theory even completely losing one command and executing another one twice and then trying to interpret the outputs as if the command we intended to execute was actually executed and not the other one is also possible. Frankly, I don't feel confident about providing an exhaustive list of possible consequences. 
Fixes: 684b89bc39ce ("s390/ism: add device driver for internal shared memory") Reported-by: Aliaksei Makarau <Aliaksei.Makarau@ibm.com> Tested-by: Mahanta Jambigi <mjambigi@linux.ibm.com> Tested-by: Aliaksei Makarau <Aliaksei.Makarau@ibm.com> Signed-off-by: Halil Pasic <pasic@linux.ibm.com> Reviewed-by: Alexandra Winter <wintera@linux.ibm.com> Signed-off-by: Alexandra Winter <wintera@linux.ibm.com> Reviewed-by: Simon Horman <horms@kernel.org> Link: https://patch.msgid.link/20250722161817.1298473-1-wintera@linux.ibm.com Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2025-07-22 18:18:17 +02:00
spin_lock_init(&ism->cmd_lock);
dev_set_drvdata(&pdev->dev, ism);
ism->pdev = pdev;
ism->dev.parent = &pdev->dev;
ism->dev.release = ism_dev_release;
device_initialize(&ism->dev);
dev_set_name(&ism->dev, "%s", dev_name(&pdev->dev));
ret = device_add(&ism->dev);
if (ret)
goto err_dev;
ret = pci_enable_device_mem(pdev);
if (ret)
goto err;
ret = pci_request_mem_regions(pdev, DRV_NAME);
if (ret)
goto err_disable;
ret = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
if (ret)
goto err_resource;
dma_set_seg_boundary(&pdev->dev, SZ_1M - 1);
dma_set_max_seg_size(&pdev->dev, SZ_1M);
pci_set_master(pdev);
ret = ism_dev_init(ism);
if (ret)
goto err_resource;
return 0;
err_resource:
pci_release_mem_regions(pdev);
err_disable:
pci_disable_device(pdev);
err:
device_del(&ism->dev);
err_dev:
dev_set_drvdata(&pdev->dev, NULL);
put_device(&ism->dev);
return ret;
}
/*
 * ism_dev_exit() - tear down the runtime state of one ISM device.
 * @ism: device being shut down.
 *
 * Order of operations as visible in this function:
 *  1. clear every ism->subs[] slot below max_client under ism->lock,
 *  2. take ism_dev_list.mutex, then clients_lock, and call ->remove(ism)
 *     on every registered client,
 *  3. drop clients_lock, remove the reserved VLAN id when ism_v2_capable
 *     is set, unregister IEQ and SBA, free the IRQ, the sba_client_arr
 *     buffer and the IRQ vectors,
 *  4. unlink the device from its list and drop ism_dev_list.mutex.
 *
 * NOTE(review): the long non-code lines inside this body (commit message
 * text followed by a timestamp) look like version-control annotations that
 * were captured together with the code; they are kept untouched here.
 */
static void ism_dev_exit(struct ism_dev *ism)
{
struct pci_dev *pdev = ism->pdev;
unsigned long flags;
int i;
s390/ism: Fix and simplify add()/remove() callback handling Previously the clients_lock was protecting the clients array against concurrent addition/removal of clients but was also accessed from IRQ context. This meant that it had to be a spinlock and that the add() and remove() callbacks in which clients need to do allocation and take mutexes can't be called under the clients_lock. To work around this these callbacks were moved to workqueues. This not only introduced significant complexity but is also subtly broken in at least one way. In ism_dev_init() and ism_dev_exit() clients[i]->tgt_ism is used to communicate the added/removed ISM device to the work function. While write access to client[i]->tgt_ism is protected by the clients_lock and the code waits that there is no pending add/remove work before and after setting clients[i]->tgt_ism this is not enough. The problem is that the wait happens based on per ISM device counters. Thus a concurrent ism_dev_init()/ism_dev_exit() for a different ISM device may overwrite a clients[i]->tgt_ism between unlocking the clients_lock and the subsequent wait for the work to finnish. Thankfully with the clients_lock no longer held in IRQ context it can be turned into a mutex which can be held during the calls to add()/remove() completely removing the need for the workqueues and the associated broken housekeeping including the per ISM device counters and the clients[i]->tgt_ism. Fixes: 89e7d2ba61b7 ("net/ism: Add new API for client registration") Signed-off-by: Niklas Schnelle <schnelle@linux.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2023-07-07 12:56:21 +02:00
/* Clear all subscriber slots with ism->lock held and IRQ state saved. */
spin_lock_irqsave(&ism->lock, flags);
for (i = 0; i < max_client; ++i)
s390/ism: Fix and simplify add()/remove() callback handling Previously the clients_lock was protecting the clients array against concurrent addition/removal of clients but was also accessed from IRQ context. This meant that it had to be a spinlock and that the add() and remove() callbacks in which clients need to do allocation and take mutexes can't be called under the clients_lock. To work around this these callbacks were moved to workqueues. This not only introduced significant complexity but is also subtly broken in at least one way. In ism_dev_init() and ism_dev_exit() clients[i]->tgt_ism is used to communicate the added/removed ISM device to the work function. While write access to client[i]->tgt_ism is protected by the clients_lock and the code waits that there is no pending add/remove work before and after setting clients[i]->tgt_ism this is not enough. The problem is that the wait happens based on per ISM device counters. Thus a concurrent ism_dev_init()/ism_dev_exit() for a different ISM device may overwrite a clients[i]->tgt_ism between unlocking the clients_lock and the subsequent wait for the work to finnish. Thankfully with the clients_lock no longer held in IRQ context it can be turned into a mutex which can be held during the calls to add()/remove() completely removing the need for the workqueues and the associated broken housekeeping including the per ISM device counters and the clients[i]->tgt_ism. Fixes: 89e7d2ba61b7 ("net/ism: Add new API for client registration") Signed-off-by: Niklas Schnelle <schnelle@linux.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2023-07-07 12:56:21 +02:00
ism->subs[i] = NULL;
spin_unlock_irqrestore(&ism->lock, flags);
s390/ism: Fix and simplify add()/remove() callback handling Previously the clients_lock was protecting the clients array against concurrent addition/removal of clients but was also accessed from IRQ context. This meant that it had to be a spinlock and that the add() and remove() callbacks in which clients need to do allocation and take mutexes can't be called under the clients_lock. To work around this these callbacks were moved to workqueues. This not only introduced significant complexity but is also subtly broken in at least one way. In ism_dev_init() and ism_dev_exit() clients[i]->tgt_ism is used to communicate the added/removed ISM device to the work function. While write access to client[i]->tgt_ism is protected by the clients_lock and the code waits that there is no pending add/remove work before and after setting clients[i]->tgt_ism this is not enough. The problem is that the wait happens based on per ISM device counters. Thus a concurrent ism_dev_init()/ism_dev_exit() for a different ISM device may overwrite a clients[i]->tgt_ism between unlocking the clients_lock and the subsequent wait for the work to finnish. Thankfully with the clients_lock no longer held in IRQ context it can be turned into a mutex which can be held during the calls to add()/remove() completely removing the need for the workqueues and the associated broken housekeeping including the per ISM device counters and the clients[i]->tgt_ism. Fixes: 89e7d2ba61b7 ("net/ism: Add new API for client registration") Signed-off-by: Niklas Schnelle <schnelle@linux.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2023-07-07 12:56:21 +02:00
/*
 * Lock nesting: ism_dev_list.mutex is the outer lock and stays held
 * until the device is unlinked below; clients_lock is the inner lock
 * and only covers the ->remove() callbacks.
 */
mutex_lock(&ism_dev_list.mutex);
mutex_lock(&clients_lock);
for (i = 0; i < max_client; ++i) {
/* Slots in clients[] may be empty; skip those. */
if (clients[i])
clients[i]->remove(ism);
}
mutex_unlock(&clients_lock);
/* The reserved VLAN id is only removed when ism_v2_capable is set. */
if (ism_v2_capable)
ism_del_vlan_id(ism, ISM_RESERVED_VLANID);
unregister_ieq(ism);
unregister_sba(ism);
/* Release the handler on vector 0 before the vectors themselves go away. */
free_irq(pci_irq_vector(pdev, 0), ism);
kfree(ism->sba_client_arr);
pci_free_irq_vectors(pdev);
list_del_init(&ism->list);
s390/ism: Fix and simplify add()/remove() callback handling Previously the clients_lock was protecting the clients array against concurrent addition/removal of clients but was also accessed from IRQ context. This meant that it had to be a spinlock and that the add() and remove() callbacks in which clients need to do allocation and take mutexes can't be called under the clients_lock. To work around this these callbacks were moved to workqueues. This not only introduced significant complexity but is also subtly broken in at least one way. In ism_dev_init() and ism_dev_exit() clients[i]->tgt_ism is used to communicate the added/removed ISM device to the work function. While write access to client[i]->tgt_ism is protected by the clients_lock and the code waits that there is no pending add/remove work before and after setting clients[i]->tgt_ism this is not enough. The problem is that the wait happens based on per ISM device counters. Thus a concurrent ism_dev_init()/ism_dev_exit() for a different ISM device may overwrite a clients[i]->tgt_ism between unlocking the clients_lock and the subsequent wait for the work to finnish. Thankfully with the clients_lock no longer held in IRQ context it can be turned into a mutex which can be held during the calls to add()/remove() completely removing the need for the workqueues and the associated broken housekeeping including the per ISM device counters and the clients[i]->tgt_ism. Fixes: 89e7d2ba61b7 ("net/ism: Add new API for client registration") Signed-off-by: Niklas Schnelle <schnelle@linux.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2023-07-07 12:56:21 +02:00
mutex_unlock(&ism_dev_list.mutex);
}
/*
 * ism_remove() - PCI .remove callback for the ISM driver.
 * @pdev: PCI function being unbound.
 *
 * Shuts the ISM device down via ism_dev_exit(), gives back the PCI memory
 * regions, disables the PCI device, removes the embedded struct device,
 * clears the driver data pointer and finally drops the device reference.
 */
static void ism_remove(struct pci_dev *pdev)
{
	struct device *parent = &pdev->dev;
	struct ism_dev *ism = dev_get_drvdata(parent);

	ism_dev_exit(ism);

	/* PCI resources first ... */
	pci_release_mem_regions(pdev);
	pci_disable_device(pdev);

	/* ... then the ISM struct device and the back pointer to it. */
	device_del(&ism->dev);
	dev_set_drvdata(parent, NULL);
	put_device(&ism->dev);
}
/*
 * PCI driver description for ISM: binds to the IDs in ism_device_table and
 * routes probe/remove to ism_probe()/ism_remove().
 */
static struct pci_driver ism_driver = {
.name = DRV_NAME,
.id_table = ism_device_table,
.probe = ism_probe,
.remove = ism_remove,
};
/*
 * ism_init() - module entry point.
 *
 * Registers the "ism" debug area, resets the client table and registers
 * the PCI driver.
 *
 * Return: 0 on success, -ENODEV if the debug area could not be registered,
 * otherwise the error code reported by pci_register_driver().
 */
static int __init ism_init(void)
{
	int rc;

	ism_debug_info = debug_register("ism", 2, 1, 16);
	if (!ism_debug_info)
		return -ENODEV;

	/* Start with an empty client table. */
	max_client = 0;
	memset(clients, 0, sizeof(clients));

	debug_register_view(ism_debug_info, &debug_hex_ascii_view);

	rc = pci_register_driver(&ism_driver);
	if (!rc)
		return 0;

	/* Driver registration failed: undo the debug registration. */
	debug_unregister(ism_debug_info);
	return rc;
}
/*
 * ism_exit() - module exit point.
 *
 * Unregisters the PCI driver first and the debug area second, i.e. in the
 * reverse order of their registration in ism_init().
 */
static void __exit ism_exit(void)
{
pci_unregister_driver(&ism_driver);
debug_unregister(ism_debug_info);
}
/* Hook ism_init()/ism_exit() up as the module's load and unload handlers. */
module_init(ism_init);
module_exit(ism_exit);
/*************************** SMC-D Implementation *****************************/
#if IS_ENABLED(CONFIG_SMC)
/*
 * ism_query_rgid() - issue an ISM_QUERY_RGID command.
 * @ism: device the command is sent to.
 * @rgid: value stored in the request's rgid field.
 * @vid_valid: value stored in the request's vlan_valid field.
 * @vid: value stored in the request's vlan_id field.
 *
 * Return: the result of ism_cmd().
 */
static int ism_query_rgid(struct ism_dev *ism, u64 rgid, u32 vid_valid,
			  u32 vid)
{
	union ism_query_rgid query;

	/* Zero the whole command block before filling in the request. */
	memset(&query, 0, sizeof(query));

	query.request.rgid = rgid;
	query.request.vlan_id = vid;
	query.request.vlan_valid = vid_valid;
	query.request.hdr.len = sizeof(query.request);
	query.request.hdr.cmd = ISM_QUERY_RGID;

	return ism_cmd(ism, &query);
}
/*
 * smcd_ops.query_remote_gid: forward to ism_query_rgid() using the ISM
 * device stored in smcd->priv and the gid member of @rgid.
 */
static int smcd_query_rgid(struct smcd_dev *smcd, struct smcd_gid *rgid,
			   u32 vid_valid, u32 vid)
{
	struct ism_dev *ism = smcd->priv;

	return ism_query_rgid(ism, rgid->gid, vid_valid, vid);
}
/*
 * smcd_ops.register_dmb: forward to ism_register_dmb().
 *
 * NOTE(review): the smcd_dmb pointer is cast to struct ism_dmb *, so the
 * two structures are presumably layout-compatible - confirm against their
 * definitions.
 */
static int smcd_register_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb,
			     void *client)
{
	struct ism_dmb *ism_dmb = (struct ism_dmb *)dmb;

	return ism_register_dmb(smcd->priv, ism_dmb, client);
}
/*
 * smcd_ops.unregister_dmb: forward to ism_unregister_dmb().
 *
 * NOTE(review): relies on the same smcd_dmb -> ism_dmb pointer cast as the
 * register path; presumably the layouts match - confirm.
 */
static int smcd_unregister_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb)
{
	struct ism_dmb *ism_dmb = (struct ism_dmb *)dmb;

	return ism_unregister_dmb(smcd->priv, ism_dmb);
}
/* smcd_ops.add_vlan_id: forward @vlan_id to ism_add_vlan_id(). */
static int smcd_add_vlan_id(struct smcd_dev *smcd, u64 vlan_id)
{
	struct ism_dev *ism = smcd->priv;

	return ism_add_vlan_id(ism, vlan_id);
}
/* smcd_ops.del_vlan_id: forward @vlan_id to ism_del_vlan_id(). */
static int smcd_del_vlan_id(struct smcd_dev *smcd, u64 vlan_id)
{
	struct ism_dev *ism = smcd->priv;

	return ism_del_vlan_id(ism, vlan_id);
}
/* smcd_ops.set_vlan_required: send the ISM_SET_VLAN simple command. */
static int smcd_set_vlan_required(struct smcd_dev *smcd)
{
	struct ism_dev *ism = smcd->priv;

	return ism_cmd_simple(ism, ISM_SET_VLAN);
}
/* smcd_ops.reset_vlan_required: send the ISM_RESET_VLAN simple command. */
static int smcd_reset_vlan_required(struct smcd_dev *smcd)
{
	struct ism_dev *ism = smcd->priv;

	return ism_cmd_simple(ism, ISM_RESET_VLAN);
}
/*
 * ism_signal_ieq() - issue an ISM_SIGNAL_IEQ command.
 * @ism: device the command is sent to.
 * @rgid: value stored in the request's rgid field.
 * @trigger_irq: value stored in the request's trigger_irq field.
 * @event_code: value stored in the request's event_code field.
 * @info: value stored in the request's info field.
 *
 * Return: the result of ism_cmd().
 */
static int ism_signal_ieq(struct ism_dev *ism, u64 rgid, u32 trigger_irq,
			  u32 event_code, u64 info)
{
	union ism_sig_ieq sig;

	/* Zero the whole command block before filling in the request. */
	memset(&sig, 0, sizeof(sig));

	sig.request.rgid = rgid;
	sig.request.info = info;
	sig.request.event_code = event_code;
	sig.request.trigger_irq = trigger_irq;
	sig.request.hdr.len = sizeof(sig.request);
	sig.request.hdr.cmd = ISM_SIGNAL_IEQ;

	return ism_cmd(ism, &sig);
}
/*
 * smcd_ops.signal_event: forward to ism_signal_ieq() using the ISM device
 * stored in smcd->priv and the gid member of @rgid.
 */
static int smcd_signal_ieq(struct smcd_dev *smcd, struct smcd_gid *rgid,
			   u32 trigger_irq, u32 event_code, u64 info)
{
	struct ism_dev *ism = smcd->priv;

	return ism_signal_ieq(ism, rgid->gid, trigger_irq, event_code, info);
}
/*
 * smcd_ops.move_data: pass all arguments through unchanged to ism_move()
 * for the ISM device stored in smcd->priv.
 */
static int smcd_move(struct smcd_dev *smcd, u64 dmb_tok, unsigned int idx,
		     bool sf, unsigned int offset, void *data,
		     unsigned int size)
{
	struct ism_dev *ism = smcd->priv;

	return ism_move(ism, dmb_tok, idx, sf, offset, data, size);
}
/* smcd_ops.supports_v2: report the file-level ism_v2_capable value. */
static int smcd_supports_v2(void)
{
return ism_v2_capable;
}
/* Return the local_gid value stored in @ism. */
static u64 ism_get_local_gid(struct ism_dev *ism)
{
return ism->local_gid;
}
/*
 * smcd_ops.get_local_gid: fill @smcd_gid with the ISM device's local GID.
 * The extension part (gid_ext) is always set to 0.
 */
static void smcd_get_local_gid(struct smcd_dev *smcd,
			       struct smcd_gid *smcd_gid)
{
	struct ism_dev *ism = smcd->priv;

	smcd_gid->gid_ext = 0;
	smcd_gid->gid = ism_get_local_gid(ism);
}
/*
 * ism_get_chid() - look up the pchid of the zPCI function behind @ism.
 *
 * Return: the pchid, or 0 when @ism or its pdev pointer is NULL.
 */
static u16 ism_get_chid(struct ism_dev *ism)
{
	if (ism && ism->pdev)
		return to_zpci(ism->pdev)->pchid;

	return 0;
}
/* smcd_ops.get_chid: forward to ism_get_chid() for smcd->priv. */
static u16 smcd_get_chid(struct smcd_dev *smcd)
{
	struct ism_dev *ism = smcd->priv;

	return ism_get_chid(ism);
}
/*
 * smcd_ops.get_dev: return the struct device embedded in the ISM device
 * that dev->priv points to.
 */
static inline struct device *smcd_get_dev(struct smcd_dev *dev)
{
	return &((struct ism_dev *)dev->priv)->dev;
}
/*
 * SMC-D operations table: maps each smcd_ops callback to the smcd_*()
 * wrapper defined in this file, which in turn calls into the ISM driver.
 */
static const struct smcd_ops ism_ops = {
.query_remote_gid = smcd_query_rgid,
.register_dmb = smcd_register_dmb,
.unregister_dmb = smcd_unregister_dmb,
.add_vlan_id = smcd_add_vlan_id,
.del_vlan_id = smcd_del_vlan_id,
.set_vlan_required = smcd_set_vlan_required,
.reset_vlan_required = smcd_reset_vlan_required,
.signal_event = smcd_signal_ieq,
.move_data = smcd_move,
.supports_v2 = smcd_supports_v2,
.get_local_gid = smcd_get_local_gid,
.get_chid = smcd_get_chid,
.get_dev = smcd_get_dev,
};
/*
 * ism_get_smcd_ops() - return a pointer to the ISM SMC-D operations table.
 *
 * Exported (GPL-only) so that code outside this file can obtain the table.
 */
const struct smcd_ops *ism_get_smcd_ops(void)
{
return &ism_ops;
}
EXPORT_SYMBOL_GPL(ism_get_smcd_ops);
#endif