linux/drivers/accel/amdxdna/aie2_error.c
Lizhi Hou 412576293c accel/amdxdna: Remove casting mailbox payload pointer
The mailbox payload pointer is void __iomem *. Casting it to u32 * is
incorrect and causes a sparse warning:
  cast removes address space '__iomem' of expression

Fixes: b87f920b93 ("accel/amdxdna: Support hardware mailbox")
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202501130921.ktqwsMLH-lkp@intel.com/
Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20250113182617.1256094-1-lizhi.hou@amd.com
2025-01-13 14:21:39 -06:00
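
A minimal sketch of the pattern the fix targets (the payload variable is a
placeholder, not the exact driver code): dereferencing the payload through a
plain pointer cast drops the __iomem address space,

	u32 status = *(u32 *)payload;  /* sparse: cast removes __iomem */

while reading it through the MMIO accessor keeps the annotation intact:

	u32 status = readl(payload);   /* payload is void __iomem * */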

// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2023-2024, Advanced Micro Devices, Inc.
 */

#include <drm/drm_cache.h>
#include <drm/drm_device.h>
#include <drm/drm_print.h>
#include <drm/gpu_scheduler.h>
#include <linux/dma-mapping.h>
#include <linux/kthread.h>
#include <linux/kernel.h>
#include "aie2_msg_priv.h"
#include "aie2_pci.h"
#include "amdxdna_mailbox.h"
#include "amdxdna_pci_drv.h"

struct async_event {
	struct amdxdna_dev_hdl		*ndev;
	struct async_event_msg_resp	resp;
	struct workqueue_struct		*wq;
	struct work_struct		work;
	u8				*buf;
	dma_addr_t			addr;
	u32				size;
};

struct async_events {
	struct workqueue_struct		*wq;
	u8				*buf;
	dma_addr_t			addr;
	u32				size;
	u32				event_cnt;
	struct async_event		event[] __counted_by(event_cnt);
};

/*
 * The enum, structs and lookup tables below are ported from the XAIE util
 * header file.
 *
 * This data is defined by the AIE device and is used to decode error
 * messages from the device.
 */
enum aie_module_type {
	AIE_MEM_MOD = 0,
	AIE_CORE_MOD,
	AIE_PL_MOD,
};

enum aie_error_category {
	AIE_ERROR_SATURATION = 0,
	AIE_ERROR_FP,
	AIE_ERROR_STREAM,
	AIE_ERROR_ACCESS,
	AIE_ERROR_BUS,
	AIE_ERROR_INSTRUCTION,
	AIE_ERROR_ECC,
	AIE_ERROR_LOCK,
	AIE_ERROR_DMA,
	AIE_ERROR_MEM_PARITY,
	/* Unknown is not from XAIE, added for better categorization */
	AIE_ERROR_UNKNOWN,
};

/* Don't pack, unless the XAIE side changes */
struct aie_error {
	__u8 row;
	__u8 col;
	__u32 mod_type;
	__u8 event_id;
};

struct aie_err_info {
	u32 err_cnt;
	u32 ret_code;
	u32 rsvd;
	struct aie_error payload[] __counted_by(err_cnt);
};

struct aie_event_category {
	u8 event_id;
	enum aie_error_category category;
};

#define EVENT_CATEGORY(id, cat) { id, cat }
static const struct aie_event_category aie_ml_mem_event_cat[] = {
	EVENT_CATEGORY(88U, AIE_ERROR_ECC),
	EVENT_CATEGORY(90U, AIE_ERROR_ECC),
	EVENT_CATEGORY(91U, AIE_ERROR_MEM_PARITY),
	EVENT_CATEGORY(92U, AIE_ERROR_MEM_PARITY),
	EVENT_CATEGORY(93U, AIE_ERROR_MEM_PARITY),
	EVENT_CATEGORY(94U, AIE_ERROR_MEM_PARITY),
	EVENT_CATEGORY(95U, AIE_ERROR_MEM_PARITY),
	EVENT_CATEGORY(96U, AIE_ERROR_MEM_PARITY),
	EVENT_CATEGORY(97U, AIE_ERROR_DMA),
	EVENT_CATEGORY(98U, AIE_ERROR_DMA),
	EVENT_CATEGORY(99U, AIE_ERROR_DMA),
	EVENT_CATEGORY(100U, AIE_ERROR_DMA),
	EVENT_CATEGORY(101U, AIE_ERROR_LOCK),
};

static const struct aie_event_category aie_ml_core_event_cat[] = {
	EVENT_CATEGORY(55U, AIE_ERROR_ACCESS),
	EVENT_CATEGORY(56U, AIE_ERROR_STREAM),
	EVENT_CATEGORY(57U, AIE_ERROR_STREAM),
	EVENT_CATEGORY(58U, AIE_ERROR_BUS),
	EVENT_CATEGORY(59U, AIE_ERROR_INSTRUCTION),
	EVENT_CATEGORY(60U, AIE_ERROR_ACCESS),
	EVENT_CATEGORY(62U, AIE_ERROR_ECC),
	EVENT_CATEGORY(64U, AIE_ERROR_ECC),
	EVENT_CATEGORY(65U, AIE_ERROR_ACCESS),
	EVENT_CATEGORY(66U, AIE_ERROR_ACCESS),
	EVENT_CATEGORY(67U, AIE_ERROR_LOCK),
	EVENT_CATEGORY(70U, AIE_ERROR_INSTRUCTION),
	EVENT_CATEGORY(71U, AIE_ERROR_STREAM),
	EVENT_CATEGORY(72U, AIE_ERROR_BUS),
};

static const struct aie_event_category aie_ml_mem_tile_event_cat[] = {
	EVENT_CATEGORY(130U, AIE_ERROR_ECC),
	EVENT_CATEGORY(132U, AIE_ERROR_ECC),
	EVENT_CATEGORY(133U, AIE_ERROR_DMA),
	EVENT_CATEGORY(134U, AIE_ERROR_DMA),
	EVENT_CATEGORY(135U, AIE_ERROR_STREAM),
	EVENT_CATEGORY(136U, AIE_ERROR_STREAM),
	EVENT_CATEGORY(137U, AIE_ERROR_STREAM),
	EVENT_CATEGORY(138U, AIE_ERROR_BUS),
	EVENT_CATEGORY(139U, AIE_ERROR_LOCK),
};

static const struct aie_event_category aie_ml_shim_tile_event_cat[] = {
	EVENT_CATEGORY(64U, AIE_ERROR_BUS),
	EVENT_CATEGORY(65U, AIE_ERROR_STREAM),
	EVENT_CATEGORY(66U, AIE_ERROR_STREAM),
	EVENT_CATEGORY(67U, AIE_ERROR_BUS),
	EVENT_CATEGORY(68U, AIE_ERROR_BUS),
	EVENT_CATEGORY(69U, AIE_ERROR_BUS),
	EVENT_CATEGORY(70U, AIE_ERROR_BUS),
	EVENT_CATEGORY(71U, AIE_ERROR_BUS),
	EVENT_CATEGORY(72U, AIE_ERROR_DMA),
	EVENT_CATEGORY(73U, AIE_ERROR_DMA),
	EVENT_CATEGORY(74U, AIE_ERROR_LOCK),
};

static enum aie_error_category
aie_get_error_category(u8 row, u8 event_id, enum aie_module_type mod_type)
{
	const struct aie_event_category *lut;
	int num_entry;
	int i;

	switch (mod_type) {
	case AIE_PL_MOD:
		lut = aie_ml_shim_tile_event_cat;
		num_entry = ARRAY_SIZE(aie_ml_shim_tile_event_cat);
		break;
	case AIE_CORE_MOD:
		lut = aie_ml_core_event_cat;
		num_entry = ARRAY_SIZE(aie_ml_core_event_cat);
		break;
	case AIE_MEM_MOD:
		if (row == 1) {
			lut = aie_ml_mem_tile_event_cat;
			num_entry = ARRAY_SIZE(aie_ml_mem_tile_event_cat);
		} else {
			lut = aie_ml_mem_event_cat;
			num_entry = ARRAY_SIZE(aie_ml_mem_event_cat);
		}
		break;
	default:
		return AIE_ERROR_UNKNOWN;
	}

	for (i = 0; i < num_entry; i++) {
		if (event_id != lut[i].event_id)
			continue;

		return lut[i].category;
	}

	return AIE_ERROR_UNKNOWN;
}
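
/*
 * Walk the error records reported by the device and build a bitmap of the
 * columns that saw at least one error; bit N set means column N reported
 * an error. The worker treats a zero bitmap as a failure to locate the
 * error column.
 */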
static u32 aie2_error_backtrack(struct amdxdna_dev_hdl *ndev, void *err_info, u32 num_err)
{
	struct aie_error *errs = err_info;
	u32 err_col = 0; /* assume that AIE has less than 32 columns */
	int i;

	/* Get err column bitmap */
	for (i = 0; i < num_err; i++) {
		struct aie_error *err = &errs[i];
		enum aie_error_category cat;

		cat = aie_get_error_category(err->row, err->event_id, err->mod_type);
		XDNA_ERR(ndev->xdna, "Row: %d, Col: %d, module %d, event ID %d, category %d",
			 err->row, err->col, err->mod_type,
			 err->event_id, cat);

		if (err->col >= 32) {
			XDNA_WARN(ndev->xdna, "Invalid column number");
			break;
		}

		err_col |= (1 << err->col);
	}

	return err_col;
}
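
/*
 * Mailbox completion callback for the registered async error message. The
 * payload lives in device memory (void __iomem *), so it is read with
 * readl() instead of a plain pointer cast; see the commit message above.
 * The actual error decoding is deferred to the workqueue.
 */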
static int aie2_error_async_cb(void *handle, void __iomem *data, size_t size)
{
	struct async_event *e = handle;

	if (data) {
		e->resp.type = readl(data + offsetof(struct async_event_msg_resp, type));
		wmb(); /* Update status last, so no lock is needed here */
		e->resp.status = readl(data + offsetof(struct async_event_msg_resp, status));
	}
	queue_work(e->wq, &e->work);
	return 0;
}

static int aie2_error_event_send(struct async_event *e)
{
	drm_clflush_virt_range(e->buf, e->size); /* device can access */
	return aie2_register_asyn_event_msg(e->ndev, e->addr, e->size, e,
					    aie2_error_async_cb);
}
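
/*
 * Deferred handler for one async error event. e->resp.status doubles as a
 * "pending" flag: MAX_AIE2_STATUS_CODE means nothing new to process, and
 * the field is reset to that sentinel once the event has been consumed.
 * After decoding, the event is re-registered with the firmware so the next
 * error can be delivered.
 */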
static void aie2_error_worker(struct work_struct *err_work)
{
	struct aie_err_info *info;
	struct amdxdna_dev *xdna;
	struct async_event *e;
	u32 max_err;
	u32 err_col;

	e = container_of(err_work, struct async_event, work);
	xdna = e->ndev->xdna;

	if (e->resp.status == MAX_AIE2_STATUS_CODE)
		return;

	e->resp.status = MAX_AIE2_STATUS_CODE;

	print_hex_dump_debug("AIE error: ", DUMP_PREFIX_OFFSET, 16, 4,
			     e->buf, 0x100, false);

	info = (struct aie_err_info *)e->buf;
	XDNA_DBG(xdna, "Error count %d return code %d", info->err_cnt, info->ret_code);

	max_err = (e->size - sizeof(*info)) / sizeof(struct aie_error);
	if (unlikely(info->err_cnt > max_err)) {
		WARN_ONCE(1, "Error count too large %d\n", info->err_cnt);
		return;
	}

	err_col = aie2_error_backtrack(e->ndev, info->payload, info->err_cnt);
	if (!err_col) {
		XDNA_WARN(xdna, "Did not get error column");
		return;
	}

	mutex_lock(&xdna->dev_lock);
	/* Re-send this event to firmware */
	if (aie2_error_event_send(e))
		XDNA_WARN(xdna, "Unable to register async event");
	mutex_unlock(&xdna->dev_lock);
}

int aie2_error_async_events_send(struct amdxdna_dev_hdl *ndev)
{
	struct amdxdna_dev *xdna = ndev->xdna;
	struct async_event *e;
	int i, ret;

	drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock));
	for (i = 0; i < ndev->async_events->event_cnt; i++) {
		e = &ndev->async_events->event[i];
		ret = aie2_error_event_send(e);
		if (ret)
			return ret;
	}

	return 0;
}

void aie2_error_async_events_free(struct amdxdna_dev_hdl *ndev)
{
	struct amdxdna_dev *xdna = ndev->xdna;
	struct async_events *events;

	events = ndev->async_events;

	/*
	 * Drop dev_lock while draining the workqueue: the error worker takes
	 * dev_lock when re-arming an event, so flushing it with the lock held
	 * could deadlock.
	 */
	mutex_unlock(&xdna->dev_lock);
	destroy_workqueue(events->wq);
	mutex_lock(&xdna->dev_lock);

	dma_free_noncoherent(xdna->ddev.dev, events->size, events->buf,
			     events->addr, DMA_FROM_DEVICE);
	kfree(events);
}
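
/*
 * Carve one ASYNC_BUF_SIZE slot per device column out of a single DMA-able
 * buffer, back each slot with an async_event, and create the ordered
 * workqueue that serializes handling of all events.
 */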
int aie2_error_async_events_alloc(struct amdxdna_dev_hdl *ndev)
{
	struct amdxdna_dev *xdna = ndev->xdna;
	u32 total_col = ndev->total_col;
	u32 total_size = ASYNC_BUF_SIZE * total_col;
	struct async_events *events;
	int i, ret;

	events = kzalloc(struct_size(events, event, total_col), GFP_KERNEL);
	if (!events)
		return -ENOMEM;

	events->buf = dma_alloc_noncoherent(xdna->ddev.dev, total_size, &events->addr,
					    DMA_FROM_DEVICE, GFP_KERNEL);
	if (!events->buf) {
		ret = -ENOMEM;
		goto free_events;
	}
	events->size = total_size;
	events->event_cnt = total_col;

	events->wq = alloc_ordered_workqueue("async_wq", 0);
	if (!events->wq) {
		ret = -ENOMEM;
		goto free_buf;
	}

	for (i = 0; i < events->event_cnt; i++) {
		struct async_event *e = &events->event[i];
		u32 offset = i * ASYNC_BUF_SIZE;

		e->ndev = ndev;
		e->wq = events->wq;
		e->buf = &events->buf[offset];
		e->addr = events->addr + offset;
		e->size = ASYNC_BUF_SIZE;
		e->resp.status = MAX_AIE2_STATUS_CODE;
		INIT_WORK(&e->work, aie2_error_worker);
	}

	ndev->async_events = events;

	XDNA_DBG(xdna, "Async event count %d, buf total size 0x%x",
		 events->event_cnt, events->size);
	return 0;

free_buf:
	dma_free_noncoherent(xdna->ddev.dev, events->size, events->buf,
			     events->addr, DMA_FROM_DEVICE);
free_events:
	kfree(events);
	return ret;
}