linux/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h

/*
 * Copyright 2016 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
#ifndef __AMDGPU_XGMI_H__
#define __AMDGPU_XGMI_H__

#include <drm/task_barrier.h>
#include "amdgpu_psp.h"


/*
 * Per-hive XGMI bookkeeping.
 *
 * NOTE(review): the member notes below are inferred from the member names
 * and types visible in this header only; confirm against amdgpu_xgmi.c
 * before relying on them.
 */
struct amdgpu_hive_info {
	/* 64-bit hive identifier; presumably the same value devices carry in
	 * adev->gmc.xgmi.hive_id -- TODO confirm */
	uint64_t		hive_id;
	/* list head; presumably links the member devices -- TODO confirm */
	struct list_head	device_list;
	/* presumably the number of entries on device_list -- TODO confirm */
	int number_devices;
	/* per-hive mutex; what exactly it protects is not visible here */
	struct mutex hive_lock;
	/* atomic flag/counter; presumably non-zero while the hive is being
	 * reset -- TODO confirm */
	atomic_t in_reset;
	/* kobject and device attribute; presumably back the hive's sysfs
	 * entries -- TODO confirm */
	struct kobject *kobj;
	struct device_attribute dev_attr;
	/* NOTE(review): which device of the hive this points to is not
	 * visible here */
	struct amdgpu_device *adev;
	/* presumably the count of outstanding high-pstate requests and the
	 * device that issued one -- TODO confirm */
	int hi_req_count;
	struct amdgpu_device *hi_req_gpu;
	/* task barrier (see <drm/task_barrier.h>); usage is not visible here */
	struct task_barrier tb;
	/* pstate recorded for the hive; one of the three values below */
	enum {
		AMDGPU_XGMI_PSTATE_MIN,
		AMDGPU_XGMI_PSTATE_MAX_VEGA20,
		AMDGPU_XGMI_PSTATE_UNKNOWN
	} pstate;
};

/*
 * Describes one PCS RAS error field: a printable name plus a mask/shift pair.
 *
 * NOTE(review): presumably the field is extracted from a status register
 * value as (value & pcs_err_mask) >> pcs_err_shift -- confirm against the
 * users of this struct.
 */
struct amdgpu_pcs_ras_field {
	const char *err_name;	/* name of the error */
	uint32_t pcs_err_mask;	/* bit mask of the field */
	uint32_t pcs_err_shift;	/* bit position of the field */
};

/*
 * XGMI entry points. Only the prototypes are visible in this header, so the
 * notes below are inferred from names and signatures.
 * NOTE(review): confirm return conventions (presumably 0 on success and a
 * negative error code on failure for the int-returning functions) against
 * the implementations.
 */

/* Returns the hive associated with @adev.
 * NOTE(review): the meaning of @lock is not visible here -- presumably
 * whether the hive lock is held on return; confirm. */
struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock);
/* Updates XGMI topology for @adev within @hive. */
int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev);
/* Add @adev to / remove @adev from XGMI handling. */
int amdgpu_xgmi_add_device(struct amdgpu_device *adev);
int amdgpu_xgmi_remove_device(struct amdgpu_device *adev);
/* Requests XGMI pstate @pstate on behalf of @adev.
 * NOTE(review): valid @pstate values are not visible here. */
int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate);
/* Returns the hop count between @adev and @peer_adev. */
int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev,
		struct amdgpu_device *peer_adev);
/* RAS late-init / teardown for the XGMI block of @adev. */
int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev);
void amdgpu_xgmi_ras_fini(struct amdgpu_device *adev);
/* Translates @addr into an address relative to @adev.
 * NOTE(review): the exact address spaces involved are not visible here. */
uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev,
					   uint64_t addr);
/* Queries the XGMI RAS error count for @adev.
 * NOTE(review): the concrete type behind @ras_error_status is not visible
 * here. */
int amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
				      void *ras_error_status);
/* Resets the XGMI RAS error count for @adev. */
void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev);

/**
 * amdgpu_xgmi_same_hive - test whether two distinct devices share a hive id
 * @adev: first device
 * @bo_adev: second device
 *
 * Return: true only when @adev and @bo_adev are different devices, @adev has
 * a non-zero gmc.xgmi.hive_id, and both devices carry the same hive_id;
 * false otherwise. Neither pointer is checked for NULL.
 */
static inline bool amdgpu_xgmi_same_hive(struct amdgpu_device *adev,
		struct amdgpu_device *bo_adev)
{
	/* A device is never reported as sharing a hive with itself. */
	if (adev == bo_adev)
		return false;

	/* A zero hive_id never matches (presumably "not in a hive"). */
	if (!adev->gmc.xgmi.hive_id)
		return false;

	return adev->gmc.xgmi.hive_id == bo_adev->gmc.xgmi.hive_id;
}

#endif
drm/amdgpu: Refactor amdgpu_xgmi_add_device This is prep work for updating each PSP FW in hive after GPU reset. Split into build topology SW state and update each PSP FW in the hive. Save topology and count of XGMI devices for reuse. v2: Create seperate header for XGMI. Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> Acked-by: Christian König <christian.koenig@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> 2018-11-12 16:16:03 -05:00			`/*`
			`* Copyright 2016 Advanced Micro Devices, Inc.`
			`*`
			`* Permission is hereby granted, free of charge, to any person obtaining a`
			`* copy of this software and associated documentation files (the "Software"),`
			`* to deal in the Software without restriction, including without limitation`
			`* the rights to use, copy, modify, merge, publish, distribute, sublicense,`
			`* and/or sell copies of the Software, and to permit persons to whom the`
			`* Software is furnished to do so, subject to the following conditions:`
			`*`
			`* The above copyright notice and this permission notice shall be included in`
			`* all copies or substantial portions of the Software.`
			`*`
			`* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR`
			`* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,`
			`* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL`
			`* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR`
			`* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,`
			`* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR`
			`* OTHER DEALINGS IN THE SOFTWARE.`
			`*/`
			`#ifndef __AMDGPU_XGMI_H__`
			`#define __AMDGPU_XGMI_H__`

drm/amdgpu: Add task barrier to XGMI hive. Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> Reviewed-by: Le Ma <Le.Ma@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> 2019-12-06 12:43:30 -05:00			`#include <drm/task_barrier.h>`
drm/amdgpu: Expose hive adev list and xgmi_mutex It's needed for device reset of entire hive. v3: Add per hive lock to allow avoiding duplicate resets triggered by multiple members of same hive. Expose amdgpu_hive_info instead of adding getter functions. Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> Reviewed-by: Alex Deucher <alexander.deucher@amd.com> Acked-by: Christian König <christian.koenig@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> 2018-11-14 15:50:05 -05:00			`#include "amdgpu_psp.h"`
drm/amdgpu: Refactor amdgpu_xgmi_add_device This is prep work for updating each PSP FW in hive after GPU reset. Split into build topology SW state and update each PSP FW in the hive. Save topology and count of XGMI devices for reuse. v2: Create seperate header for XGMI. Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> Acked-by: Christian König <christian.koenig@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> 2018-11-12 16:16:03 -05:00
drm/amdgpu: fix race between pstate and remote buffer map Vega20 arbitrates pstate at hive level and not device level. Last peer to remote buffer unmap could drop P-State while another process is still remote buffer mapped. With this fix, P-States still needs to be disabled for now as SMU bug was discovered on synchronous P2P transfers. This should be fixed in the next FW update. Signed-off-by: Jonathan Kim <Jonathan.Kim@amd.com> Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> 2020-03-17 15:43:41 -04:00
drm/amdgpu: Expose hive adev list and xgmi_mutex It's needed for device reset of entire hive. v3: Add per hive lock to allow avoiding duplicate resets triggered by multiple members of same hive. Expose amdgpu_hive_info instead of adding getter functions. Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> Reviewed-by: Alex Deucher <alexander.deucher@amd.com> Acked-by: Christian König <christian.koenig@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> 2018-11-14 15:50:05 -05:00			`struct amdgpu_hive_info {`
			`uint64_t hive_id;`
			`struct list_head device_list;`
			`int number_devices;`
drm/amdgpu: fix system hang issue during GPU reset when GPU hang, driver has multi-paths to enter amdgpu_device_gpu_recover, the atomic adev->in_gpu_reset and hive->in_reset are used to avoid re-entering GPU recovery. During GPU reset and resume, it is unsafe that other threads access GPU, which maybe cause GPU reset failed. Therefore the new rw_semaphore adev->reset_sem is introduced, which protect GPU from being accessed by external threads during recovery. v2: 1. add rwlock for some ioctls, debugfs and file-close function. 2. change to use dqm->is_resetting and dqm_lock for protection in kfd driver. 3. remove try_lock and change adev->in_gpu_reset as atomic, to avoid re-enter GPU recovery for the same GPU hang. v3: 1. change back to use adev->reset_sem to protect kfd callback functions, because dqm_lock couldn't protect all codes, for example: free_mqd must be called outside of dqm_lock; [ 1230.176199] Hardware name: Supermicro SYS-7049GP-TRT/X11DPG-QT, BIOS 3.1 05/23/2019 [ 1230.177221] Call Trace: [ 1230.178249] dump_stack+0x98/0xd5 [ 1230.179443] amdgpu_virt_kiq_reg_write_reg_wait+0x181/0x190 [amdgpu] [ 1230.180673] gmc_v9_0_flush_gpu_tlb+0xcc/0x310 [amdgpu] [ 1230.181882] amdgpu_gart_unbind+0xa9/0xe0 [amdgpu] [ 1230.183098] amdgpu_ttm_backend_unbind+0x46/0x180 [amdgpu] [ 1230.184239] ? ttm_bo_put+0x171/0x5f0 [ttm] [ 1230.185394] ttm_tt_unbind+0x21/0x40 [ttm] [ 1230.186558] ttm_tt_destroy.part.12+0x12/0x60 [ttm] [ 1230.187707] ttm_tt_destroy+0x13/0x20 [ttm] [ 1230.188832] ttm_bo_cleanup_memtype_use+0x36/0x80 [ttm] [ 1230.189979] ttm_bo_put+0x1be/0x5f0 [ttm] [ 1230.191230] amdgpu_bo_unref+0x1e/0x30 [amdgpu] [ 1230.192522] amdgpu_amdkfd_free_gtt_mem+0xaf/0x140 [amdgpu] [ 1230.193833] free_mqd+0x25/0x40 [amdgpu] [ 1230.195143] destroy_queue_cpsch+0x1a7/0x270 [amdgpu] [ 1230.196475] pqm_destroy_queue+0x105/0x260 [amdgpu] [ 1230.197819] kfd_ioctl_destroy_queue+0x37/0x70 [amdgpu] [ 1230.199154] kfd_ioctl+0x277/0x500 [amdgpu] [ 1230.200458] ? 
kfd_ioctl_get_clock_counters+0x60/0x60 [amdgpu] [ 1230.201656] ? tomoyo_file_ioctl+0x19/0x20 [ 1230.202831] ksys_ioctl+0x98/0xb0 [ 1230.204004] __x64_sys_ioctl+0x1a/0x20 [ 1230.205174] do_syscall_64+0x5f/0x250 [ 1230.206339] entry_SYSCALL_64_after_hwframe+0x49/0xbe 2. remove try_lock and introduce atomic hive->in_reset, to avoid re-enter GPU recovery. v4: 1. remove an unnecessary whitespace change in kfd_chardev.c 2. remove comment codes in amdgpu_device.c 3. add more detailed comment in commit message 4. define a wrap function amdgpu_in_reset v5: 1. Fix some style issues. Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Suggested-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> Suggested-by: Christian König <christian.koenig@amd.com> Suggested-by: Felix Kuehling <Felix.Kuehling@amd.com> Suggested-by: Lijo Lazar <Lijo.Lazar@amd.com> Suggested-by: Luben Tukov <luben.tuikov@amd.com> Signed-off-by: Dennis Li <Dennis.Li@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> 2020-07-08 15:07:13 +08:00			`struct mutex hive_lock;`
			`atomic_t in_reset;`
drm/amdgpu: Add sysfs entries for xgmi hive v2. For each device a file xgmi_device_id is created. On the first device a subdirectory named xgmi_hive_info is created, It contains a file named hive_id and symlinks named node 1-4 linking to each device in the hive. v2: Return error codes instead of '-1' and few misspellings. Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> Reviewed-by: Christian König <christian.koenig@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> 2019-03-05 10:39:08 -05:00			`struct kobject *kobj;`
			`struct device_attribute dev_attr;`
			`struct amdgpu_device *adev;`
drm/amdgpu: fix race between pstate and remote buffer map Vega20 arbitrates pstate at hive level and not device level. Last peer to remote buffer unmap could drop P-State while another process is still remote buffer mapped. With this fix, P-States still needs to be disabled for now as SMU bug was discovered on synchronous P2P transfers. This should be fixed in the next FW update. Signed-off-by: Jonathan Kim <Jonathan.Kim@amd.com> Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> 2020-03-17 15:43:41 -04:00			`int hi_req_count;`
			`struct amdgpu_device *hi_req_gpu;`
drm/amdgpu: Add task barrier to XGMI hive. Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> Reviewed-by: Le Ma <Le.Ma@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> 2019-12-06 12:43:30 -05:00			`struct task_barrier tb;`
drm/amdgpu: fix race between pstate and remote buffer map Vega20 arbitrates pstate at hive level and not device level. Last peer to remote buffer unmap could drop P-State while another process is still remote buffer mapped. With this fix, P-States still needs to be disabled for now as SMU bug was discovered on synchronous P2P transfers. This should be fixed in the next FW update. Signed-off-by: Jonathan Kim <Jonathan.Kim@amd.com> Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> 2020-03-17 15:43:41 -04:00			`enum {`
			`AMDGPU_XGMI_PSTATE_MIN,`
			`AMDGPU_XGMI_PSTATE_MAX_VEGA20,`
			`AMDGPU_XGMI_PSTATE_UNKNOWN`
			`} pstate;`
drm/amdgpu: Expose hive adev list and xgmi_mutex It's needed for device reset of entire hive. v3: Add per hive lock to allow avoiding duplicate resets triggered by multiple members of same hive. Expose amdgpu_hive_info instead of adding getter functions. Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> Reviewed-by: Alex Deucher <alexander.deucher@amd.com> Acked-by: Christian König <christian.koenig@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> 2018-11-14 15:50:05 -05:00			`};`
drm/amdgpu: Refactor amdgpu_xgmi_add_device This is prep work for updating each PSP FW in hive after GPU reset. Split into build topology SW state and update each PSP FW in the hive. Save topology and count of XGMI devices for reuse. v2: Create seperate header for XGMI. Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> Acked-by: Christian König <christian.koenig@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> 2018-11-12 16:16:03 -05:00
drm/amdgpu: add helper funcs to detect PCS error Since from vega20, hardware supports run-time detect and report XGMI/WAFL PCS ras error. Add helper functions to walkthrough every type of ras error and report it if any. Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com> Reviewed-by: Guchun Chen <guchun.chen@amd.com> Reviewed-by: Tao Zhou <tao.zhou1@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> 2020-02-21 21:43:44 +08:00			`struct amdgpu_pcs_ras_field {`
			`const char *err_name;`
			`uint32_t pcs_err_mask;`
			`uint32_t pcs_err_shift;`
			`};`

drm/amd/amdgpu: add missing mutex lock to amdgpu_get_xgmi_hive() (v3) v2: Move locks around in other functions so that this function can stand on its own. Also only hold the hive specific lock for add/remove device instead of the driver global lock so you can't add/remove devices in parallel from one hive. v3: add reset_lock Acked-by: Shaoyun.liu < Shaoyun.liu@amd.com> Signed-off-by: Tom St Denis <tom.stdenis@amd.com> Reviewed-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> 2019-01-07 17:39:10 -05:00			`struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock);`
drm/amdgpu: Refactor amdgpu_xgmi_add_device This is prep work for updating each PSP FW in hive after GPU reset. Split into build topology SW state and update each PSP FW in the hive. Save topology and count of XGMI devices for reuse. v2: Create seperate header for XGMI. Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> Acked-by: Christian König <christian.koenig@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> 2018-11-12 16:16:03 -05:00			`int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev);`
			`int amdgpu_xgmi_add_device(struct amdgpu_device *adev);`
drm/amdgpu: move xgmi init/fini to xgmi_add/remove_device call (v2) For sriov, psp ip block has to be initialized before ih block for the dynamic register programming interface that needed for vf ih ring buffer. On the other hand, current psp ip block hw_init function will initialize xgmi session which actaully depends on interrupt to return session context. This results an empty xgmi ta session id and later failures on all the xgmi ta cmd invoked from vf. xgmi ta session initialization has to be done after ih ip block hw_init call. to unify xgmi session init/fini for both bare-metal sriov virtualization use scenario, move xgmi ta init to xgmi_add_device call, and accordingly terminate xgmi ta session in xgmi_remove_device call. The existing suspend/resume sequence will not be changed. v2: squash in return fix from Nirmoy Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com> Reviewed-by: Frank Min <Frank.Min@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> 2019-12-23 16:51:42 +08:00			`int amdgpu_xgmi_remove_device(struct amdgpu_device *adev);`
drm/amdgpu: XGMI pstate switch initial support Driver vote low to high pstate switch whenever there is an outstanding XGMI mapping request. Driver vote high to low pstate when all the outstanding XGMI mapping is terminated. Signed-off-by: shaoyunl <shaoyun.liu@amd.com> Reviewed-by: Christian König <christian.koenig@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> 2019-03-20 16:14:56 -04:00			`int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate);`
drm/amdgpu: Implement get num of hops between two xgmi device KFD need to provide the info for upper level to determine the data path Signed-off-by: shaoyunl <shaoyun.liu@amd.com> Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> 2019-04-17 14:28:18 -04:00			`int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev,`
			`struct amdgpu_device *peer_adev);`
drm/amdgpu: initialize ras structures for xgmi block (v2) init ras common interface and fs node for xgmi block v2: remove unnecessary physical node number check before invoking amdgpu_xgmi_ras_late_init Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com> Reviewed-by: Guchun Chen <guchun.chen@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> 2019-09-10 11:13:39 +08:00			`int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev);`
drm/amdgpu: move xgmi ras fini to xgmi block it's more suitable to put xgmi ras fini in xgmi block Signed-off-by: Tao Zhou <tao.zhou1@amd.com> Reviewed-by: Guchun Chen <guchun.chen@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> 2019-09-18 17:58:14 +08:00			`void amdgpu_xgmi_ras_fini(struct amdgpu_device *adev);`
drm/amdgpu: move get_xgmi_relative_phy_addr to amdgpu_xgmi.c centralize all the xgmi related function to amdgpu_xgmi.c Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com> Acked-by: Evan Quan <evan.quan@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> 2020-02-24 15:36:13 +08:00			`uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev,`
			`uint64_t addr);`
drm/amdgpu: add helper funcs to detect PCS error Since from vega20, hardware supports run-time detect and report XGMI/WAFL PCS ras error. Add helper functions to walkthrough every type of ras error and report it if any. Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com> Reviewed-by: Guchun Chen <guchun.chen@amd.com> Reviewed-by: Tao Zhou <tao.zhou1@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> 2020-02-21 21:43:44 +08:00			`int amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,`
			`void *ras_error_status);`
drm/amdgpu: added xgmi ras error reset sequence added mechanism to clear xgmi ras status inbetween error queries Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Signed-off-by: John Clements <john.clements@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> 2020-03-25 15:56:31 +08:00			`void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev);`
drm/amdgpu: XGMI pstate switch initial support Driver vote low to high pstate switch whenever there is an outstanding XGMI mapping request. Driver vote high to low pstate when all the outstanding XGMI mapping is terminated. Signed-off-by: shaoyunl <shaoyun.liu@amd.com> Reviewed-by: Christian König <christian.koenig@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> 2019-03-20 16:14:56 -04:00
			`static inline bool amdgpu_xgmi_same_hive(struct amdgpu_device *adev,`
			`struct amdgpu_device *bo_adev)`
			`{`
			`return (adev != bo_adev &&`
			`adev->gmc.xgmi.hive_id &&`
			`adev->gmc.xgmi.hive_id == bo_adev->gmc.xgmi.hive_id);`
			`}`
drm/amdgpu: Refactor amdgpu_xgmi_add_device This is prep work for updating each PSP FW in hive after GPU reset. Split into build topology SW state and update each PSP FW in the hive. Save topology and count of XGMI devices for reuse. v2: Create seperate header for XGMI. Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> Acked-by: Christian König <christian.koenig@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> 2018-11-12 16:16:03 -05:00
			`#endif`