aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c46
1 files changed, 37 insertions, 9 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 404483437bd3..cef94e2169fe 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -198,9 +198,6 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
return 0;
}
-static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
- struct ras_common_if *head);
-
/**
* DOC: AMDGPU RAS debugfs control interface
*
@@ -318,7 +315,7 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *
default:
ret = -EINVAL;
break;
- };
+ }
if (ret)
return -EINVAL;
@@ -445,7 +442,7 @@ static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev,
}
/* return an obj equal to head, or the first when head is NULL */
-static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
+struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
struct ras_common_if *head)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
@@ -689,6 +686,7 @@ int amdgpu_ras_error_query(struct amdgpu_device *adev,
{
struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
struct ras_err_data err_data = {0, 0, 0, NULL};
+ int i;
if (!obj)
return -EINVAL;
@@ -703,6 +701,13 @@ int amdgpu_ras_error_query(struct amdgpu_device *adev,
if (adev->umc.funcs->query_ras_error_address)
adev->umc.funcs->query_ras_error_address(adev, &err_data);
break;
+ case AMDGPU_RAS_BLOCK__SDMA:
+ if (adev->sdma.funcs->query_ras_error_count) {
+ for (i = 0; i < adev->sdma.num_instances; i++)
+ adev->sdma.funcs->query_ras_error_count(adev, i,
+ &err_data);
+ }
+ break;
case AMDGPU_RAS_BLOCK__GFX:
if (adev->gfx.funcs->query_ras_error_count)
adev->gfx.funcs->query_ras_error_count(adev, &err_data);
@@ -737,6 +742,20 @@ int amdgpu_ras_error_query(struct amdgpu_device *adev,
return 0;
}
+uint64_t get_xgmi_relative_phy_addr(struct amdgpu_device *adev, uint64_t addr)
+{
+ uint32_t df_inst_id;
+
+ if ((!adev->df.funcs) ||
+ (!adev->df.funcs->get_df_inst_id) ||
+ (!adev->df.funcs->get_dram_base_addr))
+ return addr;
+
+ df_inst_id = adev->df.funcs->get_df_inst_id(adev);
+
+ return addr + adev->df.funcs->get_dram_base_addr(adev, df_inst_id);
+}
+
/* wrapper of psp_ras_trigger_error */
int amdgpu_ras_error_inject(struct amdgpu_device *adev,
struct ras_inject_if *info)
@@ -754,6 +773,12 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
if (!obj)
return -EINVAL;
+ /* Calculate XGMI relative offset */
+ if (adev->gmc.xgmi.num_physical_nodes > 1) {
+ block_info.address = get_xgmi_relative_phy_addr(adev,
+ block_info.address);
+ }
+
switch (info->head.block) {
case AMDGPU_RAS_BLOCK__GFX:
if (adev->gfx.funcs->ras_error_inject)
@@ -1314,6 +1339,7 @@ static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
data = con->eh_data;
if (!data || data->count == 0) {
*bps = NULL;
+ ret = -EINVAL;
goto out;
}
@@ -1347,7 +1373,8 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
struct amdgpu_ras *ras =
container_of(work, struct amdgpu_ras, recovery_work);
- amdgpu_device_gpu_recover(ras->adev, 0);
+ if (amdgpu_device_should_recover_gpu(ras->adev))
+ amdgpu_device_gpu_recover(ras->adev, 0);
atomic_set(&ras->in_recovery, 0);
}
@@ -1687,7 +1714,8 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev,
*supported = 0;
if (amdgpu_sriov_vf(adev) ||
- adev->asic_type != CHIP_VEGA20)
+ (adev->asic_type != CHIP_VEGA20 &&
+ adev->asic_type != CHIP_ARCTURUS))
return;
if (adev->is_atom_fw &&
@@ -1872,7 +1900,7 @@ void amdgpu_ras_resume(struct amdgpu_device *adev)
* See feature_enable_on_boot
*/
amdgpu_ras_disable_all_features(adev, 1);
- amdgpu_ras_reset_gpu(adev, 0);
+ amdgpu_ras_reset_gpu(adev);
}
}
@@ -1935,6 +1963,6 @@ void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {
DRM_WARN("RAS event of type ERREVENT_ATHUB_INTERRUPT detected!\n");
- amdgpu_ras_reset_gpu(adev, false);
+ amdgpu_ras_reset_gpu(adev);
}
}