aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/gpu/drm/amd/amdgpu
diff options
context:
space:
mode:
authorDave Airlie <airlied@redhat.com>2022-02-25 05:30:17 +1000
committerDave Airlie <airlied@redhat.com>2022-02-25 05:50:18 +1000
commit54f43c17d681f6d9523fcfaeefc9df77993802e1 (patch)
treea1b3807e26d05b9b17aea43b7725caf3757522ca /drivers/gpu/drm/amd/amdgpu
parentMerge tag 'drm-intel-next-2022-02-23' of git://anongit.freedesktop.org/drm/drm-intel into drm-next (diff)
parentdrm/selftests: add drm buddy pathological testcase (diff)
downloadlinux-dev-54f43c17d681f6d9523fcfaeefc9df77993802e1.tar.xz
linux-dev-54f43c17d681f6d9523fcfaeefc9df77993802e1.zip
Merge tag 'drm-misc-next-2022-02-23' of git://anongit.freedesktop.org/drm/drm-misc into drm-next
drm-misc-next for v5.18: UAPI Changes: Cross-subsystem Changes: - Split out panel-lvds and lvds dt bindings . - Put yes/no on/off disabled/enabled strings in linux/string_helpers.h and use it in drivers and tomoyo. - Clarify dma_fence_chain and dma_fence_array should never include eachother. - Flatten chains in syncobj's. - Don't double add in fbdev/defio when page is already enlisted. - Don't sort deferred-I/O pages by default in fbdev. Core Changes: - Fix missing pm_runtime_put_sync in bridge. - Set modifier support to only linear fb modifier if drivers don't advertise support. - As a result, we remove allow_fb_modifiers. - Add missing clear for EDID Deep Color Modes in drm_reset_display_info. - Assorted documentation updates. - Warn once in drm_clflush if there is no arch support. - Add missing select for dp helper in drm_panel_edp. - Assorted small fixes. - Improve fb-helper's clipping handling. - Don't dump shmem mmaps in a core dump. - Add accounting to ttm resource manager, and use it in amdgpu. - Allow querying the detected eDP panel through debugfs. - Add helpers for xrgb8888 to 8 and 1 bits gray. - Improve drm's buddy allocator. - Add selftests for the buddy allocator. Driver Changes: - Add support for nomodeset to a lot of drm drivers. - Use drm_module_*_driver in a lot of drm drivers. - Assorted small fixes to bridge/lt9611, v3d, vc4, vmwgfx, mxsfb, nouveau, bridge/dw-hdmi, panfrost, lima, ingenic, sprd, bridge/anx7625, ti-sn65dsi86. - Add bridge/it6505. - Create DP and DVI-I connectors in ast. - Assorted nouveau backlight fixes. - Rework amdgpu reset handling. - Add dt bindings for ingenic,jz4780-dw-hdmi. - Support reading edid through aux channel in ingenic. - Add a drm driver for Solomon SSD130x OLED displays. - Add simple support for sharp LQ140M1JW46. - Add more panels to nt35560. Signed-off-by: Dave Airlie <airlied@redhat.com> From: Maarten Lankhorst <maarten.lankhorst@linux.intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/686ec871-e77f-c230-22e5-9e3bb80f064a@linux.intel.com
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu.h14
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c2
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c10
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_device.c274
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_display.c6
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c43
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c49
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_job.c3
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c14
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_object.c4
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_preempt_mgr.c62
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c18
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c56
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h40
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c5
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h6
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c5
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c2
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h11
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c6
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c58
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c27
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h3
-rw-r--r--drivers/gpu/drm/amd/amdgpu/atom.c4
-rw-r--r--drivers/gpu/drm/amd/amdgpu/dce_v10_0.c2
-rw-r--r--drivers/gpu/drm/amd/amdgpu/dce_v11_0.c2
-rw-r--r--drivers/gpu/drm/amd/amdgpu/dce_v6_0.c1
-rw-r--r--drivers/gpu/drm/amd/amdgpu/dce_v8_0.c2
-rw-r--r--drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c6
-rw-r--r--drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c14
-rw-r--r--drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c19
-rw-r--r--drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c19
-rw-r--r--drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c11
33 files changed, 420 insertions, 378 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 566303c9942f..8685784f24a9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -815,6 +815,8 @@ struct ip_discovery_top;
#define AMDGPU_RESET_MAGIC_NUM 64
#define AMDGPU_MAX_DF_PERFMONS 4
#define AMDGPU_PRODUCT_NAME_LEN 64
+struct amdgpu_reset_domain;
+
struct amdgpu_device {
struct device *dev;
struct pci_dev *pdev;
@@ -1050,9 +1052,7 @@ struct amdgpu_device {
bool in_s4;
bool in_s0ix;
- atomic_t in_gpu_reset;
enum pp_mp1_state mp1_state;
- struct rw_semaphore reset_sem;
struct amdgpu_doorbell_index doorbell_index;
struct mutex notifier_lock;
@@ -1100,6 +1100,8 @@ struct amdgpu_device {
struct list_head ras_list;
struct ip_discovery_top *ip_top;
+
+ struct amdgpu_reset_domain *reset_domain;
};
static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)
@@ -1293,6 +1295,8 @@ bool amdgpu_device_has_job_running(struct amdgpu_device *adev);
bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev);
int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
struct amdgpu_job* job);
+int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
+ struct amdgpu_job *job);
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev);
int amdgpu_device_pci_reset(struct amdgpu_device *adev);
bool amdgpu_device_need_post(struct amdgpu_device *adev);
@@ -1479,8 +1483,6 @@ static inline bool amdgpu_is_tmz(struct amdgpu_device *adev)
return adev->gmc.tmz_enabled;
}
-static inline int amdgpu_in_reset(struct amdgpu_device *adev)
-{
- return atomic_read(&adev->in_gpu_reset);
-}
+int amdgpu_in_reset(struct amdgpu_device *adev);
+
#endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index 10b9e99c8941..e762e45b7b85 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -312,7 +312,7 @@ static void amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev,
}
total_vram = adev->gmc.real_vram_size - atomic64_read(&adev->vram_pin_size);
- used_vram = amdgpu_vram_mgr_usage(&adev->mman.vram_mgr);
+ used_vram = ttm_resource_manager_usage(&adev->mman.vram_mgr.manager);
free_vram = used_vram >= total_vram ? 0 : total_vram - used_vram;
spin_lock(&adev->mm_stats.lock);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index db8a8710333e..ae0c83237608 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -37,6 +37,8 @@
#include "amdgpu_fw_attestation.h"
#include "amdgpu_umr.h"
+#include "amdgpu_reset.h"
+
#if defined(CONFIG_DEBUG_FS)
/**
@@ -1284,7 +1286,7 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file *m, void *unused)
}
/* Avoid accidently unparking the sched thread during GPU reset */
- r = down_write_killable(&adev->reset_sem);
+ r = down_write_killable(&adev->reset_domain->sem);
if (r)
return r;
@@ -1313,7 +1315,7 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file *m, void *unused)
kthread_unpark(ring->sched.thread);
}
- up_write(&adev->reset_sem);
+ up_write(&adev->reset_domain->sem);
pm_runtime_mark_last_busy(dev->dev);
pm_runtime_put_autosuspend(dev->dev);
@@ -1522,7 +1524,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
return -ENOMEM;
/* Avoid accidently unparking the sched thread during GPU reset */
- r = down_read_killable(&adev->reset_sem);
+ r = down_read_killable(&adev->reset_domain->sem);
if (r)
goto pro_end;
@@ -1565,7 +1567,7 @@ failure:
/* restart the scheduler */
kthread_unpark(ring->sched.thread);
- up_read(&adev->reset_sem);
+ up_read(&adev->reset_domain->sem);
ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 0cfccea722f8..ca8e76771ea1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -426,10 +426,10 @@ bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
* the lock.
*/
if (in_task()) {
- if (down_read_trylock(&adev->reset_sem))
- up_read(&adev->reset_sem);
+ if (down_read_trylock(&adev->reset_domain->sem))
+ up_read(&adev->reset_domain->sem);
else
- lockdep_assert_held(&adev->reset_sem);
+ lockdep_assert_held(&adev->reset_domain->sem);
}
#endif
return false;
@@ -455,9 +455,9 @@ uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
if ((reg * 4) < adev->rmmio_size) {
if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
amdgpu_sriov_runtime(adev) &&
- down_read_trylock(&adev->reset_sem)) {
+ down_read_trylock(&adev->reset_domain->sem)) {
ret = amdgpu_kiq_rreg(adev, reg);
- up_read(&adev->reset_sem);
+ up_read(&adev->reset_domain->sem);
} else {
ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
}
@@ -540,9 +540,9 @@ void amdgpu_device_wreg(struct amdgpu_device *adev,
if ((reg * 4) < adev->rmmio_size) {
if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
amdgpu_sriov_runtime(adev) &&
- down_read_trylock(&adev->reset_sem)) {
+ down_read_trylock(&adev->reset_domain->sem)) {
amdgpu_kiq_wreg(adev, reg, v);
- up_read(&adev->reset_sem);
+ up_read(&adev->reset_domain->sem);
} else {
writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
}
@@ -2331,6 +2331,49 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
return r;
}
+static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
+{
+ long timeout;
+ int r, i;
+
+ for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
+ struct amdgpu_ring *ring = adev->rings[i];
+
+ /* No need to setup the GPU scheduler for rings that don't need it */
+ if (!ring || ring->no_scheduler)
+ continue;
+
+ switch (ring->funcs->type) {
+ case AMDGPU_RING_TYPE_GFX:
+ timeout = adev->gfx_timeout;
+ break;
+ case AMDGPU_RING_TYPE_COMPUTE:
+ timeout = adev->compute_timeout;
+ break;
+ case AMDGPU_RING_TYPE_SDMA:
+ timeout = adev->sdma_timeout;
+ break;
+ default:
+ timeout = adev->video_timeout;
+ break;
+ }
+
+ r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
+ ring->num_hw_submission, amdgpu_job_hang_limit,
+ timeout, adev->reset_domain->wq,
+ ring->sched_score, ring->name,
+ adev->dev);
+ if (r) {
+ DRM_ERROR("Failed to create scheduler on ring %s.\n",
+ ring->name);
+ return r;
+ }
+ }
+
+ return 0;
+}
+
+
/**
* amdgpu_device_ip_init - run init for hardware IPs
*
@@ -2442,8 +2485,28 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
if (r)
goto init_failed;
- if (adev->gmc.xgmi.num_physical_nodes > 1)
- amdgpu_xgmi_add_device(adev);
+ /**
+ * In case of XGMI grab extra reference for reset domain for this device
+ */
+ if (adev->gmc.xgmi.num_physical_nodes > 1) {
+ if (amdgpu_xgmi_add_device(adev) == 0) {
+ struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
+
+ if (!hive->reset_domain ||
+ !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
+ r = -ENOENT;
+ goto init_failed;
+ }
+
+ /* Drop the early temporary reset domain we created for device */
+ amdgpu_reset_put_reset_domain(adev->reset_domain);
+ adev->reset_domain = hive->reset_domain;
+ }
+ }
+
+ r = amdgpu_device_init_schedulers(adev);
+ if (r)
+ goto init_failed;
/* Don't init kfd if whole hive need to be reset during init */
if (!adev->gmc.xgmi.pending_reset)
@@ -3543,8 +3606,6 @@ int amdgpu_device_init(struct amdgpu_device *adev,
mutex_init(&adev->mn_lock);
mutex_init(&adev->virt.vf_errors.lock);
hash_init(adev->mn_hash);
- atomic_set(&adev->in_gpu_reset, 0);
- init_rwsem(&adev->reset_sem);
mutex_init(&adev->psp.mutex);
mutex_init(&adev->notifier_lock);
mutex_init(&adev->pm.stable_pstate_ctx_lock);
@@ -3630,6 +3691,15 @@ int amdgpu_device_init(struct amdgpu_device *adev,
return r;
}
+ /*
+ * Reset domain needs to be present early, before XGMI hive discovered
+ * (if any) and intitialized to use reset sem and in_gpu reset flag
+ * early on during init.
+ */
+ adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE ,"amdgpu-reset-dev");
+ if (!adev->reset_domain)
+ return -ENOMEM;
+
/* early init functions */
r = amdgpu_device_ip_early_init(adev);
if (r)
@@ -4006,6 +4076,9 @@ void amdgpu_device_fini_sw(struct amdgpu_device *adev)
if (adev->mman.discovery_bin)
amdgpu_discovery_fini(adev);
+ amdgpu_reset_put_reset_domain(adev->reset_domain);
+ adev->reset_domain = NULL;
+
kfree(adev->pci_state);
}
@@ -4817,17 +4890,8 @@ end:
return r;
}
-static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
- struct amdgpu_hive_info *hive)
+static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev)
{
- if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
- return false;
-
- if (hive) {
- down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
- } else {
- down_write(&adev->reset_sem);
- }
switch (amdgpu_asic_reset_method(adev)) {
case AMD_RESET_METHOD_MODE1:
@@ -4840,56 +4904,12 @@ static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
adev->mp1_state = PP_MP1_STATE_NONE;
break;
}
-
- return true;
}
-static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
+static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev)
{
amdgpu_vf_error_trans_all(adev);
adev->mp1_state = PP_MP1_STATE_NONE;
- atomic_set(&adev->in_gpu_reset, 0);
- up_write(&adev->reset_sem);
-}
-
-/*
- * to lockup a list of amdgpu devices in a hive safely, if not a hive
- * with multiple nodes, it will be similar as amdgpu_device_lock_adev.
- *
- * unlock won't require roll back.
- */
-static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive)
-{
- struct amdgpu_device *tmp_adev = NULL;
-
- if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) {
- if (!hive) {
- dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes");
- return -ENODEV;
- }
- list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
- if (!amdgpu_device_lock_adev(tmp_adev, hive))
- goto roll_back;
- }
- } else if (!amdgpu_device_lock_adev(adev, hive))
- return -EAGAIN;
-
- return 0;
-roll_back:
- if (!list_is_first(&tmp_adev->gmc.xgmi.head, &hive->device_list)) {
- /*
- * if the lockup iteration break in the middle of a hive,
- * it may means there may has a race issue,
- * or a hive device locked up independently.
- * we may be in trouble and may not, so will try to roll back
- * the lock and give out a warnning.
- */
- dev_warn(tmp_adev->dev, "Hive lock iteration broke in the middle. Rolling back to unlock");
- list_for_each_entry_continue_reverse(tmp_adev, &hive->device_list, gmc.xgmi.head) {
- amdgpu_device_unlock_adev(tmp_adev);
- }
- }
- return -EAGAIN;
}
static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
@@ -5023,7 +5043,7 @@ retry:
}
/**
- * amdgpu_device_gpu_recover - reset the asic and recover scheduler
+ * amdgpu_device_gpu_recover_imp - reset the asic and recover scheduler
*
* @adev: amdgpu_device pointer
* @job: which job trigger hang
@@ -5033,7 +5053,7 @@ retry:
* Returns 0 for success or an error on failure.
*/
-int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
+int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
struct amdgpu_job *job)
{
struct list_head device_list, *device_list_handle = NULL;
@@ -5067,26 +5087,10 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
dev_info(adev->dev, "GPU %s begin!\n",
need_emergency_restart ? "jobs stop":"reset");
- /*
- * Here we trylock to avoid chain of resets executing from
- * either trigger by jobs on different adevs in XGMI hive or jobs on
- * different schedulers for same device while this TO handler is running.
- * We always reset all schedulers for device and all devices for XGMI
- * hive so that should take care of them too.
- */
if (!amdgpu_sriov_vf(adev))
hive = amdgpu_get_xgmi_hive(adev);
- if (hive) {
- if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
- DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
- job ? job->base.id : -1, hive->hive_id);
- amdgpu_put_xgmi_hive(hive);
- if (job && job->vm)
- drm_sched_increase_karma(&job->base);
- return 0;
- }
+ if (hive)
mutex_lock(&hive->hive_lock);
- }
reset_context.method = AMD_RESET_METHOD_NONE;
reset_context.reset_req_dev = adev;
@@ -5095,22 +5099,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
/*
- * lock the device before we try to operate the linked list
- * if didn't get the device lock, don't touch the linked list since
- * others may iterating it.
- */
- r = amdgpu_device_lock_hive_adev(adev, hive);
- if (r) {
- dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
- job ? job->base.id : -1);
-
- /* even we skipped this reset, still need to set the job to guilty */
- if (job && job->vm)
- drm_sched_increase_karma(&job->base);
- goto skip_recovery;
- }
-
- /*
* Build list of devices to reset.
* In case we are in XGMI hive mode, resort the device list
* to put adev in the 1st position.
@@ -5127,8 +5115,16 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
device_list_handle = &device_list;
}
+ /* We need to lock reset domain only once both for XGMI and single device */
+ tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
+ reset_list);
+ amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
+
/* block all schedulers and reset given job's ring */
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
+
+ amdgpu_device_set_mp1_state(tmp_adev);
+
/*
* Try to put the audio codec into suspend state
* before gpu reset started.
@@ -5280,21 +5276,55 @@ skip_sched_resume:
if (audio_suspended)
amdgpu_device_resume_display_audio(tmp_adev);
- amdgpu_device_unlock_adev(tmp_adev);
+
+ amdgpu_device_unset_mp1_state(tmp_adev);
}
-skip_recovery:
+ tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
+ reset_list);
+ amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
+
if (hive) {
- atomic_set(&hive->in_reset, 0);
mutex_unlock(&hive->hive_lock);
amdgpu_put_xgmi_hive(hive);
}
- if (r && r != -EAGAIN)
+ if (r)
dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
return r;
}
+struct amdgpu_recover_work_struct {
+ struct work_struct base;
+ struct amdgpu_device *adev;
+ struct amdgpu_job *job;
+ int ret;
+};
+
+static void amdgpu_device_queue_gpu_recover_work(struct work_struct *work)
+{
+ struct amdgpu_recover_work_struct *recover_work = container_of(work, struct amdgpu_recover_work_struct, base);
+
+ recover_work->ret = amdgpu_device_gpu_recover_imp(recover_work->adev, recover_work->job);
+}
+/*
+ * Serialize gpu recover into reset domain single threaded wq
+ */
+int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
+ struct amdgpu_job *job)
+{
+ struct amdgpu_recover_work_struct work = {.adev = adev, .job = job};
+
+ INIT_WORK(&work.base, amdgpu_device_queue_gpu_recover_work);
+
+ if (!amdgpu_reset_domain_schedule(adev->reset_domain, &work.base))
+ return -EAGAIN;
+
+ flush_work(&work.base);
+
+ return work.ret;
+}
+
/**
* amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot
*
@@ -5482,20 +5512,6 @@ int amdgpu_device_baco_exit(struct drm_device *dev)
return 0;
}
-static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev)
-{
- int i;
-
- for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
- struct amdgpu_ring *ring = adev->rings[i];
-
- if (!ring || !ring->sched.thread)
- continue;
-
- cancel_delayed_work_sync(&ring->sched.work_tdr);
- }
-}
-
/**
* amdgpu_pci_error_detected - Called when a PCI error is detected.
* @pdev: PCI device struct
@@ -5526,14 +5542,11 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta
/* Fatal error, prepare for slot reset */
case pci_channel_io_frozen:
/*
- * Cancel and wait for all TDRs in progress if failing to
- * set adev->in_gpu_reset in amdgpu_device_lock_adev
- *
- * Locking adev->reset_sem will prevent any external access
+ * Locking adev->reset_domain->sem will prevent any external access
* to GPU during PCI error recovery
*/
- while (!amdgpu_device_lock_adev(adev, NULL))
- amdgpu_cancel_all_tdr(adev);
+ amdgpu_device_lock_reset_domain(adev->reset_domain);
+ amdgpu_device_set_mp1_state(adev);
/*
* Block any work scheduling as we do for regular GPU reset
@@ -5640,7 +5653,8 @@ out:
DRM_INFO("PCIe error recovery succeeded\n");
} else {
DRM_ERROR("PCIe error recovery failed, err:%d", r);
- amdgpu_device_unlock_adev(adev);
+ amdgpu_device_unset_mp1_state(adev);
+ amdgpu_device_unlock_reset_domain(adev->reset_domain);
}
return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
@@ -5677,7 +5691,8 @@ void amdgpu_pci_resume(struct pci_dev *pdev)
drm_sched_start(&ring->sched, true);
}
- amdgpu_device_unlock_adev(adev);
+ amdgpu_device_unset_mp1_state(adev);
+ amdgpu_device_unlock_reset_domain(adev->reset_domain);
}
bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
@@ -5754,6 +5769,11 @@ void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
amdgpu_asic_invalidate_hdp(adev, ring);
}
+int amdgpu_in_reset(struct amdgpu_device *adev)
+{
+ return atomic_read(&adev->reset_domain->in_gpu_reset);
+ }
+
/**
* amdgpu_device_halt() - bring hardware to some kind of halt state
*
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c
index ec4c9ef5f795..9e5fc4cdb8ec 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c
@@ -961,7 +961,7 @@ static int amdgpu_display_verify_sizes(struct amdgpu_framebuffer *rfb)
int ret;
unsigned int i, block_width, block_height, block_size_log2;
- if (!rfb->base.dev->mode_config.allow_fb_modifiers)
+ if (rfb->base.dev->mode_config.fb_modifiers_not_supported)
return 0;
for (i = 0; i < format_info->num_planes; ++i) {
@@ -1148,7 +1148,7 @@ int amdgpu_display_framebuffer_init(struct drm_device *dev,
if (ret)
return ret;
- if (!dev->mode_config.allow_fb_modifiers) {
+ if (dev->mode_config.fb_modifiers_not_supported) {
drm_WARN_ONCE(dev, adev->family >= AMDGPU_FAMILY_AI,
"GFX9+ requires FB check based on format modifier\n");
ret = check_tiling_flags_gfx6(rfb);
@@ -1156,7 +1156,7 @@ int amdgpu_display_framebuffer_init(struct drm_device *dev,
return ret;
}
- if (dev->mode_config.allow_fb_modifiers &&
+ if (!dev->mode_config.fb_modifiers_not_supported &&
!(rfb->base.flags & DRM_MODE_FB_MODIFIERS)) {
ret = convert_tiling_flags_to_modifier(rfb);
if (ret) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 45977a72b5dd..5d13ed376ab4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -446,24 +446,18 @@ int amdgpu_fence_driver_start_ring(struct amdgpu_ring *ring,
* for the requested ring.
*
* @ring: ring to init the fence driver on
- * @num_hw_submission: number of entries on the hardware queue
- * @sched_score: optional score atomic shared with other schedulers
*
* Init the fence driver for the requested ring (all asics).
* Helper function for amdgpu_fence_driver_init().
*/
-int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
- unsigned num_hw_submission,
- atomic_t *sched_score)
+int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring)
{
struct amdgpu_device *adev = ring->adev;
- long timeout;
- int r;
if (!adev)
return -EINVAL;
- if (!is_power_of_2(num_hw_submission))
+ if (!is_power_of_2(ring->num_hw_submission))
return -EINVAL;
ring->fence_drv.cpu_addr = NULL;
@@ -474,41 +468,14 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
timer_setup(&ring->fence_drv.fallback_timer, amdgpu_fence_fallback, 0);
- ring->fence_drv.num_fences_mask = num_hw_submission * 2 - 1;
+ ring->fence_drv.num_fences_mask = ring->num_hw_submission * 2 - 1;
spin_lock_init(&ring->fence_drv.lock);
- ring->fence_drv.fences = kcalloc(num_hw_submission * 2, sizeof(void *),
+ ring->fence_drv.fences = kcalloc(ring->num_hw_submission * 2, sizeof(void *),
GFP_KERNEL);
+
if (!ring->fence_drv.fences)
return -ENOMEM;
- /* No need to setup the GPU scheduler for rings that don't need it */
- if (ring->no_scheduler)
- return 0;
-
- switch (ring->funcs->type) {
- case AMDGPU_RING_TYPE_GFX:
- timeout = adev->gfx_timeout;
- break;
- case AMDGPU_RING_TYPE_COMPUTE:
- timeout = adev->compute_timeout;
- break;
- case AMDGPU_RING_TYPE_SDMA:
- timeout = adev->sdma_timeout;
- break;
- default:
- timeout = adev->video_timeout;
- break;
- }
-
- r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
- num_hw_submission, amdgpu_job_hang_limit,
- timeout, NULL, sched_score, ring->name);
- if (r) {
- DRM_ERROR("Failed to create scheduler on ring %s.\n",
- ring->name);
- return r;
- }
-
return 0;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
index 899a47011a67..dd78402e3cb0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
@@ -60,7 +60,7 @@ static ssize_t amdgpu_mem_info_gtt_total_show(struct device *dev,
struct ttm_resource_manager *man;
man = ttm_manager_type(&adev->mman.bdev, TTM_PL_TT);
- return sysfs_emit(buf, "%llu\n", man->size * PAGE_SIZE);
+ return sysfs_emit(buf, "%llu\n", man->size);
}
/**
@@ -77,8 +77,9 @@ static ssize_t amdgpu_mem_info_gtt_used_show(struct device *dev,
{
struct drm_device *ddev = dev_get_drvdata(dev);
struct amdgpu_device *adev = drm_to_adev(ddev);
+ struct ttm_resource_manager *man = &adev->mman.gtt_mgr.manager;
- return sysfs_emit(buf, "%llu\n", amdgpu_gtt_mgr_usage(&adev->mman.gtt_mgr));
+ return sysfs_emit(buf, "%llu\n", ttm_resource_manager_usage(man));
}
static DEVICE_ATTR(mem_info_gtt_total, S_IRUGO,
@@ -130,20 +131,17 @@ static int amdgpu_gtt_mgr_new(struct ttm_resource_manager *man,
struct amdgpu_gtt_node *node;
int r;
- if (!(place->flags & TTM_PL_FLAG_TEMPORARY) &&
- atomic64_add_return(num_pages, &mgr->used) > man->size) {
- atomic64_sub(num_pages, &mgr->used);
- return -ENOSPC;
- }
-
node = kzalloc(struct_size(node, base.mm_nodes, 1), GFP_KERNEL);
- if (!node) {
- r = -ENOMEM;
- goto err_out;
- }
+ if (!node)
+ return -ENOMEM;
node->tbo = tbo;
ttm_resource_init(tbo, place, &node->base.base);
+ if (!(place->flags & TTM_PL_FLAG_TEMPORARY) &&
+ ttm_resource_manager_usage(man) > man->size) {
+ r = -ENOSPC;
+ goto err_free;
+ }
if (place->lpfn) {
spin_lock(&mgr->lock);
@@ -169,11 +167,6 @@ static int amdgpu_gtt_mgr_new(struct ttm_resource_manager *man,
err_free:
ttm_resource_fini(man, &node->base.base);
kfree(node);
-
-err_out:
- if (!(place->flags & TTM_PL_FLAG_TEMPORARY))
- atomic64_sub(num_pages, &mgr->used);
-
return r;
}
@@ -196,26 +189,11 @@ static void amdgpu_gtt_mgr_del(struct ttm_resource_manager *man,
drm_mm_remove_node(&node->base.mm_nodes[0]);
spin_unlock(&mgr->lock);
- if (!(res->placement & TTM_PL_FLAG_TEMPORARY))
- atomic64_sub(res->num_pages, &mgr->used);
-
ttm_resource_fini(man, res);
kfree(node);
}
/**
- * amdgpu_gtt_mgr_usage - return usage of GTT domain
- *
- * @mgr: amdgpu_gtt_mgr pointer
- *
- * Return how many bytes are used in the GTT domain
- */
-uint64_t amdgpu_gtt_mgr_usage(struct amdgpu_gtt_mgr *mgr)
-{
- return atomic64_read(&mgr->used) * PAGE_SIZE;
-}
-
-/**
* amdgpu_gtt_mgr_recover - re-init gart
*
* @mgr: amdgpu_gtt_mgr pointer
@@ -255,9 +233,6 @@ static void amdgpu_gtt_mgr_debug(struct ttm_resource_manager *man,
spin_lock(&mgr->lock);
drm_mm_print(&mgr->mm, printer);
spin_unlock(&mgr->lock);
-
- drm_printf(printer, "man size:%llu pages, gtt used:%llu pages\n",
- man->size, atomic64_read(&mgr->used));
}
static const struct ttm_resource_manager_func amdgpu_gtt_mgr_func = {
@@ -283,14 +258,12 @@ int amdgpu_gtt_mgr_init(struct amdgpu_device *adev, uint64_t gtt_size)
man->use_tt = true;
man->func = &amdgpu_gtt_mgr_func;
- ttm_resource_manager_init(man, &adev->mman.bdev,
- gtt_size >> PAGE_SHIFT);
+ ttm_resource_manager_init(man, &adev->mman.bdev, gtt_size);
start = AMDGPU_GTT_MAX_TRANSFER_SIZE * AMDGPU_GTT_NUM_TRANSFER_WINDOWS;
size = (adev->gmc.gart_size >> PAGE_SHIFT) - start;
drm_mm_init(&mgr->mm, start, size);
spin_lock_init(&mgr->lock);
- atomic64_set(&mgr->used, 0);
ttm_set_driver_manager(&adev->mman.bdev, TTM_PL_TT, &mgr->manager);
ttm_resource_manager_set_used(man, true);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 4870e093213d..d970336d2261 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -64,10 +64,9 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
ti.process_name, ti.tgid, ti.task_name, ti.pid);
if (amdgpu_device_should_recover_gpu(ring->adev)) {
- r = amdgpu_device_gpu_recover(ring->adev, job);
+ r = amdgpu_device_gpu_recover_imp(ring->adev, job);
if (r)
DRM_ERROR("GPU Recovery Failed: %d\n", r);
-
} else {
drm_sched_suspend_timeout(&ring->sched);
if (amdgpu_sriov_vf(adev))
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index 9f985bd463be..6b626c293e72 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -604,13 +604,13 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
ui64 = atomic64_read(&adev->num_vram_cpu_page_faults);
return copy_to_user(out, &ui64, min(size, 8u)) ? -EFAULT : 0;
case AMDGPU_INFO_VRAM_USAGE:
- ui64 = amdgpu_vram_mgr_usage(&adev->mman.vram_mgr);
+ ui64 = ttm_resource_manager_usage(&adev->mman.vram_mgr.manager);
return copy_to_user(out, &ui64, min(size, 8u)) ? -EFAULT : 0;
case AMDGPU_INFO_VIS_VRAM_USAGE:
ui64 = amdgpu_vram_mgr_vis_usage(&adev->mman.vram_mgr);
return copy_to_user(out, &ui64, min(size, 8u)) ? -EFAULT : 0;
case AMDGPU_INFO_GTT_USAGE:
- ui64 = amdgpu_gtt_mgr_usage(&adev->mman.gtt_mgr);
+ ui64 = ttm_resource_manager_usage(&adev->mman.gtt_mgr.manager);
return copy_to_user(out, &ui64, min(size, 8u)) ? -EFAULT : 0;
case AMDGPU_INFO_GDS_CONFIG: {
struct drm_amdgpu_info_gds gds_info;
@@ -642,14 +642,17 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
case AMDGPU_INFO_MEMORY: {
struct drm_amdgpu_memory_info mem;
struct ttm_resource_manager *gtt_man =
- ttm_manager_type(&adev->mman.bdev, TTM_PL_TT);
+ &adev->mman.gtt_mgr.manager;
+ struct ttm_resource_manager *vram_man =
+ &adev->mman.vram_mgr.manager;
+
memset(&mem, 0, sizeof(mem));
mem.vram.total_heap_size = adev->gmc.real_vram_size;
mem.vram.usable_heap_size = adev->gmc.real_vram_size -
atomic64_read(&adev->vram_pin_size) -
AMDGPU_VM_RESERVED_VRAM;
mem.vram.heap_usage =
- amdgpu_vram_mgr_usage(&adev->mman.vram_mgr);
+ ttm_resource_manager_usage(vram_man);
mem.vram.max_allocation = mem.vram.usable_heap_size * 3 / 4;
mem.cpu_accessible_vram.total_heap_size =
@@ -667,8 +670,7 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
mem.gtt.total_heap_size *= PAGE_SIZE;
mem.gtt.usable_heap_size = mem.gtt.total_heap_size -
atomic64_read(&adev->gart_pin_size);
- mem.gtt.heap_usage =
- amdgpu_gtt_mgr_usage(&adev->mman.gtt_mgr);
+ mem.gtt.heap_usage = ttm_resource_manager_usage(gtt_man);
mem.gtt.max_allocation = mem.gtt.usable_heap_size * 3 / 4;
return copy_to_user(out, &mem,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index 23c9a60693ee..25731719c627 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -451,7 +451,7 @@ static bool amdgpu_bo_validate_size(struct amdgpu_device *adev,
if (domain & AMDGPU_GEM_DOMAIN_GTT) {
man = ttm_manager_type(&adev->mman.bdev, TTM_PL_TT);
- if (size < (man->size << PAGE_SHIFT))
+ if (size < man->size)
return true;
else
goto fail;
@@ -460,7 +460,7 @@ static bool amdgpu_bo_validate_size(struct amdgpu_device *adev,
if (domain & AMDGPU_GEM_DOMAIN_VRAM) {
man = ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM);
- if (size < (man->size << PAGE_SHIFT))
+ if (size < man->size)
return true;
else
goto fail;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_preempt_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_preempt_mgr.c
index 0d85c2096ab5..e8adfd0a570a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_preempt_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_preempt_mgr.c
@@ -25,12 +25,6 @@
#include "amdgpu.h"
-static inline struct amdgpu_preempt_mgr *
-to_preempt_mgr(struct ttm_resource_manager *man)
-{
- return container_of(man, struct amdgpu_preempt_mgr, manager);
-}
-
/**
* DOC: mem_info_preempt_used
*
@@ -45,10 +39,9 @@ static ssize_t mem_info_preempt_used_show(struct device *dev,
{
struct drm_device *ddev = dev_get_drvdata(dev);
struct amdgpu_device *adev = drm_to_adev(ddev);
- struct ttm_resource_manager *man;
+ struct ttm_resource_manager *man = &adev->mman.preempt_mgr;
- man = ttm_manager_type(&adev->mman.bdev, AMDGPU_PL_PREEMPT);
- return sysfs_emit(buf, "%llu\n", amdgpu_preempt_mgr_usage(man));
+ return sysfs_emit(buf, "%llu\n", ttm_resource_manager_usage(man));
}
static DEVICE_ATTR_RO(mem_info_preempt_used);
@@ -68,16 +61,12 @@ static int amdgpu_preempt_mgr_new(struct ttm_resource_manager *man,
const struct ttm_place *place,
struct ttm_resource **res)
{
- struct amdgpu_preempt_mgr *mgr = to_preempt_mgr(man);
-
*res = kzalloc(sizeof(**res), GFP_KERNEL);
if (!*res)
return -ENOMEM;
ttm_resource_init(tbo, place, *res);
(*res)->start = AMDGPU_BO_INVALID_OFFSET;
-
- atomic64_add((*res)->num_pages, &mgr->used);
return 0;
}
@@ -92,49 +81,13 @@ static int amdgpu_preempt_mgr_new(struct ttm_resource_manager *man,
static void amdgpu_preempt_mgr_del(struct ttm_resource_manager *man,
struct ttm_resource *res)
{
- struct amdgpu_preempt_mgr *mgr = to_preempt_mgr(man);
-
- atomic64_sub(res->num_pages, &mgr->used);
ttm_resource_fini(man, res);
kfree(res);
}
-/**
- * amdgpu_preempt_mgr_usage - return usage of PREEMPT domain
- *
- * @man: TTM memory type manager
- *
- * Return how many bytes are used in the GTT domain
- */
-uint64_t amdgpu_preempt_mgr_usage(struct ttm_resource_manager *man)
-{
- struct amdgpu_preempt_mgr *mgr = to_preempt_mgr(man);
- s64 result = atomic64_read(&mgr->used);
-
- return (result > 0 ? result : 0) * PAGE_SIZE;
-}
-
-/**
- * amdgpu_preempt_mgr_debug - dump VRAM table
- *
- * @man: TTM memory type manager
- * @printer: DRM printer to use
- *
- * Dump the table content using printk.
- */
-static void amdgpu_preempt_mgr_debug(struct ttm_resource_manager *man,
- struct drm_printer *printer)
-{
- struct amdgpu_preempt_mgr *mgr = to_preempt_mgr(man);
-
- drm_printf(printer, "man size:%llu pages, preempt used:%lld pages\n",
- man->size, (u64)atomic64_read(&mgr->used));
-}
-
static const struct ttm_resource_manager_func amdgpu_preempt_mgr_func = {
.alloc = amdgpu_preempt_mgr_new,
.free = amdgpu_preempt_mgr_del,
- .debug = amdgpu_preempt_mgr_debug
};
/**
@@ -146,8 +99,7 @@ static const struct ttm_resource_manager_func amdgpu_preempt_mgr_func = {
*/
int amdgpu_preempt_mgr_init(struct amdgpu_device *adev)
{
- struct amdgpu_preempt_mgr *mgr = &adev->mman.preempt_mgr;
- struct ttm_resource_manager *man = &mgr->manager;
+ struct ttm_resource_manager *man = &adev->mman.preempt_mgr;
int ret;
man->use_tt = true;
@@ -155,16 +107,13 @@ int amdgpu_preempt_mgr_init(struct amdgpu_device *adev)
ttm_resource_manager_init(man, &adev->mman.bdev, (1 << 30));
- atomic64_set(&mgr->used, 0);
-
ret = device_create_file(adev->dev, &dev_attr_mem_info_preempt_used);
if (ret) {
DRM_ERROR("Failed to create device file mem_info_preempt_used\n");
return ret;
}
- ttm_set_driver_manager(&adev->mman.bdev, AMDGPU_PL_PREEMPT,
- &mgr->manager);
+ ttm_set_driver_manager(&adev->mman.bdev, AMDGPU_PL_PREEMPT, man);
ttm_resource_manager_set_used(man, true);
return 0;
}
@@ -179,8 +128,7 @@ int amdgpu_preempt_mgr_init(struct amdgpu_device *adev)
*/
void amdgpu_preempt_mgr_fini(struct amdgpu_device *adev)
{
- struct amdgpu_preempt_mgr *mgr = &adev->mman.preempt_mgr;
- struct ttm_resource_manager *man = &mgr->manager;
+ struct ttm_resource_manager *man = &adev->mman.preempt_mgr;
int ret;
ttm_resource_manager_set_used(man, false);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index 2b844a5aafdb..a44f2eeed6ef 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -31,6 +31,8 @@
#include <linux/debugfs.h>
#include <linux/uaccess.h>
+#include "amdgpu_reset.h"
+
#define EEPROM_I2C_MADDR_VEGA20 0x0
#define EEPROM_I2C_MADDR_ARCTURUS 0x40000
#define EEPROM_I2C_MADDR_ARCTURUS_D342 0x0
@@ -193,12 +195,12 @@ static int __write_table_header(struct amdgpu_ras_eeprom_control *control)
__encode_table_header_to_buf(&control->tbl_hdr, buf);
/* i2c may be unstable in gpu reset */
- down_read(&adev->reset_sem);
+ down_read(&adev->reset_domain->sem);
res = amdgpu_eeprom_write(adev->pm.ras_eeprom_i2c_bus,
control->i2c_address +
control->ras_header_offset,
buf, RAS_TABLE_HEADER_SIZE);
- up_read(&adev->reset_sem);
+ up_read(&adev->reset_domain->sem);
if (res < 0) {
DRM_ERROR("Failed to write EEPROM table header:%d", res);
@@ -390,13 +392,13 @@ static int __amdgpu_ras_eeprom_write(struct amdgpu_ras_eeprom_control *control,
int res;
/* i2c may be unstable in gpu reset */
- down_read(&adev->reset_sem);
+ down_read(&adev->reset_domain->sem);
buf_size = num * RAS_TABLE_RECORD_SIZE;
res = amdgpu_eeprom_write(adev->pm.ras_eeprom_i2c_bus,
control->i2c_address +
RAS_INDEX_TO_OFFSET(control, fri),
buf, buf_size);
- up_read(&adev->reset_sem);
+ up_read(&adev->reset_domain->sem);
if (res < 0) {
DRM_ERROR("Writing %d EEPROM table records error:%d",
num, res);
@@ -550,12 +552,12 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
goto Out;
}
- down_read(&adev->reset_sem);
+ down_read(&adev->reset_domain->sem);
res = amdgpu_eeprom_read(adev->pm.ras_eeprom_i2c_bus,
control->i2c_address +
control->ras_record_offset,
buf, buf_size);
- up_read(&adev->reset_sem);
+ up_read(&adev->reset_domain->sem);
if (res < 0) {
DRM_ERROR("EEPROM failed reading records:%d\n",
res);
@@ -645,13 +647,13 @@ static int __amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control,
int res;
/* i2c may be unstable in gpu reset */
- down_read(&adev->reset_sem);
+ down_read(&adev->reset_domain->sem);
buf_size = num * RAS_TABLE_RECORD_SIZE;
res = amdgpu_eeprom_read(adev->pm.ras_eeprom_i2c_bus,
control->i2c_address +
RAS_INDEX_TO_OFFSET(control, fri),
buf, buf_size);
- up_read(&adev->reset_sem);
+ up_read(&adev->reset_domain->sem);
if (res < 0) {
DRM_ERROR("Reading %d EEPROM table records error:%d",
num, res);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
index 02afd4115675..248d64158721 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
@@ -96,3 +96,59 @@ int amdgpu_reset_perform_reset(struct amdgpu_device *adev,
return reset_handler->restore_hwcontext(adev->reset_cntl,
reset_context);
}
+
+
+void amdgpu_reset_destroy_reset_domain(struct kref *ref)
+{
+ struct amdgpu_reset_domain *reset_domain = container_of(ref,
+ struct amdgpu_reset_domain,
+ refcount);
+ if (reset_domain->wq)
+ destroy_workqueue(reset_domain->wq);
+
+ kvfree(reset_domain);
+}
+
+struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(enum amdgpu_reset_domain_type type,
+ char *wq_name)
+{
+ struct amdgpu_reset_domain *reset_domain;
+
+ reset_domain = kvzalloc(sizeof(struct amdgpu_reset_domain), GFP_KERNEL);
+ if (!reset_domain) {
+ DRM_ERROR("Failed to allocate amdgpu_reset_domain!");
+ return NULL;
+ }
+
+ reset_domain->type = type;
+ kref_init(&reset_domain->refcount);
+
+ reset_domain->wq = create_singlethread_workqueue(wq_name);
+ if (!reset_domain->wq) {
+ DRM_ERROR("Failed to allocate wq for amdgpu_reset_domain!");
+ amdgpu_reset_put_reset_domain(reset_domain);
+ return NULL;
+
+ }
+
+ atomic_set(&reset_domain->in_gpu_reset, 0);
+ init_rwsem(&reset_domain->sem);
+
+ return reset_domain;
+}
+
+void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain *reset_domain)
+{
+ atomic_set(&reset_domain->in_gpu_reset, 1);
+ down_write(&reset_domain->sem);
+}
+
+
+void amdgpu_device_unlock_reset_domain(struct amdgpu_reset_domain *reset_domain)
+{
+ atomic_set(&reset_domain->in_gpu_reset, 0);
+ up_write(&reset_domain->sem);
+}
+
+
+
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
index e00d38d9160a..1949dbe28a86 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
@@ -70,6 +70,21 @@ struct amdgpu_reset_control {
void (*async_reset)(struct work_struct *work);
};
+
+enum amdgpu_reset_domain_type {
+ SINGLE_DEVICE,
+ XGMI_HIVE
+};
+
+struct amdgpu_reset_domain {
+ struct kref refcount;
+ struct workqueue_struct *wq;
+ enum amdgpu_reset_domain_type type;
+ struct rw_semaphore sem;
+ atomic_t in_gpu_reset;
+};
+
+
int amdgpu_reset_init(struct amdgpu_device *adev);
int amdgpu_reset_fini(struct amdgpu_device *adev);
@@ -82,4 +97,29 @@ int amdgpu_reset_perform_reset(struct amdgpu_device *adev,
int amdgpu_reset_add_handler(struct amdgpu_reset_control *reset_ctl,
struct amdgpu_reset_handler *handler);
+struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(enum amdgpu_reset_domain_type type,
+ char *wq_name);
+
+void amdgpu_reset_destroy_reset_domain(struct kref *ref);
+
+static inline bool amdgpu_reset_get_reset_domain(struct amdgpu_reset_domain *domain)
+{
+ return kref_get_unless_zero(&domain->refcount) != 0;
+}
+
+static inline void amdgpu_reset_put_reset_domain(struct amdgpu_reset_domain *domain)
+{
+ kref_put(&domain->refcount, amdgpu_reset_destroy_reset_domain);
+}
+
+static inline bool amdgpu_reset_domain_schedule(struct amdgpu_reset_domain *domain,
+ struct work_struct *work)
+{
+ return queue_work(domain->wq, work);
+}
+
+void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain *reset_domain);
+
+void amdgpu_device_unlock_reset_domain(struct amdgpu_reset_domain *reset_domain);
+
#endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
index ab2351ba9574..35bcb6dc1816 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -191,8 +191,9 @@ int amdgpu_ring_init(struct amdgpu_device *adev, struct amdgpu_ring *ring,
ring->adev = adev;
ring->idx = adev->num_rings++;
adev->rings[ring->idx] = ring;
- r = amdgpu_fence_driver_init_ring(ring, sched_hw_submission,
- sched_score);
+ ring->num_hw_submission = sched_hw_submission;
+ ring->sched_score = sched_score;
+ r = amdgpu_fence_driver_init_ring(ring);
if (r)
return r;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index fae7d185ad0d..48365da213dc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -114,9 +114,7 @@ struct amdgpu_fence_driver {
void amdgpu_fence_driver_clear_job_fences(struct amdgpu_ring *ring);
void amdgpu_fence_driver_force_completion(struct amdgpu_ring *ring);
-int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
- unsigned num_hw_submission,
- atomic_t *sched_score);
+int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring);
int amdgpu_fence_driver_start_ring(struct amdgpu_ring *ring,
struct amdgpu_irq_src *irq_src,
unsigned irq_type);
@@ -251,6 +249,8 @@ struct amdgpu_ring {
bool has_compute_vm_bug;
bool no_scheduler;
int hw_prio;
+ unsigned num_hw_submission;
+ atomic_t *sched_score;
};
#define amdgpu_ring_parse_cs(r, p, ib) ((r)->funcs->parse_cs((p), (ib)))
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
index f7d8487799b2..40e06745fae9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
@@ -261,10 +261,9 @@ int amdgpu_sync_resv(struct amdgpu_device *adev, struct amdgpu_sync *sync,
dma_resv_for_each_fence(&cursor, resv, true, f) {
dma_fence_chain_for_each(f, f) {
- struct dma_fence_chain *chain = to_dma_fence_chain(f);
+ struct dma_fence *tmp = dma_fence_chain_contained(f);
- if (amdgpu_sync_test_fence(adev, mode, owner, chain ?
- chain->fence : f)) {
+ if (amdgpu_sync_test_fence(adev, mode, owner, tmp)) {
r = amdgpu_sync_fence(sync, f);
dma_fence_put(f);
if (r)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index 414a22dddc78..4b9ee6e27f74 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -1941,7 +1941,7 @@ void amdgpu_ttm_set_buffer_funcs_status(struct amdgpu_device *adev, bool enable)
size = adev->gmc.real_vram_size;
else
size = adev->gmc.visible_vram_size;
- man->size = size >> PAGE_SHIFT;
+ man->size = size;
adev->mman.buffer_funcs_enabled = enable;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
index 0e4ecc77db3f..9120ae80ef52 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
@@ -44,7 +44,6 @@ struct amdgpu_vram_mgr {
spinlock_t lock;
struct list_head reservations_pending;
struct list_head reserved_pages;
- atomic64_t usage;
atomic64_t vis_usage;
};
@@ -52,12 +51,6 @@ struct amdgpu_gtt_mgr {
struct ttm_resource_manager manager;
struct drm_mm mm;
spinlock_t lock;
- atomic64_t used;
-};
-
-struct amdgpu_preempt_mgr {
- struct ttm_resource_manager manager;
- atomic64_t used;
};
struct amdgpu_mman {
@@ -76,7 +69,7 @@ struct amdgpu_mman {
struct amdgpu_vram_mgr vram_mgr;
struct amdgpu_gtt_mgr gtt_mgr;
- struct amdgpu_preempt_mgr preempt_mgr;
+ struct ttm_resource_manager preempt_mgr;
uint64_t stolen_vga_size;
struct amdgpu_bo *stolen_vga_memory;
@@ -118,7 +111,6 @@ int amdgpu_vram_mgr_init(struct amdgpu_device *adev);
void amdgpu_vram_mgr_fini(struct amdgpu_device *adev);
bool amdgpu_gtt_mgr_has_gart_addr(struct ttm_resource *mem);
-uint64_t amdgpu_gtt_mgr_usage(struct amdgpu_gtt_mgr *mgr);
void amdgpu_gtt_mgr_recover(struct amdgpu_gtt_mgr *mgr);
uint64_t amdgpu_preempt_mgr_usage(struct ttm_resource_manager *man);
@@ -133,7 +125,6 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev,
void amdgpu_vram_mgr_free_sgt(struct device *dev,
enum dma_data_direction dir,
struct sg_table *sgt);
-uint64_t amdgpu_vram_mgr_usage(struct amdgpu_vram_mgr *mgr);
uint64_t amdgpu_vram_mgr_vis_usage(struct amdgpu_vram_mgr *mgr);
int amdgpu_vram_mgr_reserve_range(struct amdgpu_vram_mgr *mgr,
uint64_t start, uint64_t size);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
index 5656bf7d9267..a025f080aa6a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
@@ -575,8 +575,10 @@ static int amdgpu_virt_write_vf2pf_data(struct amdgpu_device *adev)
vf2pf_info->driver_cert = 0;
vf2pf_info->os_info.all = 0;
- vf2pf_info->fb_usage = amdgpu_vram_mgr_usage(&adev->mman.vram_mgr) >> 20;
- vf2pf_info->fb_vis_usage = amdgpu_vram_mgr_vis_usage(&adev->mman.vram_mgr) >> 20;
+ vf2pf_info->fb_usage =
+ ttm_resource_manager_usage(&adev->mman.vram_mgr.manager) >> 20;
+ vf2pf_info->fb_vis_usage =
+ amdgpu_vram_mgr_vis_usage(&adev->mman.vram_mgr) >> 20;
vf2pf_info->fb_size = adev->gmc.real_vram_size >> 20;
vf2pf_info->fb_vis_size = adev->gmc.visible_vram_size >> 20;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index fce9a13a6ba1..0a7611648573 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -96,9 +96,9 @@ static ssize_t amdgpu_mem_info_vram_used_show(struct device *dev,
{
struct drm_device *ddev = dev_get_drvdata(dev);
struct amdgpu_device *adev = drm_to_adev(ddev);
+ struct ttm_resource_manager *man = &adev->mman.vram_mgr.manager;
- return sysfs_emit(buf, "%llu\n",
- amdgpu_vram_mgr_usage(&adev->mman.vram_mgr));
+ return sysfs_emit(buf, "%llu\n", ttm_resource_manager_usage(man));
}
/**
@@ -253,7 +253,9 @@ static void amdgpu_vram_mgr_do_reserve(struct ttm_resource_manager *man)
vis_usage = amdgpu_vram_mgr_vis_size(adev, &rsv->mm_node);
atomic64_add(vis_usage, &mgr->vis_usage);
- atomic64_add(rsv->mm_node.size << PAGE_SHIFT, &mgr->usage);
+ spin_lock(&man->bdev->lru_lock);
+ man->usage += rsv->mm_node.size << PAGE_SHIFT;
+ spin_unlock(&man->bdev->lru_lock);
list_move(&rsv->node, &mgr->reserved_pages);
}
}
@@ -378,19 +380,13 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
lpfn = place->lpfn;
if (!lpfn)
- lpfn = man->size;
+ lpfn = man->size >> PAGE_SHIFT;
max_bytes = adev->gmc.mc_vram_size;
if (tbo->type != ttm_bo_type_kernel)
max_bytes -= AMDGPU_VM_RESERVED_VRAM;
- /* bail out quickly if there's likely not enough VRAM for this BO */
mem_bytes = tbo->base.size;
- if (atomic64_add_return(mem_bytes, &mgr->usage) > max_bytes) {
- r = -ENOSPC;
- goto error_sub;
- }
-
if (place->flags & TTM_PL_FLAG_CONTIGUOUS) {
pages_per_node = ~0ul;
num_nodes = 1;
@@ -408,13 +404,17 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
node = kvmalloc(struct_size(node, mm_nodes, num_nodes),
GFP_KERNEL | __GFP_ZERO);
- if (!node) {
- r = -ENOMEM;
- goto error_sub;
- }
+ if (!node)
+ return -ENOMEM;
ttm_resource_init(tbo, place, &node->base);
+ /* bail out quickly if there's likely not enough VRAM for this BO */
+ if (ttm_resource_manager_usage(man) > max_bytes) {
+ r = -ENOSPC;
+ goto error_fini;
+ }
+
mode = DRM_MM_INSERT_BEST;
if (place->flags & TTM_PL_FLAG_TOPDOWN)
mode = DRM_MM_INSERT_HIGH;
@@ -472,11 +472,10 @@ error_free:
while (i--)
drm_mm_remove_node(&node->mm_nodes[i]);
spin_unlock(&mgr->lock);
+error_fini:
ttm_resource_fini(man, &node->base);
kvfree(node);
-error_sub:
- atomic64_sub(mem_bytes, &mgr->usage);
return r;
}
@@ -494,7 +493,7 @@ static void amdgpu_vram_mgr_del(struct ttm_resource_manager *man,
struct ttm_range_mgr_node *node = to_ttm_range_mgr_node(res);
struct amdgpu_vram_mgr *mgr = to_vram_mgr(man);
struct amdgpu_device *adev = to_amdgpu_device(mgr);
- uint64_t usage = 0, vis_usage = 0;
+ uint64_t vis_usage = 0;
unsigned i, pages;
spin_lock(&mgr->lock);
@@ -503,13 +502,11 @@ static void amdgpu_vram_mgr_del(struct ttm_resource_manager *man,
struct drm_mm_node *mm = &node->mm_nodes[i];
drm_mm_remove_node(mm);
- usage += mm->size << PAGE_SHIFT;
vis_usage += amdgpu_vram_mgr_vis_size(adev, mm);
}
amdgpu_vram_mgr_do_reserve(man);
spin_unlock(&mgr->lock);
- atomic64_sub(usage, &mgr->usage);
atomic64_sub(vis_usage, &mgr->vis_usage);
ttm_resource_fini(man, res);
@@ -628,18 +625,6 @@ void amdgpu_vram_mgr_free_sgt(struct device *dev,
}
/**
- * amdgpu_vram_mgr_usage - how many bytes are used in this domain
- *
- * @mgr: amdgpu_vram_mgr pointer
- *
- * Returns how many bytes are used in this domain.
- */
-uint64_t amdgpu_vram_mgr_usage(struct amdgpu_vram_mgr *mgr)
-{
- return atomic64_read(&mgr->usage);
-}
-
-/**
* amdgpu_vram_mgr_vis_usage - how many bytes are used in the visible part
*
* @mgr: amdgpu_vram_mgr pointer
@@ -664,13 +649,12 @@ static void amdgpu_vram_mgr_debug(struct ttm_resource_manager *man,
{
struct amdgpu_vram_mgr *mgr = to_vram_mgr(man);
+ drm_printf(printer, " vis usage:%llu\n",
+ amdgpu_vram_mgr_vis_usage(mgr));
+
spin_lock(&mgr->lock);
drm_mm_print(&mgr->mm, printer);
spin_unlock(&mgr->lock);
-
- drm_printf(printer, "man size:%llu pages, ram usage:%lluMB, vis usage:%lluMB\n",
- man->size, amdgpu_vram_mgr_usage(mgr) >> 20,
- amdgpu_vram_mgr_vis_usage(mgr) >> 20);
}
static const struct ttm_resource_manager_func amdgpu_vram_mgr_func = {
@@ -692,11 +676,11 @@ int amdgpu_vram_mgr_init(struct amdgpu_device *adev)
struct ttm_resource_manager *man = &mgr->manager;
ttm_resource_manager_init(man, &adev->mman.bdev,
- adev->gmc.real_vram_size >> PAGE_SHIFT);
+ adev->gmc.real_vram_size);
man->func = &amdgpu_vram_mgr_func;
- drm_mm_init(&mgr->mm, 0, man->size);
+ drm_mm_init(&mgr->mm, 0, man->size >> PAGE_SHIFT);
spin_lock_init(&mgr->lock);
INIT_LIST_HEAD(&mgr->reservations_pending);
INIT_LIST_HEAD(&mgr->reserved_pages);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 77b65434ccc2..91817a31f3e1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -32,6 +32,8 @@
#include "wafl/wafl2_4_0_0_smn.h"
#include "wafl/wafl2_4_0_0_sh_mask.h"
+#include "amdgpu_reset.h"
+
#define smnPCS_XGMI23_PCS_ERROR_STATUS 0x11a01210
#define smnPCS_XGMI3X16_PCS_ERROR_STATUS 0x11a0020c
#define smnPCS_GOPX1_PCS_ERROR_STATUS 0x12200210
@@ -227,6 +229,9 @@ static void amdgpu_xgmi_hive_release(struct kobject *kobj)
struct amdgpu_hive_info *hive = container_of(
kobj, struct amdgpu_hive_info, kobj);
+ amdgpu_reset_put_reset_domain(hive->reset_domain);
+ hive->reset_domain = NULL;
+
mutex_destroy(&hive->hive_lock);
kfree(hive);
}
@@ -398,15 +403,35 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev)
goto pro_end;
}
+ /**
+ * Avoid recreating reset domain when hive is reconstructed for the case
+ * of reset the devices in the XGMI hive during probe for SRIOV
+ * See https://www.spinics.net/lists/amd-gfx/msg58836.html
+ */
+ if (adev->reset_domain->type != XGMI_HIVE) {
+ hive->reset_domain = amdgpu_reset_create_reset_domain(XGMI_HIVE, "amdgpu-reset-hive");
+ if (!hive->reset_domain) {
+ dev_err(adev->dev, "XGMI: failed initializing reset domain for xgmi hive\n");
+ ret = -ENOMEM;
+ kobject_put(&hive->kobj);
+ kfree(hive);
+ hive = NULL;
+ goto pro_end;
+ }
+ } else {
+ amdgpu_reset_get_reset_domain(adev->reset_domain);
+ hive->reset_domain = adev->reset_domain;
+ }
+
hive->hive_id = adev->gmc.xgmi.hive_id;
INIT_LIST_HEAD(&hive->device_list);
INIT_LIST_HEAD(&hive->node);
mutex_init(&hive->hive_lock);
- atomic_set(&hive->in_reset, 0);
atomic_set(&hive->number_devices, 0);
task_barrier_init(&hive->tb);
hive->pstate = AMDGPU_XGMI_PSTATE_UNKNOWN;
hive->hi_req_gpu = NULL;
+
/*
* hive pstate on boot is high in vega20 so we have to go to low
* pstate on after boot.
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
index 0afca51c3c0c..b4a705545657 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
@@ -33,7 +33,6 @@ struct amdgpu_hive_info {
struct list_head node;
atomic_t number_devices;
struct mutex hive_lock;
- atomic_t in_reset;
int hi_req_count;
struct amdgpu_device *hi_req_gpu;
struct task_barrier tb;
@@ -42,6 +41,8 @@ struct amdgpu_hive_info {
AMDGPU_XGMI_PSTATE_MAX_VEGA20,
AMDGPU_XGMI_PSTATE_UNKNOWN
} pstate;
+
+ struct amdgpu_reset_domain *reset_domain;
};
struct amdgpu_pcs_ras_field {
diff --git a/drivers/gpu/drm/amd/amdgpu/atom.c b/drivers/gpu/drm/amd/amdgpu/atom.c
index 6fa2229b7229..1c5d9388ad0b 100644
--- a/drivers/gpu/drm/amd/amdgpu/atom.c
+++ b/drivers/gpu/drm/amd/amdgpu/atom.c
@@ -25,6 +25,8 @@
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/slab.h>
+#include <linux/string_helpers.h>
+
#include <asm/unaligned.h>
#include <drm/drm_util.h>
@@ -740,7 +742,7 @@ static void atom_op_jump(atom_exec_context *ctx, int *ptr, int arg)
break;
}
if (arg != ATOM_COND_ALWAYS)
- SDEBUG(" taken: %s\n", execute ? "yes" : "no");
+ SDEBUG(" taken: %s\n", str_yes_no(execute));
SDEBUG(" target: 0x%04X\n", target);
if (execute) {
if (ctx->last_jump == (ctx->start + target)) {
diff --git a/drivers/gpu/drm/amd/amdgpu/dce_v10_0.c b/drivers/gpu/drm/amd/amdgpu/dce_v10_0.c
index 5d5205870861..288fce7dc0ed 100644
--- a/drivers/gpu/drm/amd/amdgpu/dce_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/dce_v10_0.c
@@ -2798,6 +2798,8 @@ static int dce_v10_0_sw_init(void *handle)
adev_to_drm(adev)->mode_config.preferred_depth = 24;
adev_to_drm(adev)->mode_config.prefer_shadow = 1;
+ adev_to_drm(adev)->mode_config.fb_modifiers_not_supported = true;
+
adev_to_drm(adev)->mode_config.fb_base = adev->gmc.aper_base;
r = amdgpu_display_modeset_create_props(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/dce_v11_0.c b/drivers/gpu/drm/amd/amdgpu/dce_v11_0.c
index 4d812b22c54f..cbe5250b31cb 100644
--- a/drivers/gpu/drm/amd/amdgpu/dce_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/dce_v11_0.c
@@ -2916,6 +2916,8 @@ static int dce_v11_0_sw_init(void *handle)
adev_to_drm(adev)->mode_config.preferred_depth = 24;
adev_to_drm(adev)->mode_config.prefer_shadow = 1;
+ adev_to_drm(adev)->mode_config.fb_modifiers_not_supported = true;
+
adev_to_drm(adev)->mode_config.fb_base = adev->gmc.aper_base;
r = amdgpu_display_modeset_create_props(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/dce_v6_0.c b/drivers/gpu/drm/amd/amdgpu/dce_v6_0.c
index b90bc2adf778..982855e6cf52 100644
--- a/drivers/gpu/drm/amd/amdgpu/dce_v6_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/dce_v6_0.c
@@ -2674,6 +2674,7 @@ static int dce_v6_0_sw_init(void *handle)
adev_to_drm(adev)->mode_config.max_height = 16384;
adev_to_drm(adev)->mode_config.preferred_depth = 24;
adev_to_drm(adev)->mode_config.prefer_shadow = 1;
+ adev_to_drm(adev)->mode_config.fb_modifiers_not_supported = true;
adev_to_drm(adev)->mode_config.fb_base = adev->gmc.aper_base;
r = amdgpu_display_modeset_create_props(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/dce_v8_0.c b/drivers/gpu/drm/amd/amdgpu/dce_v8_0.c
index 7c1379b02f94..84440741c60b 100644
--- a/drivers/gpu/drm/amd/amdgpu/dce_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/dce_v8_0.c
@@ -2695,6 +2695,8 @@ static int dce_v8_0_sw_init(void *handle)
adev_to_drm(adev)->mode_config.preferred_depth = 24;
adev_to_drm(adev)->mode_config.prefer_shadow = 1;
+ adev_to_drm(adev)->mode_config.fb_modifiers_not_supported = true;
+
adev_to_drm(adev)->mode_config.fb_base = adev->gmc.aper_base;
r = amdgpu_display_modeset_create_props(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index e7add2020d48..3dcd82b49481 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -48,6 +48,8 @@
#include "athub_v2_0.h"
#include "athub_v2_1.h"
+#include "amdgpu_reset.h"
+
#if 0
static const struct soc15_reg_golden golden_settings_navi10_hdp[] =
{
@@ -328,7 +330,7 @@ static void gmc_v10_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
*/
if (adev->gfx.kiq.ring.sched.ready &&
(amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) &&
- down_read_trylock(&adev->reset_sem)) {
+ down_read_trylock(&adev->reset_domain->sem)) {
struct amdgpu_vmhub *hub = &adev->vmhub[vmhub];
const unsigned eng = 17;
u32 inv_req = hub->vmhub_funcs->get_invalidate_req(vmid, flush_type);
@@ -338,7 +340,7 @@ static void gmc_v10_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, inv_req,
1 << vmid);
- up_read(&adev->reset_sem);
+ up_read(&adev->reset_domain->sem);
return;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 412e44af1608..df35f0252eea 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -62,6 +62,8 @@
#include "amdgpu_ras.h"
#include "amdgpu_xgmi.h"
+#include "amdgpu_reset.h"
+
/* add these here since we already include dce12 headers and these are for DCN */
#define mmHUBP0_DCSURF_PRI_VIEWPORT_DIMENSION 0x055d
#define mmHUBP0_DCSURF_PRI_VIEWPORT_DIMENSION_BASE_IDX 2
@@ -787,13 +789,13 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
*/
if (adev->gfx.kiq.ring.sched.ready &&
(amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) &&
- down_read_trylock(&adev->reset_sem)) {
+ down_read_trylock(&adev->reset_domain->sem)) {
uint32_t req = hub->vm_inv_eng0_req + hub->eng_distance * eng;
uint32_t ack = hub->vm_inv_eng0_ack + hub->eng_distance * eng;
amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, inv_req,
1 << vmid);
- up_read(&adev->reset_sem);
+ up_read(&adev->reset_domain->sem);
return;
}
@@ -900,7 +902,7 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
if (amdgpu_in_reset(adev))
return -EIO;
- if (ring->sched.ready && down_read_trylock(&adev->reset_sem)) {
+ if (ring->sched.ready && down_read_trylock(&adev->reset_domain->sem)) {
/* Vega20+XGMI caches PTEs in TC and TLB. Add a
* heavy-weight TLB flush (type 2), which flushes
* both. Due to a race condition with concurrent
@@ -927,7 +929,7 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
if (r) {
amdgpu_ring_undo(ring);
spin_unlock(&adev->gfx.kiq.ring_lock);
- up_read(&adev->reset_sem);
+ up_read(&adev->reset_domain->sem);
return -ETIME;
}
@@ -936,10 +938,10 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
if (r < 1) {
dev_err(adev->dev, "wait for kiq fence error: %ld.\n", r);
- up_read(&adev->reset_sem);
+ up_read(&adev->reset_domain->sem);
return -ETIME;
}
- up_read(&adev->reset_sem);
+ up_read(&adev->reset_domain->sem);
return 0;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
index 56da5ab82987..b81acf59870c 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -32,6 +32,8 @@
#include "soc15_common.h"
#include "mxgpu_ai.h"
+#include "amdgpu_reset.h"
+
static void xgpu_ai_mailbox_send_ack(struct amdgpu_device *adev)
{
WREG8(AI_MAIBOX_CONTROL_RCV_OFFSET_BYTE, 2);
@@ -257,10 +259,10 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
* otherwise the mailbox msg will be ruined/reseted by
* the VF FLR.
*/
- if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
+ if (atomic_cmpxchg(&adev->reset_domain->in_gpu_reset, 0, 1) != 0)
return;
- down_write(&adev->reset_sem);
+ down_write(&adev->reset_domain->sem);
amdgpu_virt_fini_data_exchange(adev);
@@ -275,14 +277,14 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
} while (timeout > 1);
flr_done:
- atomic_set(&adev->in_gpu_reset, 0);
- up_write(&adev->reset_sem);
+ atomic_set(&adev->reset_domain->in_gpu_reset, 0);
+ up_write(&adev->reset_domain->sem);
/* Trigger recovery for world switch failure if no TDR */
if (amdgpu_device_should_recover_gpu(adev)
&& (!amdgpu_device_has_job_running(adev) ||
adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT))
- amdgpu_device_gpu_recover(adev, NULL);
+ amdgpu_device_gpu_recover_imp(adev, NULL);
}
static int xgpu_ai_set_mailbox_rcv_irq(struct amdgpu_device *adev,
@@ -307,8 +309,11 @@ static int xgpu_ai_mailbox_rcv_irq(struct amdgpu_device *adev,
switch (event) {
case IDH_FLR_NOTIFICATION:
- if (amdgpu_sriov_runtime(adev))
- schedule_work(&adev->virt.flr_work);
+ if (amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev))
+ WARN_ONCE(!amdgpu_reset_domain_schedule(adev->reset_domain,
+ &adev->virt.flr_work),
+ "Failed to queue work! at %s",
+ __func__);
break;
case IDH_QUERY_ALIVE:
xgpu_ai_mailbox_send_ack(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
index 477d0dde19c5..22c10b97ea81 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
@@ -31,6 +31,8 @@
#include "soc15_common.h"
#include "mxgpu_nv.h"
+#include "amdgpu_reset.h"
+
static void xgpu_nv_mailbox_send_ack(struct amdgpu_device *adev)
{
WREG8(NV_MAIBOX_CONTROL_RCV_OFFSET_BYTE, 2);
@@ -281,10 +283,10 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
* otherwise the mailbox msg will be ruined/reseted by
* the VF FLR.
*/
- if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
+ if (atomic_cmpxchg(&adev->reset_domain->in_gpu_reset, 0, 1) != 0)
return;
- down_write(&adev->reset_sem);
+ down_write(&adev->reset_domain->sem);
amdgpu_virt_fini_data_exchange(adev);
@@ -299,8 +301,8 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
} while (timeout > 1);
flr_done:
- atomic_set(&adev->in_gpu_reset, 0);
- up_write(&adev->reset_sem);
+ atomic_set(&adev->reset_domain->in_gpu_reset, 0);
+ up_write(&adev->reset_domain->sem);
/* Trigger recovery for world switch failure if no TDR */
if (amdgpu_device_should_recover_gpu(adev)
@@ -309,7 +311,7 @@ flr_done:
adev->gfx_timeout == MAX_SCHEDULE_TIMEOUT ||
adev->compute_timeout == MAX_SCHEDULE_TIMEOUT ||
adev->video_timeout == MAX_SCHEDULE_TIMEOUT))
- amdgpu_device_gpu_recover(adev, NULL);
+ amdgpu_device_gpu_recover_imp(adev, NULL);
}
static int xgpu_nv_set_mailbox_rcv_irq(struct amdgpu_device *adev,
@@ -337,8 +339,11 @@ static int xgpu_nv_mailbox_rcv_irq(struct amdgpu_device *adev,
switch (event) {
case IDH_FLR_NOTIFICATION:
- if (amdgpu_sriov_runtime(adev))
- schedule_work(&adev->virt.flr_work);
+ if (amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev))
+ WARN_ONCE(!amdgpu_reset_domain_schedule(adev->reset_domain,
+ &adev->virt.flr_work),
+ "Failed to queue work! at %s",
+ __func__);
break;
/* READY_TO_ACCESS_GPU is fetched by kernel polling, IRQ can ignore
* it byfar since that polling thread will handle it,
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
index a642c04cf17d..7b63d30b9b79 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
@@ -42,6 +42,8 @@
#include "smu/smu_7_1_3_d.h"
#include "mxgpu_vi.h"
+#include "amdgpu_reset.h"
+
/* VI golden setting */
static const u32 xgpu_fiji_mgcg_cgcg_init[] = {
mmRLC_CGTT_MGCG_OVERRIDE, 0xffffffff, 0xffffffff,
@@ -521,7 +523,7 @@ static void xgpu_vi_mailbox_flr_work(struct work_struct *work)
/* Trigger recovery due to world switch failure */
if (amdgpu_device_should_recover_gpu(adev))
- amdgpu_device_gpu_recover(adev, NULL);
+ amdgpu_device_gpu_recover_imp(adev, NULL);
}
static int xgpu_vi_set_mailbox_rcv_irq(struct amdgpu_device *adev,
@@ -550,8 +552,11 @@ static int xgpu_vi_mailbox_rcv_irq(struct amdgpu_device *adev,
r = xgpu_vi_mailbox_rcv_msg(adev, IDH_FLR_NOTIFICATION);
/* only handle FLR_NOTIFY now */
- if (!r)
- schedule_work(&adev->virt.flr_work);
+ if (!r && !amdgpu_in_reset(adev))
+ WARN_ONCE(!amdgpu_reset_domain_schedule(adev->reset_domain,
+ &adev->virt.flr_work),
+ "Failed to queue work! at %s",
+ __func__);
}
return 0;