aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_device.c129
1 files changed, 86 insertions, 43 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 7ff3a28fc903..4f8fb4ecde34 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -30,8 +30,8 @@
#include <linux/console.h>
#include <linux/slab.h>
#include <drm/drmP.h>
-#include <drm/drm_crtc_helper.h>
#include <drm/drm_atomic_helper.h>
+#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
@@ -1645,7 +1645,7 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
if (r) {
DRM_ERROR("sw_init of IP block <%s> failed %d\n",
adev->ip_blocks[i].version->funcs->name, r);
- return r;
+ goto init_failed;
}
adev->ip_blocks[i].status.sw = true;
@@ -1654,17 +1654,17 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
r = amdgpu_device_vram_scratch_init(adev);
if (r) {
DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
- return r;
+ goto init_failed;
}
r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
if (r) {
DRM_ERROR("hw_init %d failed %d\n", i, r);
- return r;
+ goto init_failed;
}
r = amdgpu_device_wb_init(adev);
if (r) {
DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
- return r;
+ goto init_failed;
}
adev->ip_blocks[i].status.hw = true;
@@ -1675,7 +1675,7 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
AMDGPU_CSA_SIZE);
if (r) {
DRM_ERROR("allocate CSA failed %d\n", r);
- return r;
+ goto init_failed;
}
}
}
@@ -1683,30 +1683,32 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
if (r)
- return r;
+ goto init_failed;
r = amdgpu_device_ip_hw_init_phase1(adev);
if (r)
- return r;
+ goto init_failed;
r = amdgpu_device_fw_loading(adev);
if (r)
- return r;
+ goto init_failed;
r = amdgpu_device_ip_hw_init_phase2(adev);
if (r)
- return r;
+ goto init_failed;
if (adev->gmc.xgmi.num_physical_nodes > 1)
amdgpu_xgmi_add_device(adev);
amdgpu_amdkfd_device_init(adev);
+init_failed:
if (amdgpu_sriov_vf(adev)) {
- amdgpu_virt_init_data_exchange(adev);
+ if (!r)
+ amdgpu_virt_init_data_exchange(adev);
amdgpu_virt_release_full_gpu(adev, true);
}
- return 0;
+ return r;
}
/**
@@ -2133,7 +2135,7 @@ static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
continue;
r = block->version->funcs->hw_init(adev);
- DRM_INFO("RE-INIT: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
+ DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
if (r)
return r;
}
@@ -2167,7 +2169,7 @@ static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
continue;
r = block->version->funcs->hw_init(adev);
- DRM_INFO("RE-INIT: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
+ DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
if (r)
return r;
}
@@ -2548,6 +2550,17 @@ int amdgpu_device_init(struct amdgpu_device *adev,
/* detect if we are with an SRIOV vbios */
amdgpu_device_detect_sriov_bios(adev);
+ /* check if we need to reset the asic
+ * E.g., driver was not cleanly unloaded previously, etc.
+ */
+ if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
+ r = amdgpu_asic_reset(adev);
+ if (r) {
+ dev_err(adev->dev, "asic reset on init failed\n");
+ goto failed;
+ }
+ }
+
/* Post card if necessary */
if (amdgpu_device_need_post(adev)) {
if (!adev->bios) {
@@ -2612,6 +2625,8 @@ fence_driver_init:
}
dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
+ if (amdgpu_virt_request_full_gpu(adev, false))
+ amdgpu_virt_release_full_gpu(adev, false);
goto failed;
}
@@ -2707,7 +2722,7 @@ void amdgpu_device_fini(struct amdgpu_device *adev)
amdgpu_irq_disable_all(adev);
if (adev->mode_info.mode_config_initialized){
if (!amdgpu_device_has_dc_support(adev))
- drm_crtc_force_disable_all(adev->ddev);
+ drm_helper_force_disable_all(adev->ddev);
else
drm_atomic_helper_shutdown(adev->ddev);
}
@@ -3298,17 +3313,15 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
if (!ring || !ring->sched.thread)
continue;
- kthread_park(ring->sched.thread);
-
- if (job && job->base.sched != &ring->sched)
- continue;
-
- drm_sched_hw_job_reset(&ring->sched, job ? &job->base : NULL);
+ drm_sched_stop(&ring->sched);
/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
amdgpu_fence_driver_force_completion(ring);
}
+ if(job)
+ drm_sched_increase_karma(&job->base);
+
if (!amdgpu_sriov_vf(adev)) {
@@ -3454,14 +3467,10 @@ static void amdgpu_device_post_asic_reset(struct amdgpu_device *adev,
if (!ring || !ring->sched.thread)
continue;
- /* only need recovery sched of the given job's ring
- * or all rings (in the case @job is NULL)
- * after above amdgpu_reset accomplished
- */
- if ((!job || job->base.sched == &ring->sched) && !adev->asic_reset_res)
- drm_sched_job_recovery(&ring->sched);
+ if (!adev->asic_reset_res)
+ drm_sched_resubmit_jobs(&ring->sched);
- kthread_unpark(ring->sched.thread);
+ drm_sched_start(&ring->sched, !adev->asic_reset_res);
}
if (!amdgpu_device_has_dc_support(adev)) {
@@ -3521,9 +3530,9 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
* by different nodes. No point also since the one node already executing
* reset will also reset all the other nodes in the hive.
*/
- hive = amdgpu_get_xgmi_hive(adev);
+ hive = amdgpu_get_xgmi_hive(adev, 0);
if (hive && adev->gmc.xgmi.num_physical_nodes > 1 &&
- !mutex_trylock(&hive->hive_lock))
+ !mutex_trylock(&hive->reset_lock))
return 0;
/* Start with adev pre asic reset first for soft reset check.*/
@@ -3602,13 +3611,45 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */
}
if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
- mutex_unlock(&hive->hive_lock);
+ mutex_unlock(&hive->reset_lock);
if (r)
dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
return r;
}
+static void amdgpu_device_get_min_pci_speed_width(struct amdgpu_device *adev,
+ enum pci_bus_speed *speed,
+ enum pcie_link_width *width)
+{
+ struct pci_dev *pdev = adev->pdev;
+ enum pci_bus_speed cur_speed;
+ enum pcie_link_width cur_width;
+
+ *speed = PCI_SPEED_UNKNOWN;
+ *width = PCIE_LNK_WIDTH_UNKNOWN;
+
+ while (pdev) {
+ cur_speed = pcie_get_speed_cap(pdev);
+ cur_width = pcie_get_width_cap(pdev);
+
+ if (cur_speed != PCI_SPEED_UNKNOWN) {
+ if (*speed == PCI_SPEED_UNKNOWN)
+ *speed = cur_speed;
+ else if (cur_speed < *speed)
+ *speed = cur_speed;
+ }
+
+ if (cur_width != PCIE_LNK_WIDTH_UNKNOWN) {
+ if (*width == PCIE_LNK_WIDTH_UNKNOWN)
+ *width = cur_width;
+ else if (cur_width < *width)
+ *width = cur_width;
+ }
+ pdev = pci_upstream_bridge(pdev);
+ }
+}
+
/**
* amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot
*
@@ -3621,8 +3662,8 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */
static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
{
struct pci_dev *pdev;
- enum pci_bus_speed speed_cap;
- enum pcie_link_width link_width;
+ enum pci_bus_speed speed_cap, platform_speed_cap;
+ enum pcie_link_width platform_link_width;
if (amdgpu_pcie_gen_cap)
adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
@@ -3639,6 +3680,12 @@ static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
return;
}
+ if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
+ return;
+
+ amdgpu_device_get_min_pci_speed_width(adev, &platform_speed_cap,
+ &platform_link_width);
+
if (adev->pm.pcie_gen_mask == 0) {
/* asic caps */
pdev = adev->pdev;
@@ -3664,22 +3711,20 @@ static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
}
/* platform caps */
- pdev = adev->ddev->pdev->bus->self;
- speed_cap = pcie_get_speed_cap(pdev);
- if (speed_cap == PCI_SPEED_UNKNOWN) {
+ if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
} else {
- if (speed_cap == PCIE_SPEED_16_0GT)
+ if (platform_speed_cap == PCIE_SPEED_16_0GT)
adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
- else if (speed_cap == PCIE_SPEED_8_0GT)
+ else if (platform_speed_cap == PCIE_SPEED_8_0GT)
adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
- else if (speed_cap == PCIE_SPEED_5_0GT)
+ else if (platform_speed_cap == PCIE_SPEED_5_0GT)
adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
else
@@ -3688,12 +3733,10 @@ static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
}
}
if (adev->pm.pcie_mlw_mask == 0) {
- pdev = adev->ddev->pdev->bus->self;
- link_width = pcie_get_width_cap(pdev);
- if (link_width == PCIE_LNK_WIDTH_UNKNOWN) {
+ if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
} else {
- switch (link_width) {
+ switch (platform_link_width) {
case PCIE_LNK_X32:
adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |