aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c394
1 files changed, 313 insertions, 81 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 96a8fd0ca1df..08133de21fdd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -35,7 +35,11 @@
#include "amdgpu_xgmi.h"
#include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
#include "atom.h"
+#ifdef CONFIG_X86_MCE_AMD
+#include <asm/mce.h>
+static bool notifier_registered;
+#endif
static const char *RAS_FS_NAME = "ras";
const char *ras_error_string[] = {
@@ -61,8 +65,30 @@ const char *ras_block_string[] = {
"mp0",
"mp1",
"fuse",
+ "mca",
};
+const char *ras_mca_block_string[] = {
+ "mca_mp0",
+ "mca_mp1",
+ "mca_mpio",
+ "mca_iohc",
+};
+
+const char *get_ras_block_str(struct ras_common_if *ras_block)
+{
+ if (!ras_block)
+ return "NULL";
+
+ if (ras_block->block >= AMDGPU_RAS_BLOCK_COUNT)
+ return "OUT OF RANGE";
+
+ if (ras_block->block == AMDGPU_RAS_BLOCK__MCA)
+ return ras_mca_block_string[ras_block->sub_block_index];
+
+ return ras_block_string[ras_block->block];
+}
+
#define ras_err_str(i) (ras_error_string[ffs(i)])
#define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS)
@@ -85,6 +111,14 @@ static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
uint64_t addr);
static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
uint64_t addr);
+#ifdef CONFIG_X86_MCE_AMD
+static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev);
+struct mce_notifier_adev_list {
+ struct amdgpu_device *devs[MAX_GPU_INSTANCE];
+ int num_gpu;
+};
+static struct mce_notifier_adev_list mce_adev_list;
+#endif
void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready)
{
@@ -187,7 +221,7 @@ static int amdgpu_ras_find_block_id_by_name(const char *name, int *block_id)
for (i = 0; i < ARRAY_SIZE(ras_block_string); i++) {
*block_id = i;
- if (strcmp(name, ras_block_str(i)) == 0)
+ if (strcmp(name, ras_block_string[i]) == 0)
return 0;
}
return -EINVAL;
@@ -509,7 +543,6 @@ static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
if (amdgpu_ras_query_error_status(obj->adev, &info))
return -EINVAL;
-
if (obj->adev->asic_type == CHIP_ALDEBARAN) {
if (amdgpu_ras_reset_error_status(obj->adev, info.head.block))
DRM_WARN("Failed to reset error counter and error status");
@@ -529,7 +562,7 @@ static inline void put_obj(struct ras_manager *obj)
if (obj && (--obj->use == 0))
list_del(&obj->node);
if (obj && (obj->use < 0))
- DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", ras_block_str(obj->head.block));
+ DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", get_ras_block_str(&obj->head));
}
/* make one obj and return it. */
@@ -545,7 +578,14 @@ static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev,
if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
return NULL;
- obj = &con->objs[head->block];
+ if (head->block == AMDGPU_RAS_BLOCK__MCA) {
+ if (head->sub_block_index >= AMDGPU_RAS_MCA_BLOCK__LAST)
+ return NULL;
+
+ obj = &con->objs[AMDGPU_RAS_BLOCK__LAST + head->sub_block_index];
+ } else
+ obj = &con->objs[head->block];
+
/* already exist. return obj? */
if (alive_obj(obj))
return NULL;
@@ -573,19 +613,21 @@ struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
return NULL;
- obj = &con->objs[head->block];
+ if (head->block == AMDGPU_RAS_BLOCK__MCA) {
+ if (head->sub_block_index >= AMDGPU_RAS_MCA_BLOCK__LAST)
+ return NULL;
+
+ obj = &con->objs[AMDGPU_RAS_BLOCK__LAST + head->sub_block_index];
+ } else
+ obj = &con->objs[head->block];
- if (alive_obj(obj)) {
- WARN_ON(head->block != obj->head.block);
+ if (alive_obj(obj))
return obj;
- }
} else {
- for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT; i++) {
+ for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT + AMDGPU_RAS_MCA_BLOCK_COUNT; i++) {
obj = &con->objs[i];
- if (alive_obj(obj)) {
- WARN_ON(i != obj->head.block);
+ if (alive_obj(obj))
return obj;
- }
}
}
@@ -626,8 +668,6 @@ static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev,
*/
if (!amdgpu_ras_is_feature_allowed(adev, head))
return 0;
- if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head)))
- return 0;
if (enable) {
if (!obj) {
@@ -678,19 +718,14 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
/* Do not enable if it is not allowed. */
WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev, head));
- /* Are we alerady in that state we are going to set? */
- if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head))) {
- ret = 0;
- goto out;
- }
if (!amdgpu_ras_intr_triggered()) {
ret = psp_ras_enable_features(&adev->psp, info, enable);
if (ret) {
- dev_err(adev->dev, "ras %s %s failed %d\n",
+ dev_err(adev->dev, "ras %s %s failed poison:%d ret:%d\n",
enable ? "enable":"disable",
- ras_block_str(head->block),
- ret);
+ get_ras_block_str(head),
+ amdgpu_ras_is_poison_mode_supported(adev), ret);
goto out;
}
}
@@ -731,7 +766,7 @@ int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev,
if (!ret)
dev_info(adev->dev,
"RAS INFO: %s setup object\n",
- ras_block_str(head->block));
+ get_ras_block_str(head));
}
} else {
/* setup the object then issue a ras TA disable cmd.*/
@@ -781,17 +816,39 @@ static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev,
bool bypass)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
- int ras_block_count = AMDGPU_RAS_BLOCK_COUNT;
int i;
- const enum amdgpu_ras_error_type default_ras_type =
- AMDGPU_RAS_ERROR__NONE;
+ const enum amdgpu_ras_error_type default_ras_type = AMDGPU_RAS_ERROR__NONE;
- for (i = 0; i < ras_block_count; i++) {
+ for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT; i++) {
struct ras_common_if head = {
.block = i,
.type = default_ras_type,
.sub_block_index = 0,
};
+
+ if (i == AMDGPU_RAS_BLOCK__MCA)
+ continue;
+
+ if (bypass) {
+ /*
+ * bypass psp. vbios enable ras for us.
+ * so just create the obj
+ */
+ if (__amdgpu_ras_feature_enable(adev, &head, 1))
+ break;
+ } else {
+ if (amdgpu_ras_feature_enable(adev, &head, 1))
+ break;
+ }
+ }
+
+ for (i = 0; i < AMDGPU_RAS_MCA_BLOCK_COUNT; i++) {
+ struct ras_common_if head = {
+ .block = AMDGPU_RAS_BLOCK__MCA,
+ .type = default_ras_type,
+ .sub_block_index = i,
+ };
+
if (bypass) {
/*
* bypass psp. vbios enable ras for us.
@@ -809,6 +866,32 @@ static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev,
}
/* feature ctl end */
+
+void amdgpu_ras_mca_query_error_status(struct amdgpu_device *adev,
+ struct ras_common_if *ras_block,
+ struct ras_err_data *err_data)
+{
+ switch (ras_block->sub_block_index) {
+ case AMDGPU_RAS_MCA_BLOCK__MP0:
+ if (adev->mca.mp0.ras_funcs &&
+ adev->mca.mp0.ras_funcs->query_ras_error_count)
+ adev->mca.mp0.ras_funcs->query_ras_error_count(adev, &err_data);
+ break;
+ case AMDGPU_RAS_MCA_BLOCK__MP1:
+ if (adev->mca.mp1.ras_funcs &&
+ adev->mca.mp1.ras_funcs->query_ras_error_count)
+ adev->mca.mp1.ras_funcs->query_ras_error_count(adev, &err_data);
+ break;
+ case AMDGPU_RAS_MCA_BLOCK__MPIO:
+ if (adev->mca.mpio.ras_funcs &&
+ adev->mca.mpio.ras_funcs->query_ras_error_count)
+ adev->mca.mpio.ras_funcs->query_ras_error_count(adev, &err_data);
+ break;
+ default:
+ break;
+ }
+}
+
/* query/inject/cure begin */
int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
struct ras_query_if *info)
@@ -872,6 +955,9 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
adev->hdp.ras_funcs->query_ras_error_count)
adev->hdp.ras_funcs->query_ras_error_count(adev, &err_data);
break;
+ case AMDGPU_RAS_BLOCK__MCA:
+ amdgpu_ras_mca_query_error_status(adev, &info->head, &err_data);
+ break;
default:
break;
}
@@ -893,13 +979,13 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
adev->smuio.funcs->get_socket_id(adev),
adev->smuio.funcs->get_die_id(adev),
obj->err_data.ce_count,
- ras_block_str(info->head.block));
+ get_ras_block_str(&info->head));
} else {
dev_info(adev->dev, "%ld correctable hardware errors "
"detected in %s block, no user "
"action is needed.\n",
obj->err_data.ce_count,
- ras_block_str(info->head.block));
+ get_ras_block_str(&info->head));
}
}
if (err_data.ue_count) {
@@ -912,15 +998,18 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
adev->smuio.funcs->get_socket_id(adev),
adev->smuio.funcs->get_die_id(adev),
obj->err_data.ue_count,
- ras_block_str(info->head.block));
+ get_ras_block_str(&info->head));
} else {
dev_info(adev->dev, "%ld uncorrectable hardware errors "
"detected in %s block\n",
obj->err_data.ue_count,
- ras_block_str(info->head.block));
+ get_ras_block_str(&info->head));
}
}
+ if (!amdgpu_persistent_edc_harvesting_supported(adev))
+ amdgpu_ras_reset_error_status(adev, info->head.block);
+
return 0;
}
@@ -1027,6 +1116,7 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
case AMDGPU_RAS_BLOCK__SDMA:
case AMDGPU_RAS_BLOCK__MMHUB:
case AMDGPU_RAS_BLOCK__PCIE_BIF:
+ case AMDGPU_RAS_BLOCK__MCA:
ret = psp_ras_trigger_error(&adev->psp, &block_info);
break;
case AMDGPU_RAS_BLOCK__XGMI_WAFL:
@@ -1034,13 +1124,13 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
break;
default:
dev_info(adev->dev, "%s error injection is not supported yet\n",
- ras_block_str(info->head.block));
+ get_ras_block_str(&info->head));
ret = -EINVAL;
}
if (ret)
dev_err(adev->dev, "ras inject %s failed %d\n",
- ras_block_str(info->head.block), ret);
+ get_ras_block_str(&info->head), ret);
return ret;
}
@@ -1383,7 +1473,7 @@ void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev)
if (amdgpu_ras_is_supported(adev, obj->head.block) &&
(obj->attr_inuse == 1)) {
sprintf(fs_info.debugfs_name, "%s_err_inject",
- ras_block_str(obj->head.block));
+ get_ras_block_str(&obj->head));
fs_info.head = obj->head;
amdgpu_ras_debugfs_create(adev, &fs_info, dir);
}
@@ -1469,22 +1559,28 @@ static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
data->rptr = (data->aligned_element_size +
data->rptr) % data->ring_size;
- /* Let IP handle its data, maybe we need get the output
- * from the callback to udpate the error type/count, etc
- */
if (data->cb) {
- ret = data->cb(obj->adev, &err_data, &entry);
- /* ue will trigger an interrupt, and in that case
- * we need do a reset to recovery the whole system.
- * But leave IP do that recovery, here we just dispatch
- * the error.
- */
- if (ret == AMDGPU_RAS_SUCCESS) {
- /* these counts could be left as 0 if
- * some blocks do not count error number
+ if (amdgpu_ras_is_poison_mode_supported(obj->adev) &&
+ obj->head.block == AMDGPU_RAS_BLOCK__UMC)
+ dev_info(obj->adev->dev,
+ "Poison is created, no user action is needed.\n");
+ else {
+ /* Let IP handle its data, maybe we need get the output
+ * from the callback to udpate the error type/count, etc
+ */
+ ret = data->cb(obj->adev, &err_data, &entry);
+ /* ue will trigger an interrupt, and in that case
+ * we need do a reset to recovery the whole system.
+ * But leave IP do that recovery, here we just dispatch
+ * the error.
*/
- obj->err_data.ue_count += err_data.ue_count;
- obj->err_data.ce_count += err_data.ce_count;
+ if (ret == AMDGPU_RAS_SUCCESS) {
+ /* these counts could be left as 0 if
+ * some blocks do not count error number
+ */
+ obj->err_data.ue_count += err_data.ue_count;
+ obj->err_data.ce_count += err_data.ce_count;
+ }
}
}
}
@@ -2014,6 +2110,11 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
adev->smu.ppt_funcs->send_hbm_bad_pages_num(&adev->smu, con->eeprom_control.ras_num_recs);
}
+#ifdef CONFIG_X86_MCE_AMD
+ if ((adev->asic_type == CHIP_ALDEBARAN) &&
+ (adev->gmc.xgmi.connected_to_cpu))
+ amdgpu_register_bad_pages_mca_notifier(adev);
+#endif
return 0;
free:
@@ -2056,19 +2157,6 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
}
/* recovery end */
-/* return 0 if ras will reset gpu and repost.*/
-int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
- unsigned int block)
-{
- struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
-
- if (!ras)
- return -EINVAL;
-
- ras->flags |= AMDGPU_RAS_FLAG_INIT_NEED_RESET;
- return 0;
-}
-
static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev)
{
return adev->asic_type == CHIP_VEGA10 ||
@@ -2176,12 +2264,14 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
int r;
+ bool df_poison, umc_poison;
if (con)
return 0;
con = kmalloc(sizeof(struct amdgpu_ras) +
- sizeof(struct ras_manager) * AMDGPU_RAS_BLOCK_COUNT,
+ sizeof(struct ras_manager) * AMDGPU_RAS_BLOCK_COUNT +
+ sizeof(struct ras_manager) * AMDGPU_RAS_MCA_BLOCK_COUNT,
GFP_KERNEL|__GFP_ZERO);
if (!con)
return -ENOMEM;
@@ -2245,6 +2335,23 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
goto release_con;
}
+ /* Init poison supported flag, the default value is false */
+ if (adev->df.funcs &&
+ adev->df.funcs->query_ras_poison_mode &&
+ adev->umc.ras_funcs &&
+ adev->umc.ras_funcs->query_ras_poison_mode) {
+ df_poison =
+ adev->df.funcs->query_ras_poison_mode(adev);
+ umc_poison =
+ adev->umc.ras_funcs->query_ras_poison_mode(adev);
+ /* Only poison is set in both DF and UMC, we can support it */
+ if (df_poison && umc_poison)
+ con->poison_supported = true;
+ else if (df_poison != umc_poison)
+ dev_warn(adev->dev, "Poison setting is inconsistent in DF/UMC(%d:%d)!\n",
+ df_poison, umc_poison);
+ }
+
if (amdgpu_ras_fs_init(adev)) {
r = -EINVAL;
goto release_con;
@@ -2288,6 +2395,16 @@ static int amdgpu_persistent_edc_harvesting(struct amdgpu_device *adev,
return 0;
}
+bool amdgpu_ras_is_poison_mode_supported(struct amdgpu_device *adev)
+{
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+
+ if (!con)
+ return false;
+
+ return con->poison_supported;
+}
+
/* helper function to handle common stuff in ip late init phase */
int amdgpu_ras_late_init(struct amdgpu_device *adev,
struct ras_common_if *ras_block,
@@ -2306,12 +2423,7 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev,
r = amdgpu_ras_feature_enable_on_boot(adev, ras_block, 1);
if (r) {
- if (r == -EAGAIN) {
- /* request gpu reset. will run again */
- amdgpu_ras_request_reset_on_boot(adev,
- ras_block->block);
- return 0;
- } else if (adev->in_suspend || amdgpu_in_reset(adev)) {
+ if (adev->in_suspend || amdgpu_in_reset(adev)) {
/* in resume phase, if fail to enable ras,
* clean up all ras fs nodes, and disable ras */
goto cleanup;
@@ -2403,19 +2515,6 @@ void amdgpu_ras_resume(struct amdgpu_device *adev)
}
}
}
-
- if (con->flags & AMDGPU_RAS_FLAG_INIT_NEED_RESET) {
- con->flags &= ~AMDGPU_RAS_FLAG_INIT_NEED_RESET;
- /* setup ras obj state as disabled.
- * for init_by_vbios case.
- * if we want to enable ras, just enable it in a normal way.
- * If we want do disable it, need setup ras obj as enabled,
- * then issue another TA disable cmd.
- * See feature_enable_on_boot
- */
- amdgpu_ras_disable_all_features(adev, 1);
- amdgpu_ras_reset_gpu(adev);
- }
}
void amdgpu_ras_suspend(struct amdgpu_device *adev)
@@ -2507,3 +2606,136 @@ void amdgpu_release_ras_context(struct amdgpu_device *adev)
kfree(con);
}
}
+
+#ifdef CONFIG_X86_MCE_AMD
+static struct amdgpu_device *find_adev(uint32_t node_id)
+{
+ int i;
+ struct amdgpu_device *adev = NULL;
+
+ for (i = 0; i < mce_adev_list.num_gpu; i++) {
+ adev = mce_adev_list.devs[i];
+
+ if (adev && adev->gmc.xgmi.connected_to_cpu &&
+ adev->gmc.xgmi.physical_node_id == node_id)
+ break;
+ adev = NULL;
+ }
+
+ return adev;
+}
+
+#define GET_MCA_IPID_GPUID(m) (((m) >> 44) & 0xF)
+#define GET_UMC_INST(m) (((m) >> 21) & 0x7)
+#define GET_CHAN_INDEX(m) ((((m) >> 12) & 0x3) | (((m) >> 18) & 0x4))
+#define GPU_ID_OFFSET 8
+
+static int amdgpu_bad_page_notifier(struct notifier_block *nb,
+ unsigned long val, void *data)
+{
+ struct mce *m = (struct mce *)data;
+ struct amdgpu_device *adev = NULL;
+ uint32_t gpu_id = 0;
+ uint32_t umc_inst = 0;
+ uint32_t ch_inst, channel_index = 0;
+ struct ras_err_data err_data = {0, 0, 0, NULL};
+ struct eeprom_table_record err_rec;
+ uint64_t retired_page;
+
+ /*
+ * If the error was generated in UMC_V2, which belongs to GPU UMCs,
+ * and error occurred in DramECC (Extended error code = 0) then only
+ * process the error, else bail out.
+ */
+ if (!m || !((smca_get_bank_type(m->bank) == SMCA_UMC_V2) &&
+ (XEC(m->status, 0x3f) == 0x0)))
+ return NOTIFY_DONE;
+
+ /*
+ * If it is correctable error, return.
+ */
+ if (mce_is_correctable(m))
+ return NOTIFY_OK;
+
+ /*
+ * GPU Id is offset by GPU_ID_OFFSET in MCA_IPID_UMC register.
+ */
+ gpu_id = GET_MCA_IPID_GPUID(m->ipid) - GPU_ID_OFFSET;
+
+ adev = find_adev(gpu_id);
+ if (!adev) {
+ DRM_WARN("%s: Unable to find adev for gpu_id: %d\n", __func__,
+ gpu_id);
+ return NOTIFY_DONE;
+ }
+
+ /*
+ * If it is uncorrectable error, then find out UMC instance and
+ * channel index.
+ */
+ umc_inst = GET_UMC_INST(m->ipid);
+ ch_inst = GET_CHAN_INDEX(m->ipid);
+
+ dev_info(adev->dev, "Uncorrectable error detected in UMC inst: %d, chan_idx: %d",
+ umc_inst, ch_inst);
+
+ memset(&err_rec, 0x0, sizeof(struct eeprom_table_record));
+
+ /*
+ * Translate UMC channel address to Physical address
+ */
+ channel_index =
+ adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num
+ + ch_inst];
+
+ retired_page = ADDR_OF_8KB_BLOCK(m->addr) |
+ ADDR_OF_256B_BLOCK(channel_index) |
+ OFFSET_IN_256B_BLOCK(m->addr);
+
+ err_rec.address = m->addr;
+ err_rec.retired_page = retired_page >> AMDGPU_GPU_PAGE_SHIFT;
+ err_rec.ts = (uint64_t)ktime_get_real_seconds();
+ err_rec.err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE;
+ err_rec.cu = 0;
+ err_rec.mem_channel = channel_index;
+ err_rec.mcumc_id = umc_inst;
+
+ err_data.err_addr = &err_rec;
+ err_data.err_addr_cnt = 1;
+
+ if (amdgpu_bad_page_threshold != 0) {
+ amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
+ err_data.err_addr_cnt);
+ amdgpu_ras_save_bad_pages(adev);
+ }
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block amdgpu_bad_page_nb = {
+ .notifier_call = amdgpu_bad_page_notifier,
+ .priority = MCE_PRIO_UC,
+};
+
+static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev)
+{
+ /*
+ * Add the adev to the mce_adev_list.
+ * During mode2 reset, amdgpu device is temporarily
+ * removed from the mgpu_info list which can cause
+ * page retirement to fail.
+ * Use this list instead of mgpu_info to find the amdgpu
+ * device on which the UMC error was reported.
+ */
+ mce_adev_list.devs[mce_adev_list.num_gpu++] = adev;
+
+ /*
+ * Register the x86 notifier only once
+ * with MCE subsystem.
+ */
+ if (notifier_registered == false) {
+ mce_register_decode_chain(&amdgpu_bad_page_nb);
+ notifier_registered = true;
+ }
+}
+#endif