diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c')
| -rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 413 |
1 files changed, 377 insertions, 36 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index 407dd16cc35c..61d13d8b7b20 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -24,7 +24,9 @@ #include <linux/list.h> #include "amdgpu.h" #include "amdgpu_xgmi.h" - +#include "amdgpu_smu.h" +#include "amdgpu_ras.h" +#include "df/df_3_6_offset.h" static DEFINE_MUTEX(xgmi_mutex); @@ -34,12 +36,194 @@ static DEFINE_MUTEX(xgmi_mutex); static struct amdgpu_hive_info xgmi_hives[AMDGPU_MAX_XGMI_HIVE]; static unsigned hive_count = 0; - void *amdgpu_xgmi_hive_try_lock(struct amdgpu_hive_info *hive) { return &hive->device_list; } +/** + * DOC: AMDGPU XGMI Support + * + * XGMI is a high speed interconnect that joins multiple GPU cards + * into a homogeneous memory space that is organized by a collective + * hive ID and individual node IDs, both of which are 64-bit numbers. + * + * The file xgmi_device_id contains the unique per GPU device ID and + * is stored in the /sys/class/drm/card${cardno}/device/ directory. + * + * Inside the device directory a sub-directory 'xgmi_hive_info' is + * created which contains the hive ID and the list of nodes. + * + * The hive ID is stored in: + * /sys/class/drm/card${cardno}/device/xgmi_hive_info/xgmi_hive_id + * + * The node information is stored in numbered directories: + * /sys/class/drm/card${cardno}/device/xgmi_hive_info/node${nodeno}/xgmi_device_id + * + * Each device has their own xgmi_hive_info direction with a mirror + * set of node sub-directories. + * + * The XGMI memory space is built by contiguously adding the power of + * two padded VRAM space from each node to each other. + * + */ + + +static ssize_t amdgpu_xgmi_show_hive_id(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct amdgpu_hive_info *hive = + container_of(attr, struct amdgpu_hive_info, dev_attr); + + return snprintf(buf, PAGE_SIZE, "%llu\n", hive->hive_id); +} + +static int amdgpu_xgmi_sysfs_create(struct amdgpu_device *adev, + struct amdgpu_hive_info *hive) +{ + int ret = 0; + + if (WARN_ON(hive->kobj)) + return -EINVAL; + + hive->kobj = kobject_create_and_add("xgmi_hive_info", &adev->dev->kobj); + if (!hive->kobj) { + dev_err(adev->dev, "XGMI: Failed to allocate sysfs entry!\n"); + return -EINVAL; + } + + hive->dev_attr = (struct device_attribute) { + .attr = { + .name = "xgmi_hive_id", + .mode = S_IRUGO, + + }, + .show = amdgpu_xgmi_show_hive_id, + }; + + ret = sysfs_create_file(hive->kobj, &hive->dev_attr.attr); + if (ret) { + dev_err(adev->dev, "XGMI: Failed to create device file xgmi_hive_id\n"); + kobject_del(hive->kobj); + kobject_put(hive->kobj); + hive->kobj = NULL; + } + + return ret; +} + +static void amdgpu_xgmi_sysfs_destroy(struct amdgpu_device *adev, + struct amdgpu_hive_info *hive) +{ + sysfs_remove_file(hive->kobj, &hive->dev_attr.attr); + kobject_del(hive->kobj); + kobject_put(hive->kobj); + hive->kobj = NULL; +} + +static ssize_t amdgpu_xgmi_show_device_id(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct drm_device *ddev = dev_get_drvdata(dev); + struct amdgpu_device *adev = ddev->dev_private; + + return snprintf(buf, PAGE_SIZE, "%llu\n", adev->gmc.xgmi.node_id); + +} + +#define AMDGPU_XGMI_SET_FICAA(o) ((o) | 0x456801) +static ssize_t amdgpu_xgmi_show_error(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct drm_device *ddev = dev_get_drvdata(dev); + struct amdgpu_device *adev = ddev->dev_private; + uint32_t ficaa_pie_ctl_in, ficaa_pie_status_in; + uint64_t fica_out; + unsigned int error_count = 0; + + ficaa_pie_ctl_in = AMDGPU_XGMI_SET_FICAA(0x200); + ficaa_pie_status_in = AMDGPU_XGMI_SET_FICAA(0x208); + + fica_out = adev->df_funcs->get_fica(adev, ficaa_pie_ctl_in); + if (fica_out != 0x1f) + pr_err("xGMI error counters not enabled!\n"); + + fica_out = adev->df_funcs->get_fica(adev, ficaa_pie_status_in); + + if ((fica_out & 0xffff) == 2) + error_count = ((fica_out >> 62) & 0x1) + (fica_out >> 63); + + adev->df_funcs->set_fica(adev, ficaa_pie_status_in, 0, 0); + + return snprintf(buf, PAGE_SIZE, "%d\n", error_count); +} + + +static DEVICE_ATTR(xgmi_device_id, S_IRUGO, amdgpu_xgmi_show_device_id, NULL); +static DEVICE_ATTR(xgmi_error, S_IRUGO, amdgpu_xgmi_show_error, NULL); + +static int amdgpu_xgmi_sysfs_add_dev_info(struct amdgpu_device *adev, + struct amdgpu_hive_info *hive) +{ + int ret = 0; + char node[10] = { 0 }; + + /* Create xgmi device id file */ + ret = device_create_file(adev->dev, &dev_attr_xgmi_device_id); + if (ret) { + dev_err(adev->dev, "XGMI: Failed to create device file xgmi_device_id\n"); + return ret; + } + + /* Create xgmi error file */ + ret = device_create_file(adev->dev, &dev_attr_xgmi_error); + if (ret) + pr_err("failed to create xgmi_error\n"); + + + /* Create sysfs link to hive info folder on the first device */ + if (adev != hive->adev) { + ret = sysfs_create_link(&adev->dev->kobj, hive->kobj, + "xgmi_hive_info"); + if (ret) { + dev_err(adev->dev, "XGMI: Failed to create link to hive info"); + goto remove_file; + } + } + + sprintf(node, "node%d", hive->number_devices); + /* Create sysfs link form the hive folder to yourself */ + ret = sysfs_create_link(hive->kobj, &adev->dev->kobj, node); + if (ret) { + dev_err(adev->dev, "XGMI: Failed to create link from hive info"); + goto remove_link; + } + + goto success; + + +remove_link: + sysfs_remove_link(&adev->dev->kobj, adev->ddev->unique); + +remove_file: + device_remove_file(adev->dev, &dev_attr_xgmi_device_id); + +success: + return ret; +} + +static void amdgpu_xgmi_sysfs_rem_dev_info(struct amdgpu_device *adev, + struct amdgpu_hive_info *hive) +{ + device_remove_file(adev->dev, &dev_attr_xgmi_device_id); + sysfs_remove_link(&adev->dev->kobj, adev->ddev->unique); + sysfs_remove_link(hive->kobj, adev->ddev->unique); +} + + + struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock) { int i; @@ -66,18 +250,83 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lo /* initialize new hive if not exist */ tmp = &xgmi_hives[hive_count++]; + + if (amdgpu_xgmi_sysfs_create(adev, tmp)) { + mutex_unlock(&xgmi_mutex); + return NULL; + } + + tmp->adev = adev; tmp->hive_id = adev->gmc.xgmi.hive_id; INIT_LIST_HEAD(&tmp->device_list); mutex_init(&tmp->hive_lock); mutex_init(&tmp->reset_lock); + if (lock) mutex_lock(&tmp->hive_lock); - + tmp->pstate = -1; mutex_unlock(&xgmi_mutex); return tmp; } +int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate) +{ + int ret = 0; + struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0); + struct amdgpu_device *tmp_adev; + bool update_hive_pstate = true; + bool is_high_pstate = pstate && adev->asic_type == CHIP_VEGA20; + + if (!hive) + return 0; + + mutex_lock(&hive->hive_lock); + + if (hive->pstate == pstate) { + adev->pstate = is_high_pstate ? pstate : adev->pstate; + goto out; + } + + dev_dbg(adev->dev, "Set xgmi pstate %d.\n", pstate); + + if (is_support_sw_smu_xgmi(adev)) + ret = smu_set_xgmi_pstate(&adev->smu, pstate); + else if (adev->powerplay.pp_funcs && + adev->powerplay.pp_funcs->set_xgmi_pstate) + ret = adev->powerplay.pp_funcs->set_xgmi_pstate(adev->powerplay.pp_handle, + pstate); + + if (ret) { + dev_err(adev->dev, + "XGMI: Set pstate failure on device %llx, hive %llx, ret %d", + adev->gmc.xgmi.node_id, + adev->gmc.xgmi.hive_id, ret); + goto out; + } + + /* Update device pstate */ + adev->pstate = pstate; + + /* + * Update the hive pstate only all devices of the hive + * are in the same pstate + */ + list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { + if (tmp_adev->pstate != adev->pstate) { + update_hive_pstate = false; + break; + } + } + if (update_hive_pstate || is_high_pstate) + hive->pstate = pstate; + +out: + mutex_unlock(&hive->hive_lock); + + return ret; +} + int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev) { int ret = -EINVAL; @@ -85,7 +334,7 @@ int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_dev /* Each psp need to set the latest topology */ ret = psp_xgmi_set_topology_info(&adev->psp, hive->number_devices, - &hive->topology_info); + &adev->psp.xgmi_context.top_info); if (ret) dev_err(adev->dev, "XGMI: Set topology failure on device %llx, hive %llx, ret %d", @@ -95,30 +344,48 @@ int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_dev return ret; } + +int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev, + struct amdgpu_device *peer_adev) +{ + struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info; + int i; + + for (i = 0 ; i < top->num_nodes; ++i) + if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id) + return top->nodes[i].num_hops; + return -EINVAL; +} + int amdgpu_xgmi_add_device(struct amdgpu_device *adev) { - struct psp_xgmi_topology_info *hive_topology; + struct psp_xgmi_topology_info *top_info; struct amdgpu_hive_info *hive; struct amdgpu_xgmi *entry; struct amdgpu_device *tmp_adev = NULL; - int count = 0, ret = -EINVAL; + int count = 0, ret = 0; if (!adev->gmc.xgmi.supported) return 0; - ret = psp_xgmi_get_node_id(&adev->psp, &adev->gmc.xgmi.node_id); - if (ret) { - dev_err(adev->dev, - "XGMI: Failed to get node id\n"); - return ret; - } + if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) { + ret = psp_xgmi_get_hive_id(&adev->psp, &adev->gmc.xgmi.hive_id); + if (ret) { + dev_err(adev->dev, + "XGMI: Failed to get hive id\n"); + return ret; + } - ret = psp_xgmi_get_hive_id(&adev->psp, &adev->gmc.xgmi.hive_id); - if (ret) { - dev_err(adev->dev, - "XGMI: Failed to get hive id\n"); - return ret; + ret = psp_xgmi_get_node_id(&adev->psp, &adev->gmc.xgmi.node_id); + if (ret) { + dev_err(adev->dev, + "XGMI: Failed to get node id\n"); + return ret; + } + } else { + adev->gmc.xgmi.hive_id = 16; + adev->gmc.xgmi.node_id = adev->gmc.xgmi.physical_node_id + 16; } hive = amdgpu_get_xgmi_hive(adev, 1); @@ -130,37 +397,60 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev) goto exit; } - hive_topology = &hive->topology_info; + /* Set default device pstate */ + adev->pstate = -1; + + top_info = &adev->psp.xgmi_context.top_info; list_add_tail(&adev->gmc.xgmi.head, &hive->device_list); list_for_each_entry(entry, &hive->device_list, head) - hive_topology->nodes[count++].node_id = entry->node_id; + top_info->nodes[count++].node_id = entry->node_id; + top_info->num_nodes = count; hive->number_devices = count; - /* Each psp need to get the latest topology */ - list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { - ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count, hive_topology); - if (ret) { - dev_err(tmp_adev->dev, - "XGMI: Get topology failure on device %llx, hive %llx, ret %d", - tmp_adev->gmc.xgmi.node_id, - tmp_adev->gmc.xgmi.hive_id, ret); - /* To do : continue with some node failed or disable the whole hive */ - break; + if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) { + list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { + /* update node list for other device in the hive */ + if (tmp_adev != adev) { + top_info = &tmp_adev->psp.xgmi_context.top_info; + top_info->nodes[count - 1].node_id = + adev->gmc.xgmi.node_id; + top_info->num_nodes = count; + } + ret = amdgpu_xgmi_update_topology(hive, tmp_adev); + if (ret) + goto exit; } - } - list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { - ret = amdgpu_xgmi_update_topology(hive, tmp_adev); - if (ret) - break; + /* get latest topology info for each device from psp */ + list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { + ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count, + &tmp_adev->psp.xgmi_context.top_info); + if (ret) { + dev_err(tmp_adev->dev, + "XGMI: Get topology failure on device %llx, hive %llx, ret %d", + tmp_adev->gmc.xgmi.node_id, + tmp_adev->gmc.xgmi.hive_id, ret); + /* To do : continue with some node failed or disable the whole hive */ + goto exit; + } + } } - dev_info(adev->dev, "XGMI: Add node %d, hive 0x%llx.\n", - adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id); + if (!ret) + ret = amdgpu_xgmi_sysfs_add_dev_info(adev, hive); + mutex_unlock(&hive->hive_lock); exit: + if (!ret) + dev_info(adev->dev, "XGMI: Add node %d, hive 0x%llx.\n", + adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id); + else + dev_err(adev->dev, "XGMI: Failed to add node %d, hive 0x%llx ret: %d\n", + adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id, + ret); + return ret; } @@ -176,9 +466,60 @@ void amdgpu_xgmi_remove_device(struct amdgpu_device *adev) return; if (!(hive->number_devices--)) { + amdgpu_xgmi_sysfs_destroy(adev, hive); mutex_destroy(&hive->hive_lock); mutex_destroy(&hive->reset_lock); } else { + amdgpu_xgmi_sysfs_rem_dev_info(adev, hive); mutex_unlock(&hive->hive_lock); } } + +int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev) +{ + int r; + struct ras_ih_if ih_info = { + .cb = NULL, + }; + struct ras_fs_if fs_info = { + .sysfs_name = "xgmi_wafl_err_count", + .debugfs_name = "xgmi_wafl_err_inject", + }; + + if (!adev->gmc.xgmi.supported || + adev->gmc.xgmi.num_physical_nodes == 0) + return 0; + + if (!adev->gmc.xgmi.ras_if) { + adev->gmc.xgmi.ras_if = kmalloc(sizeof(struct ras_common_if), GFP_KERNEL); + if (!adev->gmc.xgmi.ras_if) + return -ENOMEM; + adev->gmc.xgmi.ras_if->block = AMDGPU_RAS_BLOCK__XGMI_WAFL; + adev->gmc.xgmi.ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE; + adev->gmc.xgmi.ras_if->sub_block_index = 0; + strcpy(adev->gmc.xgmi.ras_if->name, "xgmi_wafl"); + } + ih_info.head = fs_info.head = *adev->gmc.xgmi.ras_if; + r = amdgpu_ras_late_init(adev, adev->gmc.xgmi.ras_if, + &fs_info, &ih_info); + if (r || !amdgpu_ras_is_supported(adev, adev->gmc.xgmi.ras_if->block)) { + kfree(adev->gmc.xgmi.ras_if); + adev->gmc.xgmi.ras_if = NULL; + } + + return r; +} + +void amdgpu_xgmi_ras_fini(struct amdgpu_device *adev) +{ + if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL) && + adev->gmc.xgmi.ras_if) { + struct ras_common_if *ras_if = adev->gmc.xgmi.ras_if; + struct ras_ih_if ih_info = { + .cb = NULL, + }; + + amdgpu_ras_late_fini(adev, ras_if, &ih_info); + kfree(ras_if); + } +} |
