aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
diff options
context:
space:
mode:
authorMukul Joshi <mukul.joshi@amd.com>2022-04-06 20:07:37 -0400
committerAlex Deucher <alexander.deucher@amd.com>2022-04-13 09:14:22 -0400
commit46d18d510d78318c4aa5aaeff66782f1ec42c2ec (patch)
treebdcd722661d52973c15ffee0ce00a51c5c1adc20 /drivers/gpu/drm/amd/amdkfd/kfd_topology.c
parentdrm/amd/amdgpu: Not request init data for MS_HYPERV with vega10 (diff)
downloadlinux-dev-46d18d510d78318c4aa5aaeff66782f1ec42c2ec.tar.xz
linux-dev-46d18d510d78318c4aa5aaeff66782f1ec42c2ec.zip
drm/amdkfd: Cleanup IO links during KFD device removal
Currently, the IO-links to the device being removed from topology, are not cleared. As a result, there would be dangling links left in the KFD topology. This patch aims to fix the following: 1. Cleanup all IO links to the device being removed. 2. Ensure that node numbering in sysfs and nodes proximity domain values are consistent after the device is removed: a. Adding a device and removing a GPU device are made mutually exclusive. b. The global proximity domain counter is no longer required to be an atomic counter. A normal 32-bit counter can be used instead. 3. Update generation_count to let user-mode know that topology has changed due to device removal. CC: Shuotao Xu <shuotaoxu@microsoft.com> Reviewed-by: Shuotao Xu <shuotaoxu@microsoft.com> Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com> Signed-off-by: Mukul Joshi <mukul.joshi@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd/kfd_topology.c')
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_topology.c83
1 files changed, 74 insertions, 9 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
index 3bdcae239bc0..8b7710b4d3ed 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
@@ -46,22 +46,32 @@ static struct list_head topology_device_list;
static struct kfd_system_properties sys_props;
static DECLARE_RWSEM(topology_lock);
-static atomic_t topology_crat_proximity_domain;
+static uint32_t topology_crat_proximity_domain;
-struct kfd_topology_device *kfd_topology_device_by_proximity_domain(
+struct kfd_topology_device *kfd_topology_device_by_proximity_domain_no_lock(
uint32_t proximity_domain)
{
struct kfd_topology_device *top_dev;
struct kfd_topology_device *device = NULL;
- down_read(&topology_lock);
-
list_for_each_entry(top_dev, &topology_device_list, list)
if (top_dev->proximity_domain == proximity_domain) {
device = top_dev;
break;
}
+ return device;
+}
+
+struct kfd_topology_device *kfd_topology_device_by_proximity_domain(
+ uint32_t proximity_domain)
+{
+ struct kfd_topology_device *device = NULL;
+
+ down_read(&topology_lock);
+
+ device = kfd_topology_device_by_proximity_domain_no_lock(
+ proximity_domain);
up_read(&topology_lock);
return device;
@@ -1060,7 +1070,7 @@ int kfd_topology_init(void)
down_write(&topology_lock);
kfd_topology_update_device_list(&temp_topology_device_list,
&topology_device_list);
- atomic_set(&topology_crat_proximity_domain, sys_props.num_devices-1);
+ topology_crat_proximity_domain = sys_props.num_devices-1;
ret = kfd_topology_update_sysfs();
up_write(&topology_lock);
@@ -1295,8 +1305,6 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
pr_debug("Adding new GPU (ID: 0x%x) to topology\n", gpu_id);
- proximity_domain = atomic_inc_return(&topology_crat_proximity_domain);
-
/* Include the CPU in xGMI hive if xGMI connected by assigning it the hive ID. */
if (gpu->hive_id && gpu->adev->gmc.xgmi.connected_to_cpu) {
struct kfd_topology_device *top_dev;
@@ -1321,12 +1329,16 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
*/
dev = kfd_assign_gpu(gpu);
if (!dev) {
+ down_write(&topology_lock);
+ proximity_domain = ++topology_crat_proximity_domain;
+
res = kfd_create_crat_image_virtual(&crat_image, &image_size,
COMPUTE_UNIT_GPU, gpu,
proximity_domain);
if (res) {
pr_err("Error creating VCRAT for GPU (ID: 0x%x)\n",
gpu_id);
+ topology_crat_proximity_domain--;
return res;
}
res = kfd_parse_crat_table(crat_image,
@@ -1335,10 +1347,10 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
if (res) {
pr_err("Error parsing VCRAT for GPU (ID: 0x%x)\n",
gpu_id);
+ topology_crat_proximity_domain--;
goto err;
}
- down_write(&topology_lock);
kfd_topology_update_device_list(&temp_topology_device_list,
&topology_device_list);
@@ -1485,25 +1497,78 @@ err:
return res;
}
+/**
+ * kfd_topology_update_io_links() - Update IO links after device removal.
+ * @proximity_domain: Proximity domain value of the dev being removed.
+ *
+ * The topology list currently is arranged in increasing order of
+ * proximity domain.
+ *
+ * Two things need to be done when a device is removed:
+ * 1. All the IO links to this device need to be removed.
+ * 2. All nodes after the current device node need to move
+ * up once this device node is removed from the topology
+ * list. As a result, the proximity domain values for
+ * all nodes after the node being deleted reduce by 1.
+ * This would also cause the proximity domain values for
+ * io links to be updated based on new proximity domain
+ * values.
+ *
+ * Context: The caller must hold write topology_lock.
+ */
+static void kfd_topology_update_io_links(int proximity_domain)
+{
+ struct kfd_topology_device *dev;
+ struct kfd_iolink_properties *iolink, *tmp;
+
+ list_for_each_entry(dev, &topology_device_list, list) {
+ if (dev->proximity_domain > proximity_domain)
+ dev->proximity_domain--;
+
+ list_for_each_entry_safe(iolink, tmp, &dev->io_link_props, list) {
+ /*
+ * If there is an io link to the dev being deleted
+ * then remove that IO link also.
+ */
+ if (iolink->node_to == proximity_domain) {
+ list_del(&iolink->list);
+ dev->io_link_count--;
+ dev->node_props.io_links_count--;
+ } else if (iolink->node_from > proximity_domain) {
+ iolink->node_from--;
+ } else if (iolink->node_to > proximity_domain) {
+ iolink->node_to--;
+ }
+ }
+
+ }
+}
+
int kfd_topology_remove_device(struct kfd_dev *gpu)
{
struct kfd_topology_device *dev, *tmp;
uint32_t gpu_id;
int res = -ENODEV;
+ int i = 0;
down_write(&topology_lock);
- list_for_each_entry_safe(dev, tmp, &topology_device_list, list)
+ list_for_each_entry_safe(dev, tmp, &topology_device_list, list) {
if (dev->gpu == gpu) {
gpu_id = dev->gpu_id;
kfd_remove_sysfs_node_entry(dev);
kfd_release_topology_device(dev);
sys_props.num_devices--;
+ kfd_topology_update_io_links(i);
+ topology_crat_proximity_domain = sys_props.num_devices-1;
+ sys_props.generation_count++;
res = 0;
if (kfd_topology_update_sysfs() < 0)
kfd_topology_release_sysfs();
break;
}
+ i++;
+ }
up_write(&topology_lock);