aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/drivers/gpu/drm/xe/xe_survivability_mode.c
blob: d939ce70e6fa82b414353045f3264eb895aab831 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
// SPDX-License-Identifier: MIT
/*
 * Copyright © 2025 Intel Corporation
 */

#include "xe_survivability_mode.h"
#include "xe_survivability_mode_types.h"

#include <linux/kobject.h>
#include <linux/pci.h>
#include <linux/sysfs.h>

#include "xe_device.h"
#include "xe_gt.h"
#include "xe_heci_gsc.h"
#include "xe_mmio.h"
#include "xe_pcode_api.h"
#include "xe_vsec.h"

#define MAX_SCRATCH_MMIO 8

/**
 * DOC: Xe Boot Survivability
 *
 * Boot Survivability is a software based workflow for recovering a system in a failed boot state
 * Here system recoverability is concerned with recovering the firmware responsible for boot.
 *
 * This is implemented by loading the driver with bare minimum (no drm card) to allow the firmware
 * to be flashed through mei and collect telemetry. The driver's probe flow is modified
 * such that it enters survivability mode when pcode initialization is incomplete and boot status
 * denotes a failure. The driver then  populates the survivability_mode PCI sysfs indicating
 * survivability mode and provides additional information required for debug
 *
 * KMD exposes below admin-only readable sysfs in survivability mode
 *
 * device/survivability_mode: The presence of this file indicates that the card is in survivability
 *			      mode. Also, provides additional information on why the driver entered
 *			      survivability mode.
 *
 *			      Capability Information - Provides boot status
 *			      Postcode Information   - Provides information about the failure
 *			      Overflow Information   - Provides history of previous failures
 *			      Auxiliary Information  - Certain failures may have information in
 *						       addition to postcode information
 */

static u32 aux_history_offset(u32 reg_value)
{
	return REG_FIELD_GET(AUXINFO_HISTORY_OFFSET, reg_value);
}

static void set_survivability_info(struct xe_mmio *mmio, struct xe_survivability_info *info,
				   int id, char *name)
{
	strscpy(info[id].name, name, sizeof(info[id].name));
	info[id].reg = PCODE_SCRATCH(id).raw;
	info[id].value = xe_mmio_read32(mmio, PCODE_SCRATCH(id));
}

static void populate_survivability_info(struct xe_device *xe)
{
	struct xe_survivability *survivability = &xe->survivability;
	struct xe_survivability_info *info = survivability->info;
	struct xe_mmio *mmio;
	u32 id = 0, reg_value;
	char name[NAME_MAX];
	int index;

	mmio = xe_root_tile_mmio(xe);
	set_survivability_info(mmio, info, id, "Capability Info");
	reg_value = info[id].value;

	if (reg_value & HISTORY_TRACKING) {
		id++;
		set_survivability_info(mmio, info, id, "Postcode Info");

		if (reg_value & OVERFLOW_SUPPORT) {
			id = REG_FIELD_GET(OVERFLOW_REG_OFFSET, reg_value);
			set_survivability_info(mmio, info, id, "Overflow Info");
		}
	}

	if (reg_value & AUXINFO_SUPPORT) {
		id = REG_FIELD_GET(AUXINFO_REG_OFFSET, reg_value);

		for (index = 0; id && reg_value; index++, reg_value = info[id].value,
		     id = aux_history_offset(reg_value)) {
			snprintf(name, NAME_MAX, "Auxiliary Info %d", index);
			set_survivability_info(mmio, info, id, name);
		}
	}
}

static void log_survivability_info(struct pci_dev *pdev)
{
	struct xe_device *xe = pdev_to_xe_device(pdev);
	struct xe_survivability *survivability = &xe->survivability;
	struct xe_survivability_info *info = survivability->info;
	int id;

	dev_info(&pdev->dev, "Survivability Boot Status : Critical Failure (%d)\n",
		 survivability->boot_status);
	for (id = 0; id < MAX_SCRATCH_MMIO; id++) {
		if (info[id].reg)
			dev_info(&pdev->dev, "%s: 0x%x - 0x%x\n", info[id].name,
				 info[id].reg, info[id].value);
	}
}

static ssize_t survivability_mode_show(struct device *dev,
				       struct device_attribute *attr, char *buff)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct xe_device *xe = pdev_to_xe_device(pdev);
	struct xe_survivability *survivability = &xe->survivability;
	struct xe_survivability_info *info = survivability->info;
	int index = 0, count = 0;

	for (index = 0; index < MAX_SCRATCH_MMIO; index++) {
		if (info[index].reg)
			count += sysfs_emit_at(buff, count, "%s: 0x%x - 0x%x\n", info[index].name,
					       info[index].reg, info[index].value);
	}

	return count;
}

static DEVICE_ATTR_ADMIN_RO(survivability_mode);

static void xe_survivability_mode_fini(void *arg)
{
	struct xe_device *xe = arg;
	struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
	struct device *dev = &pdev->dev;

	sysfs_remove_file(&dev->kobj, &dev_attr_survivability_mode.attr);
}

static int enable_survivability_mode(struct pci_dev *pdev)
{
	struct device *dev = &pdev->dev;
	struct xe_device *xe = pdev_to_xe_device(pdev);
	struct xe_survivability *survivability = &xe->survivability;
	int ret = 0;

	/* create survivability mode sysfs */
	ret = sysfs_create_file(&dev->kobj, &dev_attr_survivability_mode.attr);
	if (ret) {
		dev_warn(dev, "Failed to create survivability sysfs files\n");
		return ret;
	}

	ret = devm_add_action_or_reset(xe->drm.dev,
				       xe_survivability_mode_fini, xe);
	if (ret)
		return ret;

	ret = xe_heci_gsc_init(xe);
	if (ret)
		return ret;

	xe_vsec_init(xe);

	survivability->mode = true;
	dev_err(dev, "In Survivability Mode\n");

	return 0;
}

/**
 * xe_survivability_mode_is_enabled - check if survivability mode is enabled
 * @xe: xe device instance
 *
 * Returns true if in survivability mode, false otherwise
 */
bool xe_survivability_mode_is_enabled(struct xe_device *xe)
{
	return xe->survivability.mode;
}

/**
 * xe_survivability_mode_required - checks if survivability mode is required
 * @xe: xe device instance
 *
 * This function reads the boot status from Pcode
 *
 * Return: true if boot status indicates failure, false otherwise
 */
bool xe_survivability_mode_required(struct xe_device *xe)
{
	struct xe_survivability *survivability = &xe->survivability;
	struct xe_mmio *mmio = xe_root_tile_mmio(xe);
	u32 data;

	if (!IS_DGFX(xe) || xe->info.platform < XE_BATTLEMAGE || IS_SRIOV_VF(xe))
		return false;

	data = xe_mmio_read32(mmio, PCODE_SCRATCH(0));
	survivability->boot_status = REG_FIELD_GET(BOOT_STATUS, data);

	return survivability->boot_status == NON_CRITICAL_FAILURE ||
		survivability->boot_status == CRITICAL_FAILURE;
}

/**
 * xe_survivability_mode_enable - Initialize and enable the survivability mode
 * @xe: xe device instance
 *
 * Initialize survivability information and enable survivability mode
 *
 * Return: 0 for success, negative error code otherwise.
 */
int xe_survivability_mode_enable(struct xe_device *xe)
{
	struct xe_survivability *survivability = &xe->survivability;
	struct xe_survivability_info *info;
	struct pci_dev *pdev = to_pci_dev(xe->drm.dev);

	survivability->size = MAX_SCRATCH_MMIO;

	info = devm_kcalloc(xe->drm.dev, survivability->size, sizeof(*info),
			    GFP_KERNEL);
	if (!info)
		return -ENOMEM;

	survivability->info = info;

	populate_survivability_info(xe);

	/* Only log debug information and exit if it is a critical failure */
	if (survivability->boot_status == CRITICAL_FAILURE) {
		log_survivability_info(pdev);
		return -ENXIO;
	}

	return enable_survivability_mode(pdev);
}