about summary refs log tree commit diff stats
path: root/include/uapi/misc/habanalabs.h
diff options
context:
space:
mode:
Diffstat (limited to 'include/uapi/misc/habanalabs.h')
-rw-r--r-- include/uapi/misc/habanalabs.h 166
1 files changed, 139 insertions, 27 deletions
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index 00b309590499..371dfc4243b3 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -333,7 +333,18 @@ enum hl_server_type {
* HL_INFO_SYNC_MANAGER - Retrieve sync manager info per dcore
* HL_INFO_TOTAL_ENERGY - Retrieve total energy consumption
* HL_INFO_PLL_FREQUENCY - Retrieve PLL frequency
+ * HL_INFO_POWER - Retrieve power information
* HL_INFO_OPEN_STATS - Retrieve info regarding recent device open calls
+ * HL_INFO_DRAM_REPLACED_ROWS - Retrieve DRAM replaced rows info
+ * HL_INFO_DRAM_PENDING_ROWS - Retrieve DRAM pending rows num
+ * HL_INFO_LAST_ERR_OPEN_DEV_TIME - Retrieve timestamp of the last time the device was opened
+ * and CS timeout or razwi error occurred.
+ * HL_INFO_CS_TIMEOUT_EVENT - Retrieve CS timeout timestamp and its related CS sequence number.
+ * HL_INFO_RAZWI_EVENT - Retrieve parameters of razwi:
+ * Timestamp of razwi.
+ * The address whose access caused the razwi.
+ * Razwi initiator.
+ * Razwi cause: page fault or MMU access error.
*/
#define HL_INFO_HW_IP_INFO 0
#define HL_INFO_HW_EVENTS 1
@@ -353,8 +364,13 @@ enum hl_server_type {
#define HL_INFO_PLL_FREQUENCY 16
#define HL_INFO_POWER 17
#define HL_INFO_OPEN_STATS 18
+#define HL_INFO_DRAM_REPLACED_ROWS 21
+#define HL_INFO_DRAM_PENDING_ROWS 22
+#define HL_INFO_LAST_ERR_OPEN_DEV_TIME 23
+#define HL_INFO_CS_TIMEOUT_EVENT 24
+#define HL_INFO_RAZWI_EVENT 25
-#define HL_INFO_VERSION_MAX_LEN 128
+#define HL_INFO_VERSION_MAX_LEN 128
#define HL_INFO_CARD_NAME_MAX_LEN 16
/**
@@ -473,15 +489,27 @@ struct hl_info_pci_counters {
__u64 replay_cnt;
};
-#define HL_CLK_THROTTLE_POWER 0x1
-#define HL_CLK_THROTTLE_THERMAL 0x2
+enum hl_clk_throttling_type {
+ HL_CLK_THROTTLE_TYPE_POWER,
+ HL_CLK_THROTTLE_TYPE_THERMAL,
+ HL_CLK_THROTTLE_TYPE_MAX
+};
+
+/* clk_throttling_reason masks */
+#define HL_CLK_THROTTLE_POWER (1 << HL_CLK_THROTTLE_TYPE_POWER)
+#define HL_CLK_THROTTLE_THERMAL (1 << HL_CLK_THROTTLE_TYPE_THERMAL)
/**
* struct hl_info_clk_throttle - clock throttling reason
* @clk_throttling_reason: each bit represents a clk throttling reason
+ * @clk_throttling_timestamp_us: CPU timestamp, in microseconds, of the throttling start event
+ * @clk_throttling_duration_ns: the clock throttling duration in nanoseconds
*/
struct hl_info_clk_throttle {
__u32 clk_throttling_reason;
+ __u32 pad;
+ __u64 clk_throttling_timestamp_us[HL_CLK_THROTTLE_TYPE_MAX];
+ __u64 clk_throttling_duration_ns[HL_CLK_THROTTLE_TYPE_MAX];
};
/**
@@ -559,6 +587,51 @@ struct hl_info_cs_counters {
__u64 ctx_validation_drop_cnt;
};
+/**
+ * struct hl_info_last_err_open_dev_time - last error boot information.
+ * @timestamp: timestamp of last time the device was opened and error occurred.
+ */
+struct hl_info_last_err_open_dev_time {
+ __s64 timestamp;
+};
+
+/**
+ * struct hl_info_cs_timeout_event - last CS timeout information.
+ * @timestamp: timestamp when last CS timeout event occurred.
+ * @seq: sequence number of last CS timeout event.
+ */
+struct hl_info_cs_timeout_event {
+ __s64 timestamp;
+ __u64 seq;
+};
+
+#define HL_RAZWI_PAGE_FAULT 0
+#define HL_RAZWI_MMU_ACCESS_ERROR 1
+
+/**
+ * struct hl_info_razwi_event - razwi information.
+ * @timestamp: timestamp of razwi.
+ * @addr: address whose access caused the razwi.
+ * @engine_id_1: engine id of the razwi initiator. If it was initiated by an engine that does
+ * not have an engine id, it will be set to U16_MAX.
+ * @engine_id_2: second engine id of the razwi initiator. A razwi might have 2 possible engines,
+ * one of which caused the razwi. In that case, this field will contain the
+ * second possible engine id, otherwise it will be set to U16_MAX.
+ * @no_engine_id: if razwi initiator does not have engine id, this field will be set to 1,
+ * otherwise 0.
+ * @error_type: cause of razwi, page fault or access error, otherwise it will be set to U8_MAX.
+ * @pad: padding to 64 bit.
+ */
+struct hl_info_razwi_event {
+ __s64 timestamp;
+ __u64 addr;
+ __u16 engine_id_1;
+ __u16 engine_id_2;
+ __u8 no_engine_id;
+ __u8 error_type;
+ __u8 pad[2];
+};
+
enum gaudi_dcores {
HL_GAUDI_WS_DCORE,
HL_GAUDI_WN_DCORE,
@@ -607,7 +680,10 @@ struct hl_info_args {
#define HL_MAX_CB_SIZE (0x200000 - 32)
/* Indicates whether the command buffer should be mapped to the device's MMU */
-#define HL_CB_FLAGS_MAP 0x1
+#define HL_CB_FLAGS_MAP 0x1
+
+/* Used with HL_CB_OP_INFO opcode to get the device virtual address of a kernel-mapped CB */
+#define HL_CB_FLAGS_GET_DEVICE_VA 0x2
struct hl_cb_in {
/* Handle of CB or 0 if we want to create one */
@@ -629,11 +705,16 @@ struct hl_cb_out {
/* Handle of CB */
__u64 cb_handle;
- /* Information about CB */
- struct {
- /* Usage count of CB */
- __u32 usage_cnt;
- __u32 pad;
+ union {
+ /* Information about CB */
+ struct {
+ /* Usage count of CB */
+ __u32 usage_cnt;
+ __u32 pad;
+ };
+
+ /* Address at which the CB is mapped in the device MMU */
+ __u64 device_va;
};
};
};
@@ -856,9 +937,17 @@ struct hl_cs_out {
/*
* SOB base address offset
- * Valid only when HL_CS_FLAGS_RESERVE_SIGNALS_ONLY is set
+ * Valid only when HL_CS_FLAGS_RESERVE_SIGNALS_ONLY or HL_CS_FLAGS_SIGNAL is set
*/
__u32 sob_base_addr_offset;
+
+ /*
+ * Count of completed signals in SOB before current signal submission.
+ * Valid only when both HL_CS_FLAGS_ENCAP_SIGNALS and HL_CS_FLAGS_STAGED_SUBMISSION
+ * are set, or when HL_CS_FLAGS_SIGNAL is set
+ */
+ __u16 sob_count_before_submission;
+ __u16 pad[3];
};
union hl_cs_args {
@@ -866,9 +955,10 @@ union hl_cs_args {
struct hl_cs_out out;
};
-#define HL_WAIT_CS_FLAGS_INTERRUPT 0x2
-#define HL_WAIT_CS_FLAGS_INTERRUPT_MASK 0xFFF00000
-#define HL_WAIT_CS_FLAGS_MULTI_CS 0x4
+#define HL_WAIT_CS_FLAGS_INTERRUPT 0x2
+#define HL_WAIT_CS_FLAGS_INTERRUPT_MASK 0xFFF00000
+#define HL_WAIT_CS_FLAGS_MULTI_CS 0x4
+#define HL_WAIT_CS_FLAGS_INTERRUPT_KERNEL_CQ 0x10
#define HL_WAIT_MULTI_CS_LIST_MAX_LEN 32
@@ -888,14 +978,23 @@ struct hl_wait_cs_in {
};
struct {
- /* User address for completion comparison.
- * upon interrupt, driver will compare the value pointed
- * by this address with the supplied target value.
- * in order not to perform any comparison, set address
- * to all 1s.
- * Relevant only when HL_WAIT_CS_FLAGS_INTERRUPT is set
- */
- __u64 addr;
+ union {
+ /* User address for completion comparison.
+ * upon interrupt, driver will compare the value pointed
+ * by this address with the supplied target value.
+ * in order not to perform any comparison, set address
+ * to all 1s.
+ * Relevant only when HL_WAIT_CS_FLAGS_INTERRUPT is set
+ */
+ __u64 addr;
+
+ /* Handle to a kernel-mapped CB which contains
+ * CQ counters.
+ * Relevant only when HL_WAIT_CS_FLAGS_INTERRUPT_KERNEL_CQ is set
+ */
+ __u64 cq_counters_handle;
+ };
+
/* Target value for completion comparison */
__u64 target;
};
@@ -911,14 +1010,27 @@ struct hl_wait_cs_in {
*/
__u32 flags;
- /* Multi CS API info- valid entries in multi-CS array */
- __u8 seq_arr_len;
- __u8 pad[3];
+ union {
+ struct {
+ /* Multi CS API info - number of valid entries in the multi-CS array */
+ __u8 seq_arr_len;
+ __u8 pad[7];
+ };
+
+ /* Absolute timeout to wait for an interrupt in microseconds.
+ * Relevant only when HL_WAIT_CS_FLAGS_INTERRUPT is set
+ */
+ __u64 interrupt_timeout_us;
+ };
- /* Absolute timeout to wait for an interrupt in microseconds.
- * Relevant only when HL_WAIT_CS_FLAGS_INTERRUPT is set
+ /*
+ * CQ counter offset inside the counters CB pointed to by cq_counters_handle above.
+ * Upon interrupt, the driver will compare the value pointed to
+ * by this address (cq_counters_handle + cq_counters_offset)
+ * with the supplied target value.
+ * Relevant only when HL_WAIT_CS_FLAGS_INTERRUPT_KERNEL_CQ is set
*/
- __u32 interrupt_timeout_us;
+ __u64 cq_counters_offset;
};
#define HL_WAIT_CS_STATUS_COMPLETED 0