diff options
Diffstat (limited to 'drivers/staging/rdma/hfi1/chip.c')
-rw-r--r-- | drivers/staging/rdma/hfi1/chip.c | 647 |
1 files changed, 461 insertions, 186 deletions
diff --git a/drivers/staging/rdma/hfi1/chip.c b/drivers/staging/rdma/hfi1/chip.c index 16eb653903e0..dcae8e723f98 100644 --- a/drivers/staging/rdma/hfi1/chip.c +++ b/drivers/staging/rdma/hfi1/chip.c @@ -123,6 +123,8 @@ struct flag_table { #define MIN_KERNEL_KCTXTS 2 #define FIRST_KERNEL_KCTXT 1 +/* sizes for both the QP and RSM map tables */ +#define NUM_MAP_ENTRIES 256 #define NUM_MAP_REGS 32 /* Bit offset into the GUID which carries HFI id information */ @@ -1029,9 +1031,12 @@ static int thermal_init(struct hfi1_devdata *dd); static int wait_logical_linkstate(struct hfi1_pportdata *ppd, u32 state, int msecs); static void read_planned_down_reason_code(struct hfi1_devdata *dd, u8 *pdrrc); +static void read_link_down_reason(struct hfi1_devdata *dd, u8 *ldr); static void handle_temp_err(struct hfi1_devdata *); static void dc_shutdown(struct hfi1_devdata *); static void dc_start(struct hfi1_devdata *); +static int qos_rmt_entries(struct hfi1_devdata *dd, unsigned int *mp, + unsigned int *np); /* * Error interrupt table entry. This is used as input to the interrupt @@ -5661,7 +5666,7 @@ static int sc_to_vl(struct hfi1_devdata *dd, int sw_index) sci = &dd->send_contexts[sw_index]; /* there is no information for user (PSM) and ack contexts */ - if (sci->type != SC_KERNEL) + if ((sci->type != SC_KERNEL) && (sci->type != SC_VL15)) return -1; sc = sci->sc; @@ -6199,18 +6204,13 @@ static void hreq_response(struct hfi1_devdata *dd, u8 return_code, u16 rsp_data) /* * Handle host requests from the 8051. - * - * This is a work-queue function outside of the interrupt. */ -void handle_8051_request(struct work_struct *work) +static void handle_8051_request(struct hfi1_pportdata *ppd) { - struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata, - dc_host_req_work); struct hfi1_devdata *dd = ppd->dd; u64 reg; u16 data = 0; - u8 type, i, lanes, *cache = ppd->qsfp_info.cache; - u8 cdr_ctrl_byte = cache[QSFP_CDR_CTRL_BYTE_OFFS]; + u8 type; reg = read_csr(dd, DC_DC8051_CFG_EXT_DEV_1); if ((reg & DC_DC8051_CFG_EXT_DEV_1_REQ_NEW_SMASK) == 0) @@ -6231,46 +6231,11 @@ void handle_8051_request(struct work_struct *work) case HREQ_READ_CONFIG: case HREQ_SET_TX_EQ_ABS: case HREQ_SET_TX_EQ_REL: + case HREQ_ENABLE: dd_dev_info(dd, "8051 request: request 0x%x not supported\n", type); hreq_response(dd, HREQ_NOT_SUPPORTED, 0); break; - - case HREQ_ENABLE: - lanes = data & 0xF; - for (i = 0; lanes; lanes >>= 1, i++) { - if (!(lanes & 1)) - continue; - if (data & 0x200) { - /* enable TX CDR */ - if (cache[QSFP_MOD_PWR_OFFS] & 0x8 && - cache[QSFP_CDR_INFO_OFFS] & 0x80) - cdr_ctrl_byte |= (1 << (i + 4)); - } else { - /* disable TX CDR */ - if (cache[QSFP_MOD_PWR_OFFS] & 0x8 && - cache[QSFP_CDR_INFO_OFFS] & 0x80) - cdr_ctrl_byte &= ~(1 << (i + 4)); - } - - if (data & 0x800) { - /* enable RX CDR */ - if (cache[QSFP_MOD_PWR_OFFS] & 0x4 && - cache[QSFP_CDR_INFO_OFFS] & 0x40) - cdr_ctrl_byte |= (1 << i); - } else { - /* disable RX CDR */ - if (cache[QSFP_MOD_PWR_OFFS] & 0x4 && - cache[QSFP_CDR_INFO_OFFS] & 0x40) - cdr_ctrl_byte &= ~(1 << i); - } - } - one_qsfp_write(ppd, dd->hfi1_id, QSFP_CDR_CTRL_BYTE_OFFS, - &cdr_ctrl_byte, 1); - hreq_response(dd, HREQ_SUCCESS, data); - refresh_qsfp_cache(ppd, &ppd->qsfp_info); - break; - case HREQ_CONFIG_DONE: hreq_response(dd, HREQ_SUCCESS, 0); break; @@ -6278,7 +6243,6 @@ void handle_8051_request(struct work_struct *work) case HREQ_INTERFACE_TEST: hreq_response(dd, HREQ_SUCCESS, data); break; - default: dd_dev_err(dd, "8051 request: unknown request 0x%x\n", type); hreq_response(dd, HREQ_NOT_SUPPORTED, 0); @@ -6849,6 +6813,75 @@ static void reset_neighbor_info(struct hfi1_pportdata *ppd) ppd->neighbor_fm_security = 0; } +static const char * const link_down_reason_strs[] = { + [OPA_LINKDOWN_REASON_NONE] = "None", + [OPA_LINKDOWN_REASON_RCV_ERROR_0] = "Recive error 0", + [OPA_LINKDOWN_REASON_BAD_PKT_LEN] = "Bad packet length", + [OPA_LINKDOWN_REASON_PKT_TOO_LONG] = "Packet too long", + [OPA_LINKDOWN_REASON_PKT_TOO_SHORT] = "Packet too short", + [OPA_LINKDOWN_REASON_BAD_SLID] = "Bad SLID", + [OPA_LINKDOWN_REASON_BAD_DLID] = "Bad DLID", + [OPA_LINKDOWN_REASON_BAD_L2] = "Bad L2", + [OPA_LINKDOWN_REASON_BAD_SC] = "Bad SC", + [OPA_LINKDOWN_REASON_RCV_ERROR_8] = "Receive error 8", + [OPA_LINKDOWN_REASON_BAD_MID_TAIL] = "Bad mid tail", + [OPA_LINKDOWN_REASON_RCV_ERROR_10] = "Receive error 10", + [OPA_LINKDOWN_REASON_PREEMPT_ERROR] = "Preempt error", + [OPA_LINKDOWN_REASON_PREEMPT_VL15] = "Preempt vl15", + [OPA_LINKDOWN_REASON_BAD_VL_MARKER] = "Bad VL marker", + [OPA_LINKDOWN_REASON_RCV_ERROR_14] = "Receive error 14", + [OPA_LINKDOWN_REASON_RCV_ERROR_15] = "Receive error 15", + [OPA_LINKDOWN_REASON_BAD_HEAD_DIST] = "Bad head distance", + [OPA_LINKDOWN_REASON_BAD_TAIL_DIST] = "Bad tail distance", + [OPA_LINKDOWN_REASON_BAD_CTRL_DIST] = "Bad control distance", + [OPA_LINKDOWN_REASON_BAD_CREDIT_ACK] = "Bad credit ack", + [OPA_LINKDOWN_REASON_UNSUPPORTED_VL_MARKER] = "Unsupported VL marker", + [OPA_LINKDOWN_REASON_BAD_PREEMPT] = "Bad preempt", + [OPA_LINKDOWN_REASON_BAD_CONTROL_FLIT] = "Bad control flit", + [OPA_LINKDOWN_REASON_EXCEED_MULTICAST_LIMIT] = "Exceed multicast limit", + [OPA_LINKDOWN_REASON_RCV_ERROR_24] = "Receive error 24", + [OPA_LINKDOWN_REASON_RCV_ERROR_25] = "Receive error 25", + [OPA_LINKDOWN_REASON_RCV_ERROR_26] = "Receive error 26", + [OPA_LINKDOWN_REASON_RCV_ERROR_27] = "Receive error 27", + [OPA_LINKDOWN_REASON_RCV_ERROR_28] = "Receive error 28", + [OPA_LINKDOWN_REASON_RCV_ERROR_29] = "Receive error 29", + [OPA_LINKDOWN_REASON_RCV_ERROR_30] = "Receive error 30", + [OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN] = + "Excessive buffer overrun", + [OPA_LINKDOWN_REASON_UNKNOWN] = "Unknown", + [OPA_LINKDOWN_REASON_REBOOT] = "Reboot", + [OPA_LINKDOWN_REASON_NEIGHBOR_UNKNOWN] = "Neighbor unknown", + [OPA_LINKDOWN_REASON_FM_BOUNCE] = "FM bounce", + [OPA_LINKDOWN_REASON_SPEED_POLICY] = "Speed policy", + [OPA_LINKDOWN_REASON_WIDTH_POLICY] = "Width policy", + [OPA_LINKDOWN_REASON_DISCONNECTED] = "Disconnected", + [OPA_LINKDOWN_REASON_LOCAL_MEDIA_NOT_INSTALLED] = + "Local media not installed", + [OPA_LINKDOWN_REASON_NOT_INSTALLED] = "Not installed", + [OPA_LINKDOWN_REASON_CHASSIS_CONFIG] = "Chassis config", + [OPA_LINKDOWN_REASON_END_TO_END_NOT_INSTALLED] = + "End to end not installed", + [OPA_LINKDOWN_REASON_POWER_POLICY] = "Power policy", + [OPA_LINKDOWN_REASON_LINKSPEED_POLICY] = "Link speed policy", + [OPA_LINKDOWN_REASON_LINKWIDTH_POLICY] = "Link width policy", + [OPA_LINKDOWN_REASON_SWITCH_MGMT] = "Switch management", + [OPA_LINKDOWN_REASON_SMA_DISABLED] = "SMA disabled", + [OPA_LINKDOWN_REASON_TRANSIENT] = "Transient" +}; + +/* return the neighbor link down reason string */ +static const char *link_down_reason_str(u8 reason) +{ + const char *str = NULL; + + if (reason < ARRAY_SIZE(link_down_reason_strs)) + str = link_down_reason_strs[reason]; + if (!str) + str = "(invalid)"; + + return str; +} + /* * Handle a link down interrupt from the 8051. * @@ -6857,8 +6890,11 @@ static void reset_neighbor_info(struct hfi1_pportdata *ppd) void handle_link_down(struct work_struct *work) { u8 lcl_reason, neigh_reason = 0; + u8 link_down_reason; struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata, - link_down_work); + link_down_work); + int was_up; + static const char ldr_str[] = "Link down reason: "; if ((ppd->host_link_state & (HLS_DN_POLL | HLS_VERIFY_CAP | HLS_GOING_UP)) && @@ -6867,20 +6903,63 @@ void handle_link_down(struct work_struct *work) HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NOT_INSTALLED); /* Go offline first, then deal with reading/writing through 8051 */ + was_up = !!(ppd->host_link_state & HLS_UP); set_link_state(ppd, HLS_DN_OFFLINE); - lcl_reason = 0; - read_planned_down_reason_code(ppd->dd, &neigh_reason); + if (was_up) { + lcl_reason = 0; + /* link down reason is only valid if the link was up */ + read_link_down_reason(ppd->dd, &link_down_reason); + switch (link_down_reason) { + case LDR_LINK_TRANSFER_ACTIVE_LOW: + /* the link went down, no idle message reason */ + dd_dev_info(ppd->dd, "%sUnexpected link down\n", + ldr_str); + break; + case LDR_RECEIVED_LINKDOWN_IDLE_MSG: + /* + * The neighbor reason is only valid if an idle message + * was received for it. + */ + read_planned_down_reason_code(ppd->dd, &neigh_reason); + dd_dev_info(ppd->dd, + "%sNeighbor link down message %d, %s\n", + ldr_str, neigh_reason, + link_down_reason_str(neigh_reason)); + break; + case LDR_RECEIVED_HOST_OFFLINE_REQ: + dd_dev_info(ppd->dd, + "%sHost requested link to go offline\n", + ldr_str); + break; + default: + dd_dev_info(ppd->dd, "%sUnknown reason 0x%x\n", + ldr_str, link_down_reason); + break; + } - /* - * If no reason, assume peer-initiated but missed - * LinkGoingDown idle flits. - */ - if (neigh_reason == 0) - lcl_reason = OPA_LINKDOWN_REASON_NEIGHBOR_UNKNOWN; + /* + * If no reason, assume peer-initiated but missed + * LinkGoingDown idle flits. + */ + if (neigh_reason == 0) + lcl_reason = OPA_LINKDOWN_REASON_NEIGHBOR_UNKNOWN; + } else { + /* went down while polling or going up */ + lcl_reason = OPA_LINKDOWN_REASON_TRANSIENT; + } set_link_down_reason(ppd, lcl_reason, neigh_reason, 0); + /* inform the SMA when the link transitions from up to down */ + if (was_up && ppd->local_link_down_reason.sma == 0 && + ppd->neigh_link_down_reason.sma == 0) { + ppd->local_link_down_reason.sma = + ppd->local_link_down_reason.latest; + ppd->neigh_link_down_reason.sma = + ppd->neigh_link_down_reason.latest; + } + reset_neighbor_info(ppd); /* disable the port */ @@ -6890,7 +6969,7 @@ void handle_link_down(struct work_struct *work) * If there is no cable attached, turn the DC off. Otherwise, * start the link bring up. */ - if (!qsfp_mod_present(ppd)) { + if (ppd->port_type == PORT_TYPE_QSFP && !qsfp_mod_present(ppd)) { dc_shutdown(ppd->dd); } else { tune_serdes(ppd); @@ -7373,7 +7452,11 @@ retry: ppd->link_width_downgrade_rx_active = rx; } - if (lwde == 0) { + if (ppd->link_width_downgrade_tx_active == 0 || + ppd->link_width_downgrade_rx_active == 0) { + /* the 8051 reported a dead link as a downgrade */ + dd_dev_err(ppd->dd, "Link downgrade is really a link down, ignoring\n"); + } else if (lwde == 0) { /* downgrade is disabled */ /* bounce if not at starting active width */ @@ -7534,7 +7617,7 @@ static void handle_8051_interrupt(struct hfi1_devdata *dd, u32 unused, u64 reg) host_msg &= ~(u64)LINKUP_ACHIEVED; } if (host_msg & EXT_DEVICE_CFG_REQ) { - queue_work(ppd->hfi1_wq, &ppd->dc_host_req_work); + handle_8051_request(ppd); host_msg &= ~(u64)EXT_DEVICE_CFG_REQ; } if (host_msg & VERIFY_CAP_FRAME) { @@ -8660,6 +8743,14 @@ static void read_planned_down_reason_code(struct hfi1_devdata *dd, u8 *pdrrc) *pdrrc = (frame >> DOWN_REMOTE_REASON_SHIFT) & DOWN_REMOTE_REASON_MASK; } +static void read_link_down_reason(struct hfi1_devdata *dd, u8 *ldr) +{ + u32 frame; + + read_8051_config(dd, LINK_DOWN_REASON, GENERAL_CONFIG, &frame); + *ldr = (frame & 0xff); +} + static int read_tx_settings(struct hfi1_devdata *dd, u8 *enable_lane_tx, u8 *tx_polarity_inversion, @@ -9049,9 +9140,9 @@ set_local_link_attributes_fail: } /* - * Call this to start the link. Schedule a retry if the cable is not - * present or if unable to start polling. Do not do anything if the - * link is disabled. Returns 0 if link is disabled or moved to polling + * Call this to start the link. + * Do not do anything if the link is disabled. + * Returns 0 if link is disabled, moved to polling, or the driver is not ready. */ int start_link(struct hfi1_pportdata *ppd) { @@ -9068,15 +9159,7 @@ int start_link(struct hfi1_pportdata *ppd) return 0; } - if (qsfp_mod_present(ppd) || loopback == LOOPBACK_SERDES || - loopback == LOOPBACK_LCB || - ppd->dd->icode == ICODE_FUNCTIONAL_SIMULATOR) - return set_link_state(ppd, HLS_DN_POLL); - - dd_dev_info(ppd->dd, - "%s: stopping link start because no cable is present\n", - __func__); - return -EAGAIN; + return set_link_state(ppd, HLS_DN_POLL); } static void wait_for_qsfp_init(struct hfi1_pportdata *ppd) @@ -9247,7 +9330,7 @@ static int handle_qsfp_error_conditions(struct hfi1_pportdata *ppd, return 0; } -/* This routine will only be scheduled if the QSFP module is present */ +/* This routine will only be scheduled if the QSFP module present is asserted */ void qsfp_event(struct work_struct *work) { struct qsfp_data *qd; @@ -9676,6 +9759,7 @@ static void set_send_length(struct hfi1_pportdata *ppd) & SEND_LEN_CHECK1_LEN_VL15_MASK) << SEND_LEN_CHECK1_LEN_VL15_SHIFT; int i; + u32 thres; for (i = 0; i < ppd->vls_supported; i++) { if (dd->vld[i].mtu > maxvlmtu) @@ -9694,16 +9778,17 @@ static void set_send_length(struct hfi1_pportdata *ppd) /* adjust kernel credit return thresholds based on new MTUs */ /* all kernel receive contexts have the same hdrqentsize */ for (i = 0; i < ppd->vls_supported; i++) { - sc_set_cr_threshold(dd->vld[i].sc, - sc_mtu_to_threshold(dd->vld[i].sc, - dd->vld[i].mtu, - dd->rcd[0]-> - rcvhdrqentsize)); - } - sc_set_cr_threshold(dd->vld[15].sc, - sc_mtu_to_threshold(dd->vld[15].sc, - dd->vld[15].mtu, + thres = min(sc_percent_to_threshold(dd->vld[i].sc, 50), + sc_mtu_to_threshold(dd->vld[i].sc, + dd->vld[i].mtu, dd->rcd[0]->rcvhdrqentsize)); + sc_set_cr_threshold(dd->vld[i].sc, thres); + } + thres = min(sc_percent_to_threshold(dd->vld[15].sc, 50), + sc_mtu_to_threshold(dd->vld[15].sc, + dd->vld[15].mtu, + dd->rcd[0]->rcvhdrqentsize)); + sc_set_cr_threshold(dd->vld[15].sc, thres); /* Adjust maximum MTU for the port in DC */ dcmtu = maxvlmtu == 10240 ? DCC_CFG_PORT_MTU_CAP_10240 : @@ -10030,7 +10115,6 @@ int set_link_state(struct hfi1_pportdata *ppd, u32 state) struct hfi1_devdata *dd = ppd->dd; struct ib_event event = {.device = NULL}; int ret1, ret = 0; - int was_up, is_down; int orig_new_state, poll_bounce; mutex_lock(&ppd->hls_lock); @@ -10049,8 +10133,6 @@ int set_link_state(struct hfi1_pportdata *ppd, u32 state) poll_bounce ? "(bounce) " : "", link_state_reason_name(ppd, state)); - was_up = !!(ppd->host_link_state & HLS_UP); - /* * If we're going to a (HLS_*) link state that implies the logical * link state is neither of (IB_PORT_ARMED, IB_PORT_ACTIVE), then @@ -10261,17 +10343,6 @@ int set_link_state(struct hfi1_pportdata *ppd, u32 state) break; } - is_down = !!(ppd->host_link_state & (HLS_DN_POLL | - HLS_DN_DISABLE | HLS_DN_OFFLINE)); - - if (was_up && is_down && ppd->local_link_down_reason.sma == 0 && - ppd->neigh_link_down_reason.sma == 0) { - ppd->local_link_down_reason.sma = - ppd->local_link_down_reason.latest; - ppd->neigh_link_down_reason.sma = - ppd->neigh_link_down_reason.latest; - } - goto done; unexpected: @@ -12673,22 +12744,24 @@ static int set_up_context_variables(struct hfi1_devdata *dd) int total_contexts; int ret; unsigned ngroups; + int qos_rmt_count; + int user_rmt_reduced; /* - * Kernel contexts: (to be fixed later): - * - min or 2 or 1 context/numa + * Kernel receive contexts: + * - min of 2 or 1 context/numa (excluding control context) * - Context 0 - control context (VL15/multicast/error) - * - Context 1 - default context + * - Context 1 - first kernel context + * - Context 2 - second kernel context + * ... */ if (n_krcvqs) /* - * Don't count context 0 in n_krcvqs since - * is isn't used for normal verbs traffic. - * - * krcvqs will reflect number of kernel - * receive contexts above 0. + * n_krcvqs is the sum of module parameter kernel receive + * contexts, krcvqs[]. It does not include the control + * context, so add that. */ - num_kernel_contexts = n_krcvqs + MIN_KERNEL_KCTXTS - 1; + num_kernel_contexts = n_krcvqs + 1; else num_kernel_contexts = num_online_nodes() + 1; num_kernel_contexts = @@ -12705,12 +12778,13 @@ static int set_up_context_variables(struct hfi1_devdata *dd) num_kernel_contexts = dd->chip_send_contexts - num_vls - 1; } /* - * User contexts: (to be fixed later) - * - default to 1 user context per CPU if num_user_contexts is - * negative + * User contexts: + * - default to 1 user context per real (non-HT) CPU core if + * num_user_contexts is negative */ if (num_user_contexts < 0) - num_user_contexts = num_online_cpus(); + num_user_contexts = + cpumask_weight(&dd->affinity->real_cpu_mask); total_contexts = num_kernel_contexts + num_user_contexts; @@ -12727,6 +12801,19 @@ static int set_up_context_variables(struct hfi1_devdata *dd) total_contexts = num_kernel_contexts + num_user_contexts; } + /* each user context requires an entry in the RMT */ + qos_rmt_count = qos_rmt_entries(dd, NULL, NULL); + if (qos_rmt_count + num_user_contexts > NUM_MAP_ENTRIES) { + user_rmt_reduced = NUM_MAP_ENTRIES - qos_rmt_count; + dd_dev_err(dd, + "RMT size is reducing the number of user receive contexts from %d to %d\n", + (int)num_user_contexts, + user_rmt_reduced); + /* recalculate */ + num_user_contexts = user_rmt_reduced; + total_contexts = num_kernel_contexts + num_user_contexts; + } + /* the first N are kernel contexts, the rest are user contexts */ dd->num_rcv_contexts = total_contexts; dd->n_krcv_queues = num_kernel_contexts; @@ -12776,12 +12863,13 @@ static int set_up_context_variables(struct hfi1_devdata *dd) dd->num_send_contexts = ret; dd_dev_info( dd, - "send contexts: chip %d, used %d (kernel %d, ack %d, user %d)\n", + "send contexts: chip %d, used %d (kernel %d, ack %d, user %d, vl15 %d)\n", dd->chip_send_contexts, dd->num_send_contexts, dd->sc_sizes[SC_KERNEL].count, dd->sc_sizes[SC_ACK].count, - dd->sc_sizes[SC_USER].count); + dd->sc_sizes[SC_USER].count, + dd->sc_sizes[SC_VL15].count); ret = 0; /* success */ } @@ -13451,122 +13539,224 @@ static void init_qpmap_table(struct hfi1_devdata *dd, int i; u64 ctxt = first_ctxt; - for (i = 0; i < 256;) { + for (i = 0; i < 256; i++) { reg |= ctxt << (8 * (i % 8)); - i++; ctxt++; if (ctxt > last_ctxt) ctxt = first_ctxt; - if (i % 8 == 0) { + if (i % 8 == 7) { write_csr(dd, regno, reg); reg = 0; regno += 8; } } - if (i % 8) - write_csr(dd, regno, reg); add_rcvctrl(dd, RCV_CTRL_RCV_QP_MAP_ENABLE_SMASK | RCV_CTRL_RCV_BYPASS_ENABLE_SMASK); } -/** - * init_qos - init RX qos - * @dd - device data - * @first_context - * - * This routine initializes Rule 0 and the - * RSM map table to implement qos. - * - * If all of the limit tests succeed, - * qos is applied based on the array - * interpretation of krcvqs where - * entry 0 is VL0. - * - * The number of vl bits (n) and the number of qpn - * bits (m) are computed to feed both the RSM map table - * and the single rule. - * +struct rsm_map_table { + u64 map[NUM_MAP_REGS]; + unsigned int used; +}; + +struct rsm_rule_data { + u8 offset; + u8 pkt_type; + u32 field1_off; + u32 field2_off; + u32 index1_off; + u32 index1_width; + u32 index2_off; + u32 index2_width; + u32 mask1; + u32 value1; + u32 mask2; + u32 value2; +}; + +/* + * Return an initialized RMT map table for users to fill in. OK if it + * returns NULL, indicating no table. */ -static void init_qos(struct hfi1_devdata *dd, u32 first_ctxt) +static struct rsm_map_table *alloc_rsm_map_table(struct hfi1_devdata *dd) { + struct rsm_map_table *rmt; + u8 rxcontext = is_ax(dd) ? 0 : 0xff; /* 0 is default if a0 ver. */ + + rmt = kmalloc(sizeof(*rmt), GFP_KERNEL); + if (rmt) { + memset(rmt->map, rxcontext, sizeof(rmt->map)); + rmt->used = 0; + } + + return rmt; +} + +/* + * Write the final RMT map table to the chip and free the table. OK if + * table is NULL. + */ +static void complete_rsm_map_table(struct hfi1_devdata *dd, + struct rsm_map_table *rmt) +{ + int i; + + if (rmt) { + /* write table to chip */ + for (i = 0; i < NUM_MAP_REGS; i++) + write_csr(dd, RCV_RSM_MAP_TABLE + (8 * i), rmt->map[i]); + + /* enable RSM */ + add_rcvctrl(dd, RCV_CTRL_RCV_RSM_ENABLE_SMASK); + } +} + +/* + * Add a receive side mapping rule. + */ +static void add_rsm_rule(struct hfi1_devdata *dd, u8 rule_index, + struct rsm_rule_data *rrd) +{ + write_csr(dd, RCV_RSM_CFG + (8 * rule_index), + (u64)rrd->offset << RCV_RSM_CFG_OFFSET_SHIFT | + 1ull << rule_index | /* enable bit */ + (u64)rrd->pkt_type << RCV_RSM_CFG_PACKET_TYPE_SHIFT); + write_csr(dd, RCV_RSM_SELECT + (8 * rule_index), + (u64)rrd->field1_off << RCV_RSM_SELECT_FIELD1_OFFSET_SHIFT | + (u64)rrd->field2_off << RCV_RSM_SELECT_FIELD2_OFFSET_SHIFT | + (u64)rrd->index1_off << RCV_RSM_SELECT_INDEX1_OFFSET_SHIFT | + (u64)rrd->index1_width << RCV_RSM_SELECT_INDEX1_WIDTH_SHIFT | + (u64)rrd->index2_off << RCV_RSM_SELECT_INDEX2_OFFSET_SHIFT | + (u64)rrd->index2_width << RCV_RSM_SELECT_INDEX2_WIDTH_SHIFT); + write_csr(dd, RCV_RSM_MATCH + (8 * rule_index), + (u64)rrd->mask1 << RCV_RSM_MATCH_MASK1_SHIFT | + (u64)rrd->value1 << RCV_RSM_MATCH_VALUE1_SHIFT | + (u64)rrd->mask2 << RCV_RSM_MATCH_MASK2_SHIFT | + (u64)rrd->value2 << RCV_RSM_MATCH_VALUE2_SHIFT); +} + +/* return the number of RSM map table entries that will be used for QOS */ +static int qos_rmt_entries(struct hfi1_devdata *dd, unsigned int *mp, + unsigned int *np) +{ + int i; + unsigned int m, n; u8 max_by_vl = 0; - unsigned qpns_per_vl, ctxt, i, qpn, n = 1, m; - u64 *rsmmap; - u64 reg; - u8 rxcontext = is_ax(dd) ? 0 : 0xff; /* 0 is default if a0 ver. */ - /* validate */ + /* is QOS active at all? */ if (dd->n_krcv_queues <= MIN_KERNEL_KCTXTS || num_vls == 1 || krcvqsset <= 1) - goto bail; - for (i = 0; i < min_t(unsigned, num_vls, krcvqsset); i++) + goto no_qos; + + /* determine bits for qpn */ + for (i = 0; i < min_t(unsigned int, num_vls, krcvqsset); i++) if (krcvqs[i] > max_by_vl) max_by_vl = krcvqs[i]; if (max_by_vl > 32) - goto bail; - qpns_per_vl = __roundup_pow_of_two(max_by_vl); - /* determine bits vl */ - n = ilog2(num_vls); - /* determine bits for qpn */ - m = ilog2(qpns_per_vl); + goto no_qos; + m = ilog2(__roundup_pow_of_two(max_by_vl)); + + /* determine bits for vl */ + n = ilog2(__roundup_pow_of_two(num_vls)); + + /* reject if too much is used */ if ((m + n) > 7) + goto no_qos; + + if (mp) + *mp = m; + if (np) + *np = n; + + return 1 << (m + n); + +no_qos: + if (mp) + *mp = 0; + if (np) + *np = 0; + return 0; +} + +/** + * init_qos - init RX qos + * @dd - device data + * @rmt - RSM map table + * + * This routine initializes Rule 0 and the RSM map table to implement + * quality of service (qos). + * + * If all of the limit tests succeed, qos is applied based on the array + * interpretation of krcvqs where entry 0 is VL0. + * + * The number of vl bits (n) and the number of qpn bits (m) are computed to + * feed both the RSM map table and the single rule. + */ +static void init_qos(struct hfi1_devdata *dd, struct rsm_map_table *rmt) +{ + struct rsm_rule_data rrd; + unsigned qpns_per_vl, ctxt, i, qpn, n = 1, m; + unsigned int rmt_entries; + u64 reg; + + if (!rmt) goto bail; - if (num_vls * qpns_per_vl > dd->chip_rcv_contexts) + rmt_entries = qos_rmt_entries(dd, &m, &n); + if (rmt_entries == 0) goto bail; - rsmmap = kmalloc_array(NUM_MAP_REGS, sizeof(u64), GFP_KERNEL); - if (!rsmmap) + qpns_per_vl = 1 << m; + + /* enough room in the map table? */ + rmt_entries = 1 << (m + n); + if (rmt->used + rmt_entries >= NUM_MAP_ENTRIES) goto bail; - memset(rsmmap, rxcontext, NUM_MAP_REGS * sizeof(u64)); - /* init the local copy of the table */ - for (i = 0, ctxt = first_ctxt; i < num_vls; i++) { + + /* add qos entries to the the RSM map table */ + for (i = 0, ctxt = FIRST_KERNEL_KCTXT; i < num_vls; i++) { unsigned tctxt; for (qpn = 0, tctxt = ctxt; krcvqs[i] && qpn < qpns_per_vl; qpn++) { unsigned idx, regoff, regidx; - /* generate index <= 128 */ - idx = (qpn << n) ^ i; + /* generate the index the hardware will produce */ + idx = rmt->used + ((qpn << n) ^ i); regoff = (idx % 8) * 8; regidx = idx / 8; - reg = rsmmap[regidx]; - /* replace 0xff with context number */ + /* replace default with context number */ + reg = rmt->map[regidx]; reg &= ~(RCV_RSM_MAP_TABLE_RCV_CONTEXT_A_MASK << regoff); reg |= (u64)(tctxt++) << regoff; - rsmmap[regidx] = reg; + rmt->map[regidx] = reg; if (tctxt == ctxt + krcvqs[i]) tctxt = ctxt; } ctxt += krcvqs[i]; } - /* flush cached copies to chip */ - for (i = 0; i < NUM_MAP_REGS; i++) - write_csr(dd, RCV_RSM_MAP_TABLE + (8 * i), rsmmap[i]); - /* add rule0 */ - write_csr(dd, RCV_RSM_CFG /* + (8 * 0) */, - RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_MASK << - RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_SHIFT | - 2ull << RCV_RSM_CFG_PACKET_TYPE_SHIFT); - write_csr(dd, RCV_RSM_SELECT /* + (8 * 0) */, - LRH_BTH_MATCH_OFFSET << RCV_RSM_SELECT_FIELD1_OFFSET_SHIFT | - LRH_SC_MATCH_OFFSET << RCV_RSM_SELECT_FIELD2_OFFSET_SHIFT | - LRH_SC_SELECT_OFFSET << RCV_RSM_SELECT_INDEX1_OFFSET_SHIFT | - ((u64)n) << RCV_RSM_SELECT_INDEX1_WIDTH_SHIFT | - QPN_SELECT_OFFSET << RCV_RSM_SELECT_INDEX2_OFFSET_SHIFT | - ((u64)m + (u64)n) << RCV_RSM_SELECT_INDEX2_WIDTH_SHIFT); - write_csr(dd, RCV_RSM_MATCH /* + (8 * 0) */, - LRH_BTH_MASK << RCV_RSM_MATCH_MASK1_SHIFT | - LRH_BTH_VALUE << RCV_RSM_MATCH_VALUE1_SHIFT | - LRH_SC_MASK << RCV_RSM_MATCH_MASK2_SHIFT | - LRH_SC_VALUE << RCV_RSM_MATCH_VALUE2_SHIFT); - /* Enable RSM */ - add_rcvctrl(dd, RCV_CTRL_RCV_RSM_ENABLE_SMASK); - kfree(rsmmap); - /* map everything else to first context */ - init_qpmap_table(dd, FIRST_KERNEL_KCTXT, MIN_KERNEL_KCTXTS - 1); + + rrd.offset = rmt->used; + rrd.pkt_type = 2; + rrd.field1_off = LRH_BTH_MATCH_OFFSET; + rrd.field2_off = LRH_SC_MATCH_OFFSET; + rrd.index1_off = LRH_SC_SELECT_OFFSET; + rrd.index1_width = n; + rrd.index2_off = QPN_SELECT_OFFSET; + rrd.index2_width = m + n; + rrd.mask1 = LRH_BTH_MASK; + rrd.value1 = LRH_BTH_VALUE; + rrd.mask2 = LRH_SC_MASK; + rrd.value2 = LRH_SC_VALUE; + + /* add rule 0 */ + add_rsm_rule(dd, 0, &rrd); + + /* mark RSM map entries as used */ + rmt->used += rmt_entries; + /* map everything else to the mcast/err/vl15 context */ + init_qpmap_table(dd, HFI1_CTRL_CTXT, HFI1_CTRL_CTXT); dd->qos_shift = n + 1; return; bail: @@ -13574,13 +13764,86 @@ bail: init_qpmap_table(dd, FIRST_KERNEL_KCTXT, dd->n_krcv_queues - 1); } +static void init_user_fecn_handling(struct hfi1_devdata *dd, + struct rsm_map_table *rmt) +{ + struct rsm_rule_data rrd; + u64 reg; + int i, idx, regoff, regidx; + u8 offset; + + /* there needs to be enough room in the map table */ + if (rmt->used + dd->num_user_contexts >= NUM_MAP_ENTRIES) { + dd_dev_err(dd, "User FECN handling disabled - too many user contexts allocated\n"); + return; + } + + /* + * RSM will extract the destination context as an index into the + * map table. The destination contexts are a sequential block + * in the range first_user_ctxt...num_rcv_contexts-1 (inclusive). + * Map entries are accessed as offset + extracted value. Adjust + * the added offset so this sequence can be placed anywhere in + * the table - as long as the entries themselves do not wrap. + * There are only enough bits in offset for the table size, so + * start with that to allow for a "negative" offset. + */ + offset = (u8)(NUM_MAP_ENTRIES + (int)rmt->used - + (int)dd->first_user_ctxt); + + for (i = dd->first_user_ctxt, idx = rmt->used; + i < dd->num_rcv_contexts; i++, idx++) { + /* replace with identity mapping */ + regoff = (idx % 8) * 8; + regidx = idx / 8; + reg = rmt->map[regidx]; + reg &= ~(RCV_RSM_MAP_TABLE_RCV_CONTEXT_A_MASK << regoff); + reg |= (u64)i << regoff; + rmt->map[regidx] = reg; + } + + /* + * For RSM intercept of Expected FECN packets: + * o packet type 0 - expected + * o match on F (bit 95), using select/match 1, and + * o match on SH (bit 133), using select/match 2. + * + * Use index 1 to extract the 8-bit receive context from DestQP + * (start at bit 64). Use that as the RSM map table index. + */ + rrd.offset = offset; + rrd.pkt_type = 0; + rrd.field1_off = 95; + rrd.field2_off = 133; + rrd.index1_off = 64; + rrd.index1_width = 8; + rrd.index2_off = 0; + rrd.index2_width = 0; + rrd.mask1 = 1; + rrd.value1 = 1; + rrd.mask2 = 1; + rrd.value2 = 1; + + /* add rule 1 */ + add_rsm_rule(dd, 1, &rrd); + + rmt->used += dd->num_user_contexts; +} + static void init_rxe(struct hfi1_devdata *dd) { + struct rsm_map_table *rmt; + /* enable all receive errors */ write_csr(dd, RCV_ERR_MASK, ~0ull); - /* setup QPN map table - start where VL15 context leaves off */ - init_qos(dd, dd->n_krcv_queues > MIN_KERNEL_KCTXTS ? - MIN_KERNEL_KCTXTS : 0); + + rmt = alloc_rsm_map_table(dd); + /* set up QOS, including the QPN map table */ + init_qos(dd, rmt); + init_user_fecn_handling(dd, rmt); + complete_rsm_map_table(dd, rmt); + kfree(rmt); + /* * make sure RcvCtrl.RcvWcb <= PCIe Device Control * Register Max_Payload_Size (PCI_EXP_DEVCTL in Linux PCIe config @@ -13762,6 +14025,7 @@ int hfi1_set_ctxt_pkey(struct hfi1_devdata *dd, unsigned ctxt, u16 pkey) write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_PARTITION_KEY, reg); reg = read_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE); reg |= SEND_CTXT_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK; + reg &= ~SEND_CTXT_CHECK_ENABLE_DISALLOW_KDETH_PACKETS_SMASK; write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE, reg); done: return ret; @@ -14148,6 +14412,19 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev, (dd->revision >> CCE_REVISION_SW_SHIFT) & CCE_REVISION_SW_MASK); + /* + * The real cpu mask is part of the affinity struct but has to be + * initialized earlier than the rest of the affinity struct because it + * is needed to calculate the number of user contexts in + * set_up_context_variables(). However, hfi1_dev_affinity_init(), + * which initializes the rest of the affinity struct members, + * depends on set_up_context_variables() for the number of kernel + * contexts, so it cannot be called before set_up_context_variables(). + */ + ret = init_real_cpu_mask(dd); + if (ret) + goto bail_cleanup; + ret = set_up_context_variables(dd); if (ret) goto bail_cleanup; @@ -14161,9 +14438,7 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev, /* set up KDETH QP prefix in both RX and TX CSRs */ init_kdeth_qp(dd); - ret = hfi1_dev_affinity_init(dd); - if (ret) - goto bail_cleanup; + hfi1_dev_affinity_init(dd); /* send contexts must be set up before receive contexts */ ret = init_send_contexts(dd); |