From 2d4f545cb14fa2d9865124183f3100d46dc9b3b5 Mon Sep 17 00:00:00 2001 From: Peter Meerwald Date: Mon, 4 Jun 2018 23:34:06 +0200 Subject: rfkill: Fixes and cleanup of kernel-doc in the header file Fixes kerneldoc parameter names to match implementation, rfkill_set_hw_state(), rfkill_set_sw_state(). Fix description of rfkill_resume_polling(). Fix typos in documentation of rfkill_find_type(). Consistently start kerneldoc description with uppercase letter. Signed-off-by: Peter Meerwald-Stadler Signed-off-by: Johannes Berg --- include/linux/rfkill.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rfkill.h b/include/linux/rfkill.h index e6a0031d1b1f..8ad2487a86d5 100644 --- a/include/linux/rfkill.h +++ b/include/linux/rfkill.h @@ -66,7 +66,7 @@ struct rfkill_ops { #if defined(CONFIG_RFKILL) || defined(CONFIG_RFKILL_MODULE) /** - * rfkill_alloc - allocate rfkill structure + * rfkill_alloc - Allocate rfkill structure * @name: name of the struct -- the string is not copied internally * @parent: device that has rf switch on it * @type: type of the switch (RFKILL_TYPE_*) @@ -112,7 +112,7 @@ void rfkill_pause_polling(struct rfkill *rfkill); /** * rfkill_resume_polling(struct rfkill *rfkill) * - * Pause polling -- say transmitter is off for other reasons. + * Resume polling * NOTE: not necessary for suspend/resume -- in that case the * core stops polling anyway */ @@ -130,7 +130,7 @@ void rfkill_resume_polling(struct rfkill *rfkill); void rfkill_unregister(struct rfkill *rfkill); /** - * rfkill_destroy - free rfkill structure + * rfkill_destroy - Free rfkill structure * @rfkill: rfkill structure to be destroyed * * Destroys the rfkill structure. @@ -140,7 +140,7 @@ void rfkill_destroy(struct rfkill *rfkill); /** * rfkill_set_hw_state - Set the internal rfkill hardware block state * @rfkill: pointer to the rfkill class to modify. - * @state: the current hardware block state to set + * @blocked: the current hardware block state to set * * rfkill drivers that get events when the hard-blocked state changes * use this function to notify the rfkill core (and through that also @@ -161,7 +161,7 @@ bool rfkill_set_hw_state(struct rfkill *rfkill, bool blocked); /** * rfkill_set_sw_state - Set the internal rfkill software block state * @rfkill: pointer to the rfkill class to modify. - * @state: the current software block state to set + * @blocked: the current software block state to set * * rfkill drivers that get events when the soft-blocked state changes * (yes, some platforms directly act on input but allow changing again) @@ -183,7 +183,7 @@ bool rfkill_set_sw_state(struct rfkill *rfkill, bool blocked); /** * rfkill_init_sw_state - Initialize persistent software block state * @rfkill: pointer to the rfkill class to modify. - * @state: the current software block state to set + * @blocked: the current software block state to set * * rfkill drivers that preserve their software block state over power off * use this function to notify the rfkill core (and through that also @@ -208,17 +208,17 @@ void rfkill_init_sw_state(struct rfkill *rfkill, bool blocked); void rfkill_set_states(struct rfkill *rfkill, bool sw, bool hw); /** - * rfkill_blocked - query rfkill block + * rfkill_blocked - Query rfkill block state * * @rfkill: rfkill struct to query */ bool rfkill_blocked(struct rfkill *rfkill); /** - * rfkill_find_type - Helpper for finding rfkill type by name + * rfkill_find_type - Helper for finding rfkill type by name * @name: the name of the type * - * Returns enum rfkill_type that conrresponds the name. + * Returns enum rfkill_type that corresponds to the name. */ enum rfkill_type rfkill_find_type(const char *name); @@ -296,7 +296,7 @@ static inline enum rfkill_type rfkill_find_type(const char *name) const char *rfkill_get_led_trigger_name(struct rfkill *rfkill); /** - * rfkill_set_led_trigger_name -- set the LED trigger name + * rfkill_set_led_trigger_name - Set the LED trigger name * @rfkill: rfkill struct * @name: LED trigger name * -- cgit v1.2.3-59-g8ed1b From c4cbaf7973a794839af080f13748335976cf3f3f Mon Sep 17 00:00:00 2001 From: Luca Coelho Date: Sat, 9 Jun 2018 09:14:42 +0300 Subject: cfg80211: Add support for HE Add support for the HE in cfg80211 and also add userspace API to nl80211 to send rate information out, conforming with P802.11ax_D2.0. Signed-off-by: Liad Kaufman Signed-off-by: Johannes Berg Signed-off-by: Ilan Peer Signed-off-by: Ido Yariv Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 427 +++++++++++++++++++++++++++++++++++++++++++ include/net/cfg80211.h | 106 ++++++++++- include/uapi/linux/nl80211.h | 87 ++++++++- net/wireless/core.c | 21 ++- net/wireless/nl80211.c | 99 +++++++++- net/wireless/util.c | 82 +++++++++ 6 files changed, 817 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 8fe7e4306816..e6a6503bfa33 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1539,6 +1539,106 @@ struct ieee80211_vht_operation { __le16 basic_mcs_set; } __packed; +/** + * struct ieee80211_he_cap_elem - HE capabilities element + * + * This structure is the "HE capabilities element" fixed fields as + * described in P802.11ax_D2.0 section 9.4.2.237.2 and 9.4.2.237.3 + */ +struct ieee80211_he_cap_elem { + u8 mac_cap_info[5]; + u8 phy_cap_info[9]; +} __packed; + +#define IEEE80211_TX_RX_MCS_NSS_DESC_MAX_LEN 5 + +/** + * enum ieee80211_he_mcs_support - HE MCS support definitions + * @IEEE80211_HE_MCS_SUPPORT_0_7: MCSes 0-7 are supported for the + * number of streams + * @IEEE80211_HE_MCS_SUPPORT_0_9: MCSes 0-9 are supported + * @IEEE80211_HE_MCS_SUPPORT_0_11: MCSes 0-11 are supported + * @IEEE80211_HE_MCS_NOT_SUPPORTED: This number of streams isn't supported + * + * These definitions are used in each 2-bit subfield of the rx_mcs_* + * and tx_mcs_* fields of &struct ieee80211_he_mcs_nss_supp, which are + * both split into 8 subfields by number of streams. These values indicate + * which MCSes are supported for the number of streams the value appears + * for. + */ +enum ieee80211_he_mcs_support { + IEEE80211_HE_MCS_SUPPORT_0_7 = 0, + IEEE80211_HE_MCS_SUPPORT_0_9 = 1, + IEEE80211_HE_MCS_SUPPORT_0_11 = 2, + IEEE80211_HE_MCS_NOT_SUPPORTED = 3, +}; + +/** + * struct ieee80211_he_mcs_nss_supp - HE Tx/Rx HE MCS NSS Support Field + * + * This structure holds the data required for the Tx/Rx HE MCS NSS Support Field + * described in P802.11ax_D2.0 section 9.4.2.237.4 + * + * @rx_mcs_80: Rx MCS map 2 bits for each stream, total 8 streams, for channel + * widths less than 80MHz. + * @tx_mcs_80: Tx MCS map 2 bits for each stream, total 8 streams, for channel + * widths less than 80MHz. + * @rx_mcs_160: Rx MCS map 2 bits for each stream, total 8 streams, for channel + * width 160MHz. + * @tx_mcs_160: Tx MCS map 2 bits for each stream, total 8 streams, for channel + * width 160MHz. + * @rx_mcs_80p80: Rx MCS map 2 bits for each stream, total 8 streams, for + * channel width 80p80MHz. + * @tx_mcs_80p80: Tx MCS map 2 bits for each stream, total 8 streams, for + * channel width 80p80MHz. + */ +struct ieee80211_he_mcs_nss_supp { + __le16 rx_mcs_80; + __le16 tx_mcs_80; + __le16 rx_mcs_160; + __le16 tx_mcs_160; + __le16 rx_mcs_80p80; + __le16 tx_mcs_80p80; +} __packed; + +/** + * struct ieee80211_he_operation - HE capabilities element + * + * This structure is the "HE operation element" fields as + * described in P802.11ax_D2.0 section 9.4.2.238 + */ +struct ieee80211_he_operation { + __le32 he_oper_params; + __le16 he_mcs_nss_set; + /* Optional 0,1,3 or 4 bytes: depends on @he_oper_params */ + u8 optional[0]; +} __packed; + +/** + * struct ieee80211_he_mu_edca_param_ac_rec - MU AC Parameter Record field + * + * This structure is the "MU AC Parameter Record" fields as + * described in P802.11ax_D2.0 section 9.4.2.240 + */ +struct ieee80211_he_mu_edca_param_ac_rec { + u8 aifsn; + u8 ecw_min_max; + u8 mu_edca_timer; +} __packed; + +/** + * struct ieee80211_mu_edca_param_set - MU EDCA Parameter Set element + * + * This structure is the "MU EDCA Parameter Set element" fields as + * described in P802.11ax_D2.0 section 9.4.2.240 + */ +struct ieee80211_mu_edca_param_set { + u8 mu_qos_info; + struct ieee80211_he_mu_edca_param_ac_rec ac_be; + struct ieee80211_he_mu_edca_param_ac_rec ac_bk; + struct ieee80211_he_mu_edca_param_ac_rec ac_vi; + struct ieee80211_he_mu_edca_param_ac_rec ac_vo; +} __packed; /* 802.11ac VHT Capabilities */ #define IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_3895 0x00000000 @@ -1577,6 +1677,328 @@ struct ieee80211_vht_operation { #define IEEE80211_VHT_CAP_RX_ANTENNA_PATTERN 0x10000000 #define IEEE80211_VHT_CAP_TX_ANTENNA_PATTERN 0x20000000 +/* 802.11ax HE MAC capabilities */ +#define IEEE80211_HE_MAC_CAP0_HTC_HE 0x01 +#define IEEE80211_HE_MAC_CAP0_TWT_REQ 0x02 +#define IEEE80211_HE_MAC_CAP0_TWT_RES 0x04 +#define IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_NOT_SUPP 0x00 +#define IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_LEVEL_1 0x08 +#define IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_LEVEL_2 0x10 +#define IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_LEVEL_3 0x18 +#define IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_MASK 0x18 +#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_1 0x00 +#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_2 0x20 +#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_4 0x40 +#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_8 0x60 +#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_16 0x80 +#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_32 0xa0 +#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_64 0xc0 +#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_UNLIMITED 0xe0 +#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_MASK 0xe0 + +#define IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_UNLIMITED 0x00 +#define IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_128 0x01 +#define IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_256 0x02 +#define IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_512 0x03 +#define IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_MASK 0x03 +#define IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_0US 0x00 +#define IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_8US 0x04 +#define IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_16US 0x08 +#define IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_MASK 0x0c +#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_QOS_1 0x00 +#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_QOS_2 0x10 +#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_QOS_3 0x20 +#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_QOS_4 0x30 +#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_QOS_5 0x40 +#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_QOS_6 0x50 +#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_QOS_7 0x60 +#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_QOS_8 0x70 +#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_QOS_MASK 0x70 + +/* Link adaptation is split between byte HE_MAC_CAP1 and + * HE_MAC_CAP2. It should be set only if IEEE80211_HE_MAC_CAP0_HTC_HE + * in which case the following values apply: + * 0 = No feedback. + * 1 = reserved. + * 2 = Unsolicited feedback. + * 3 = both + */ +#define IEEE80211_HE_MAC_CAP1_LINK_ADAPTATION 0x80 + +#define IEEE80211_HE_MAC_CAP2_LINK_ADAPTATION 0x01 +#define IEEE80211_HE_MAC_CAP2_ALL_ACK 0x02 +#define IEEE80211_HE_MAC_CAP2_UL_MU_RESP_SCHED 0x04 +#define IEEE80211_HE_MAC_CAP2_BSR 0x08 +#define IEEE80211_HE_MAC_CAP2_BCAST_TWT 0x10 +#define IEEE80211_HE_MAC_CAP2_32BIT_BA_BITMAP 0x20 +#define IEEE80211_HE_MAC_CAP2_MU_CASCADING 0x40 +#define IEEE80211_HE_MAC_CAP2_ACK_EN 0x80 + +#define IEEE80211_HE_MAC_CAP3_GRP_ADDR_MULTI_STA_BA_DL_MU 0x01 +#define IEEE80211_HE_MAC_CAP3_OMI_CONTROL 0x02 +#define IEEE80211_HE_MAC_CAP3_OFDMA_RA 0x04 + +/* The maximum length of an A-MDPU is defined by the combination of the Maximum + * A-MDPU Length Exponent field in the HT capabilities, VHT capabilities and the + * same field in the HE capabilities. + */ +#define IEEE80211_HE_MAC_CAP3_MAX_A_AMPDU_LEN_EXP_USE_VHT 0x00 +#define IEEE80211_HE_MAC_CAP3_MAX_A_AMPDU_LEN_EXP_VHT_1 0x08 +#define IEEE80211_HE_MAC_CAP3_MAX_A_AMPDU_LEN_EXP_VHT_2 0x10 +#define IEEE80211_HE_MAC_CAP3_MAX_A_AMPDU_LEN_EXP_RESERVED 0x18 +#define IEEE80211_HE_MAC_CAP3_MAX_A_AMPDU_LEN_EXP_MASK 0x18 +#define IEEE80211_HE_MAC_CAP3_A_AMSDU_FRAG 0x20 +#define IEEE80211_HE_MAC_CAP3_FLEX_TWT_SCHED 0x40 +#define IEEE80211_HE_MAC_CAP3_RX_CTRL_FRAME_TO_MULTIBSS 0x80 + +#define IEEE80211_HE_MAC_CAP4_BSRP_BQRP_A_MPDU_AGG 0x01 +#define IEEE80211_HE_MAC_CAP4_QTP 0x02 +#define IEEE80211_HE_MAC_CAP4_BQR 0x04 +#define IEEE80211_HE_MAC_CAP4_SR_RESP 0x08 +#define IEEE80211_HE_MAC_CAP4_NDP_FB_REP 0x10 +#define IEEE80211_HE_MAC_CAP4_OPS 0x20 +#define IEEE80211_HE_MAC_CAP4_AMDSU_IN_AMPDU 0x40 + +/* 802.11ax HE PHY capabilities */ +#define IEEE80211_HE_PHY_CAP0_DUAL_BAND 0x01 +#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_IN_2G 0x02 +#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G 0x04 +#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G 0x08 +#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G 0x10 +#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_RU_MAPPING_IN_2G 0x20 +#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_RU_MAPPING_IN_5G 0x40 +#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_MASK 0xfe + +#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_80MHZ_ONLY_SECOND_20MHZ 0x01 +#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_80MHZ_ONLY_SECOND_40MHZ 0x02 +#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_160MHZ_ONLY_SECOND_20MHZ 0x04 +#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_160MHZ_ONLY_SECOND_40MHZ 0x08 +#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_MASK 0x0f +#define IEEE80211_HE_PHY_CAP1_DEVICE_CLASS_A 0x10 +#define IEEE80211_HE_PHY_CAP1_LDPC_CODING_IN_PAYLOAD 0x20 +#define IEEE80211_HE_PHY_CAP1_HE_LTF_AND_GI_FOR_HE_PPDUS_0_8US 0x40 +/* Midamble RX Max NSTS is split between byte #2 and byte #3 */ +#define IEEE80211_HE_PHY_CAP1_MIDAMBLE_RX_MAX_NSTS 0x80 + +#define IEEE80211_HE_PHY_CAP2_MIDAMBLE_RX_MAX_NSTS 0x01 +#define IEEE80211_HE_PHY_CAP2_NDP_4x_LTF_AND_3_2US 0x02 +#define IEEE80211_HE_PHY_CAP2_STBC_TX_UNDER_80MHZ 0x04 +#define IEEE80211_HE_PHY_CAP2_STBC_RX_UNDER_80MHZ 0x08 +#define IEEE80211_HE_PHY_CAP2_DOPPLER_TX 0x10 +#define IEEE80211_HE_PHY_CAP2_DOPPLER_RX 0x20 + +/* Note that the meaning of UL MU below is different between an AP and a non-AP + * sta, where in the AP case it indicates support for Rx and in the non-AP sta + * case it indicates support for Tx. + */ +#define IEEE80211_HE_PHY_CAP2_UL_MU_FULL_MU_MIMO 0x40 +#define IEEE80211_HE_PHY_CAP2_UL_MU_PARTIAL_MU_MIMO 0x80 + +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_NO_DCM 0x00 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_BPSK 0x01 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_QPSK 0x02 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_16_QAM 0x03 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_MASK 0x03 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_TX_NSS_1 0x00 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_TX_NSS_2 0x04 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_NO_DCM 0x00 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_BPSK 0x08 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_QPSK 0x10 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_16_QAM 0x18 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_MASK 0x18 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_RX_NSS_1 0x00 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_RX_NSS_2 0x20 +#define IEEE80211_HE_PHY_CAP3_RX_HE_MU_PPDU_FROM_NON_AP_STA 0x40 +#define IEEE80211_HE_PHY_CAP3_SU_BEAMFORMER 0x80 + +#define IEEE80211_HE_PHY_CAP4_SU_BEAMFORMEE 0x01 +#define IEEE80211_HE_PHY_CAP4_MU_BEAMFORMER 0x02 + +/* Minimal allowed value of Max STS under 80MHz is 3 */ +#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_4 0x0c +#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_5 0x10 +#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_6 0x14 +#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_7 0x18 +#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_8 0x1c +#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_MASK 0x1c + +/* Minimal allowed value of Max STS above 80MHz is 3 */ +#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_4 0x60 +#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_5 0x80 +#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_6 0xa0 +#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_7 0xc0 +#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_8 0xe0 +#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_MASK 0xe0 + +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_1 0x00 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_2 0x01 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_3 0x02 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_4 0x03 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_5 0x04 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_6 0x05 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_7 0x06 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_8 0x07 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_MASK 0x07 + +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_1 0x00 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_2 0x08 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_3 0x10 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_4 0x18 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_5 0x20 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_6 0x28 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_7 0x30 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_8 0x38 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_MASK 0x38 + +#define IEEE80211_HE_PHY_CAP5_NG16_SU_FEEDBACK 0x40 +#define IEEE80211_HE_PHY_CAP5_NG16_MU_FEEDBACK 0x80 + +#define IEEE80211_HE_PHY_CAP6_CODEBOOK_SIZE_42_SU 0x01 +#define IEEE80211_HE_PHY_CAP6_CODEBOOK_SIZE_75_MU 0x02 +#define IEEE80211_HE_PHY_CAP6_TRIG_SU_BEAMFORMER_FB 0x04 +#define IEEE80211_HE_PHY_CAP6_TRIG_MU_BEAMFORMER_FB 0x08 +#define IEEE80211_HE_PHY_CAP6_TRIG_CQI_FB 0x10 +#define IEEE80211_HE_PHY_CAP6_PARTIAL_BW_EXT_RANGE 0x20 +#define IEEE80211_HE_PHY_CAP6_PARTIAL_BANDWIDTH_DL_MUMIMO 0x40 +#define IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT 0x80 + +#define IEEE80211_HE_PHY_CAP7_SRP_BASED_SR 0x01 +#define IEEE80211_HE_PHY_CAP7_POWER_BOOST_FACTOR_AR 0x02 +#define IEEE80211_HE_PHY_CAP7_HE_SU_MU_PPDU_4XLTF_AND_08_US_GI 0x04 +#define IEEE80211_HE_PHY_CAP7_MAX_NC_1 0x08 +#define IEEE80211_HE_PHY_CAP7_MAX_NC_2 0x10 +#define IEEE80211_HE_PHY_CAP7_MAX_NC_3 0x18 +#define IEEE80211_HE_PHY_CAP7_MAX_NC_4 0x20 +#define IEEE80211_HE_PHY_CAP7_MAX_NC_5 0x28 +#define IEEE80211_HE_PHY_CAP7_MAX_NC_6 0x30 +#define IEEE80211_HE_PHY_CAP7_MAX_NC_7 0x38 +#define IEEE80211_HE_PHY_CAP7_MAX_NC_MASK 0x38 +#define IEEE80211_HE_PHY_CAP7_STBC_TX_ABOVE_80MHZ 0x40 +#define IEEE80211_HE_PHY_CAP7_STBC_RX_ABOVE_80MHZ 0x80 + +#define IEEE80211_HE_PHY_CAP8_HE_ER_SU_PPDU_4XLTF_AND_08_US_GI 0x01 +#define IEEE80211_HE_PHY_CAP8_20MHZ_IN_40MHZ_HE_PPDU_IN_2G 0x02 +#define IEEE80211_HE_PHY_CAP8_20MHZ_IN_160MHZ_HE_PPDU 0x04 +#define IEEE80211_HE_PHY_CAP8_80MHZ_IN_160MHZ_HE_PPDU 0x08 +#define IEEE80211_HE_PHY_CAP8_HE_ER_SU_1XLTF_AND_08_US_GI 0x10 +#define IEEE80211_HE_PHY_CAP8_MIDAMBLE_RX_2X_AND_1XLTF 0x20 + +/* 802.11ax HE TX/RX MCS NSS Support */ +#define IEEE80211_TX_RX_MCS_NSS_SUPP_HIGHEST_MCS_POS (3) +#define IEEE80211_TX_RX_MCS_NSS_SUPP_TX_BITMAP_POS (6) +#define IEEE80211_TX_RX_MCS_NSS_SUPP_RX_BITMAP_POS (11) +#define IEEE80211_TX_RX_MCS_NSS_SUPP_TX_BITMAP_MASK 0x07c0 +#define IEEE80211_TX_RX_MCS_NSS_SUPP_RX_BITMAP_MASK 0xf800 + +/* TX/RX HE MCS Support field Highest MCS subfield encoding */ +enum ieee80211_he_highest_mcs_supported_subfield_enc { + HIGHEST_MCS_SUPPORTED_MCS7 = 0, + HIGHEST_MCS_SUPPORTED_MCS8, + HIGHEST_MCS_SUPPORTED_MCS9, + HIGHEST_MCS_SUPPORTED_MCS10, + HIGHEST_MCS_SUPPORTED_MCS11, +}; + +/* Calculate 802.11ax HE capabilities IE Tx/Rx HE MCS NSS Support Field size */ +static inline u8 +ieee80211_he_mcs_nss_size(const struct ieee80211_he_cap_elem *he_cap) +{ + u8 count = 4; + + if (he_cap->phy_cap_info[0] & + IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G) + count += 4; + + if (he_cap->phy_cap_info[0] & + IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G) + count += 4; + + return count; +} + +/* 802.11ax HE PPE Thresholds */ +#define IEEE80211_PPE_THRES_NSS_SUPPORT_2NSS (1) +#define IEEE80211_PPE_THRES_NSS_POS (0) +#define IEEE80211_PPE_THRES_NSS_MASK (7) +#define IEEE80211_PPE_THRES_RU_INDEX_BITMASK_2x966_AND_966_RU \ + (BIT(5) | BIT(6)) +#define IEEE80211_PPE_THRES_RU_INDEX_BITMASK_MASK 0x78 +#define IEEE80211_PPE_THRES_RU_INDEX_BITMASK_POS (3) +#define IEEE80211_PPE_THRES_INFO_PPET_SIZE (3) + +/* + * Calculate 802.11ax HE capabilities IE PPE field size + * Input: Header byte of ppe_thres (first byte), and HE capa IE's PHY cap u8* + */ +static inline u8 +ieee80211_he_ppe_size(u8 ppe_thres_hdr, const u8 *phy_cap_info) +{ + u8 n; + + if ((phy_cap_info[6] & + IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT) == 0) + return 0; + + n = hweight8(ppe_thres_hdr & + IEEE80211_PPE_THRES_RU_INDEX_BITMASK_MASK); + n *= (1 + ((ppe_thres_hdr & IEEE80211_PPE_THRES_NSS_MASK) >> + IEEE80211_PPE_THRES_NSS_POS)); + + /* + * Each pair is 6 bits, and we need to add the 7 "header" bits to the + * total size. + */ + n = (n * IEEE80211_PPE_THRES_INFO_PPET_SIZE * 2) + 7; + n = DIV_ROUND_UP(n, 8); + + return n; +} + +/* HE Operation defines */ +#define IEEE80211_HE_OPERATION_BSS_COLOR_MASK 0x0000003f +#define IEEE80211_HE_OPERATION_DFLT_PE_DURATION_MASK 0x000001c0 +#define IEEE80211_HE_OPERATION_DFLT_PE_DURATION_OFFSET 6 +#define IEEE80211_HE_OPERATION_TWT_REQUIRED 0x00000200 +#define IEEE80211_HE_OPERATION_RTS_THRESHOLD_MASK 0x000ffc00 +#define IEEE80211_HE_OPERATION_RTS_THRESHOLD_OFFSET 10 +#define IEEE80211_HE_OPERATION_PARTIAL_BSS_COLOR 0x000100000 +#define IEEE80211_HE_OPERATION_VHT_OPER_INFO 0x000200000 +#define IEEE80211_HE_OPERATION_MULTI_BSSID_AP 0x10000000 +#define IEEE80211_HE_OPERATION_TX_BSSID_INDICATOR 0x20000000 +#define IEEE80211_HE_OPERATION_BSS_COLOR_DISABLED 0x40000000 + +/* + * ieee80211_he_oper_size - calculate 802.11ax HE Operations IE size + * @he_oper_ie: byte data of the He Operations IE, stating from the the byte + * after the ext ID byte. It is assumed that he_oper_ie has at least + * sizeof(struct ieee80211_he_operation) bytes, checked already in + * ieee802_11_parse_elems_crc() + * @return the actual size of the IE data (not including header), or 0 on error + */ +static inline u8 +ieee80211_he_oper_size(const u8 *he_oper_ie) +{ + struct ieee80211_he_operation *he_oper = (void *)he_oper_ie; + u8 oper_len = sizeof(struct ieee80211_he_operation); + u32 he_oper_params; + + /* Make sure the input is not NULL */ + if (!he_oper_ie) + return 0; + + /* Calc required length */ + he_oper_params = le32_to_cpu(he_oper->he_oper_params); + if (he_oper_params & IEEE80211_HE_OPERATION_VHT_OPER_INFO) + oper_len += 3; + if (he_oper_params & IEEE80211_HE_OPERATION_MULTI_BSSID_AP) + oper_len++; + + /* Add the first byte (extension ID) to the total length */ + oper_len++; + + return oper_len; +} + /* Authentication algorithms */ #define WLAN_AUTH_OPEN 0 #define WLAN_AUTH_SHARED_KEY 1 @@ -1992,6 +2414,11 @@ enum ieee80211_eid_ext { WLAN_EID_EXT_FILS_WRAPPED_DATA = 8, WLAN_EID_EXT_FILS_PUBLIC_KEY = 12, WLAN_EID_EXT_FILS_NONCE = 13, + WLAN_EID_EXT_FUTURE_CHAN_GUIDANCE = 14, + WLAN_EID_EXT_HE_CAPABILITY = 35, + WLAN_EID_EXT_HE_OPERATION = 36, + WLAN_EID_EXT_UORA = 37, + WLAN_EID_EXT_HE_MU_EDCA = 38, }; /* Action category code */ diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 5fbfe61f41c6..9ba1f289c439 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -285,6 +285,41 @@ struct ieee80211_sta_vht_cap { struct ieee80211_vht_mcs_info vht_mcs; }; +#define IEEE80211_HE_PPE_THRES_MAX_LEN 25 + +/** + * struct ieee80211_sta_he_cap - STA's HE capabilities + * + * This structure describes most essential parameters needed + * to describe 802.11ax HE capabilities for a STA. + * + * @has_he: true iff HE data is valid. + * @he_cap_elem: Fixed portion of the HE capabilities element. + * @he_mcs_nss_supp: The supported NSS/MCS combinations. + * @ppe_thres: Holds the PPE Thresholds data. + */ +struct ieee80211_sta_he_cap { + bool has_he; + struct ieee80211_he_cap_elem he_cap_elem; + struct ieee80211_he_mcs_nss_supp he_mcs_nss_supp; + u8 ppe_thres[IEEE80211_HE_PPE_THRES_MAX_LEN]; +}; + +/** + * struct ieee80211_sband_iftype_data + * + * This structure encapsulates sband data that is relevant for the + * interface types defined in @types_mask. Each type in the + * @types_mask must be unique across all instances of iftype_data. + * + * @types_mask: interface types mask + * @he_cap: holds the HE capabilities + */ +struct ieee80211_sband_iftype_data { + u16 types_mask; + struct ieee80211_sta_he_cap he_cap; +}; + /** * struct ieee80211_supported_band - frequency band definition * @@ -301,6 +336,11 @@ struct ieee80211_sta_vht_cap { * @n_bitrates: Number of bitrates in @bitrates * @ht_cap: HT capabilities in this band * @vht_cap: VHT capabilities in this band + * @n_iftype_data: number of iftype data entries + * @iftype_data: interface type data entries. Note that the bits in + * @types_mask inside this structure cannot overlap (i.e. only + * one occurrence of each type is allowed across all instances of + * iftype_data). */ struct ieee80211_supported_band { struct ieee80211_channel *channels; @@ -310,8 +350,55 @@ struct ieee80211_supported_band { int n_bitrates; struct ieee80211_sta_ht_cap ht_cap; struct ieee80211_sta_vht_cap vht_cap; + u16 n_iftype_data; + const struct ieee80211_sband_iftype_data *iftype_data; }; +/** + * ieee80211_get_sband_iftype_data - return sband data for a given iftype + * @sband: the sband to search for the STA on + * @iftype: enum nl80211_iftype + * + * Return: pointer to struct ieee80211_sband_iftype_data, or NULL is none found + */ +static inline const struct ieee80211_sband_iftype_data * +ieee80211_get_sband_iftype_data(const struct ieee80211_supported_band *sband, + u8 iftype) +{ + int i; + + if (WARN_ON(iftype >= NL80211_IFTYPE_MAX)) + return NULL; + + for (i = 0; i < sband->n_iftype_data; i++) { + const struct ieee80211_sband_iftype_data *data = + &sband->iftype_data[i]; + + if (data->types_mask & BIT(iftype)) + return data; + } + + return NULL; +} + +/** + * ieee80211_get_he_sta_cap - return HE capabilities for an sband's STA + * @sband: the sband to search for the STA on + * + * Return: pointer to the struct ieee80211_sta_he_cap, or NULL is none found + */ +static inline const struct ieee80211_sta_he_cap * +ieee80211_get_he_sta_cap(const struct ieee80211_supported_band *sband) +{ + const struct ieee80211_sband_iftype_data *data = + ieee80211_get_sband_iftype_data(sband, NL80211_IFTYPE_STATION); + + if (data && data->he_cap.has_he) + return &data->he_cap; + + return NULL; +} + /** * wiphy_read_of_freq_limits - read frequency limits from device tree * @@ -899,6 +986,8 @@ enum station_parameters_apply_mask { * @opmode_notif: operating mode field from Operating Mode Notification * @opmode_notif_used: information if operating mode field is used * @support_p2p_ps: information if station supports P2P PS mechanism + * @he_capa: HE capabilities of station + * @he_capa_len: the length of the HE capabilities */ struct station_parameters { const u8 *supported_rates; @@ -926,6 +1015,8 @@ struct station_parameters { u8 opmode_notif; bool opmode_notif_used; int support_p2p_ps; + const struct ieee80211_he_cap_elem *he_capa; + u8 he_capa_len; }; /** @@ -1000,12 +1091,14 @@ int cfg80211_check_station_change(struct wiphy *wiphy, * @RATE_INFO_FLAGS_VHT_MCS: mcs field filled with VHT MCS * @RATE_INFO_FLAGS_SHORT_GI: 400ns guard interval * @RATE_INFO_FLAGS_60G: 60GHz MCS + * @RATE_INFO_FLAGS_HE_MCS: HE MCS information */ enum rate_info_flags { RATE_INFO_FLAGS_MCS = BIT(0), RATE_INFO_FLAGS_VHT_MCS = BIT(1), RATE_INFO_FLAGS_SHORT_GI = BIT(2), RATE_INFO_FLAGS_60G = BIT(3), + RATE_INFO_FLAGS_HE_MCS = BIT(4), }; /** @@ -1019,6 +1112,7 @@ enum rate_info_flags { * @RATE_INFO_BW_40: 40 MHz bandwidth * @RATE_INFO_BW_80: 80 MHz bandwidth * @RATE_INFO_BW_160: 160 MHz bandwidth + * @RATE_INFO_BW_HE_RU: bandwidth determined by HE RU allocation */ enum rate_info_bw { RATE_INFO_BW_20 = 0, @@ -1027,6 +1121,7 @@ enum rate_info_bw { RATE_INFO_BW_40, RATE_INFO_BW_80, RATE_INFO_BW_160, + RATE_INFO_BW_HE_RU, }; /** @@ -1035,10 +1130,14 @@ enum rate_info_bw { * Information about a receiving or transmitting bitrate * * @flags: bitflag of flags from &enum rate_info_flags - * @mcs: mcs index if struct describes a 802.11n bitrate + * @mcs: mcs index if struct describes an HT/VHT/HE rate * @legacy: bitrate in 100kbit/s for 802.11abg - * @nss: number of streams (VHT only) + * @nss: number of streams (VHT & HE only) * @bw: bandwidth (from &enum rate_info_bw) + * @he_gi: HE guard interval (from &enum nl80211_he_gi) + * @he_dcm: HE DCM value + * @he_ru_alloc: HE RU allocation (from &enum nl80211_he_ru_alloc, + * only valid if bw is %RATE_INFO_BW_HE_RU) */ struct rate_info { u8 flags; @@ -1046,6 +1145,9 @@ struct rate_info { u16 legacy; u8 nss; u8 bw; + u8 he_gi; + u8 he_dcm; + u8 he_ru_alloc; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 49f718e821a3..f82ce3c89ab7 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2237,6 +2237,9 @@ enum nl80211_commands { * enforced. * @NL80211_ATTR_TXQ_QUANTUM: TXQ scheduler quantum (bytes). Number of bytes * a flow is assigned on each round of the DRR scheduler. + * @NL80211_ATTR_HE_CAPABILITY: HE Capability information element (from + * association request when used with NL80211_CMD_NEW_STATION). Can be set + * only if %NL80211_STA_FLAG_WME is set. * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined @@ -2677,6 +2680,8 @@ enum nl80211_attrs { NL80211_ATTR_TXQ_MEMORY_LIMIT, NL80211_ATTR_TXQ_QUANTUM, + NL80211_ATTR_HE_CAPABILITY, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -2726,7 +2731,8 @@ enum nl80211_attrs { #define NL80211_TKIP_DATA_OFFSET_RX_MIC_KEY 24 #define NL80211_HT_CAPABILITY_LEN 26 #define NL80211_VHT_CAPABILITY_LEN 12 - +#define NL80211_HE_MIN_CAPABILITY_LEN 16 +#define NL80211_HE_MAX_CAPABILITY_LEN 51 #define NL80211_MAX_NR_CIPHER_SUITES 5 #define NL80211_MAX_NR_AKM_SUITES 2 @@ -2853,6 +2859,38 @@ struct nl80211_sta_flag_update { __u32 set; } __attribute__((packed)); +/** + * enum nl80211_he_gi - HE guard interval + * @NL80211_RATE_INFO_HE_GI_0_8: 0.8 usec + * @NL80211_RATE_INFO_HE_GI_1_6: 1.6 usec + * @NL80211_RATE_INFO_HE_GI_3_2: 3.2 usec + */ +enum nl80211_he_gi { + NL80211_RATE_INFO_HE_GI_0_8, + NL80211_RATE_INFO_HE_GI_1_6, + NL80211_RATE_INFO_HE_GI_3_2, +}; + +/** + * enum nl80211_he_ru_alloc - HE RU allocation values + * @NL80211_RATE_INFO_HE_RU_ALLOC_26: 26-tone RU allocation + * @NL80211_RATE_INFO_HE_RU_ALLOC_52: 52-tone RU allocation + * @NL80211_RATE_INFO_HE_RU_ALLOC_106: 106-tone RU allocation + * @NL80211_RATE_INFO_HE_RU_ALLOC_242: 242-tone RU allocation + * @NL80211_RATE_INFO_HE_RU_ALLOC_484: 484-tone RU allocation + * @NL80211_RATE_INFO_HE_RU_ALLOC_996: 996-tone RU allocation + * @NL80211_RATE_INFO_HE_RU_ALLOC_2x996: 2x996-tone RU allocation + */ +enum nl80211_he_ru_alloc { + NL80211_RATE_INFO_HE_RU_ALLOC_26, + NL80211_RATE_INFO_HE_RU_ALLOC_52, + NL80211_RATE_INFO_HE_RU_ALLOC_106, + NL80211_RATE_INFO_HE_RU_ALLOC_242, + NL80211_RATE_INFO_HE_RU_ALLOC_484, + NL80211_RATE_INFO_HE_RU_ALLOC_996, + NL80211_RATE_INFO_HE_RU_ALLOC_2x996, +}; + /** * enum nl80211_rate_info - bitrate information * @@ -2885,6 +2923,13 @@ struct nl80211_sta_flag_update { * @NL80211_RATE_INFO_5_MHZ_WIDTH: 5 MHz width - note that this is * a legacy rate and will be reported as the actual bitrate, i.e. * a quarter of the base (20 MHz) rate + * @NL80211_RATE_INFO_HE_MCS: HE MCS index (u8, 0-11) + * @NL80211_RATE_INFO_HE_NSS: HE NSS value (u8, 1-8) + * @NL80211_RATE_INFO_HE_GI: HE guard interval identifier + * (u8, see &enum nl80211_he_gi) + * @NL80211_RATE_INFO_HE_DCM: HE DCM value (u8, 0/1) + * @NL80211_RATE_INFO_RU_ALLOC: HE RU allocation, if not present then + * non-OFDMA was used (u8, see &enum nl80211_he_ru_alloc) * @__NL80211_RATE_INFO_AFTER_LAST: internal use */ enum nl80211_rate_info { @@ -2901,6 +2946,11 @@ enum nl80211_rate_info { NL80211_RATE_INFO_160_MHZ_WIDTH, NL80211_RATE_INFO_10_MHZ_WIDTH, NL80211_RATE_INFO_5_MHZ_WIDTH, + NL80211_RATE_INFO_HE_MCS, + NL80211_RATE_INFO_HE_NSS, + NL80211_RATE_INFO_HE_GI, + NL80211_RATE_INFO_HE_DCM, + NL80211_RATE_INFO_HE_RU_ALLOC, /* keep last */ __NL80211_RATE_INFO_AFTER_LAST, @@ -3166,6 +3216,38 @@ enum nl80211_mpath_info { NL80211_MPATH_INFO_MAX = __NL80211_MPATH_INFO_AFTER_LAST - 1 }; +/** + * enum nl80211_band_iftype_attr - Interface type data attributes + * + * @__NL80211_BAND_IFTYPE_ATTR_INVALID: attribute number 0 is reserved + * @NL80211_BAND_IFTYPE_ATTR_IFTYPES: nested attribute containing a flag attribute + * for each interface type that supports the band data + * @NL80211_BAND_IFTYPE_ATTR_HE_CAP_MAC: HE MAC capabilities as in HE + * capabilities IE + * @NL80211_BAND_IFTYPE_ATTR_HE_CAP_PHY: HE PHY capabilities as in HE + * capabilities IE + * @NL80211_BAND_IFTYPE_ATTR_HE_CAP_MCS_SET: HE supported NSS/MCS as in HE + * capabilities IE + * @NL80211_BAND_IFTYPE_ATTR_HE_CAP_PPE: HE PPE thresholds information as + * defined in HE capabilities IE + * @NL80211_BAND_IFTYPE_ATTR_MAX: highest band HE capability attribute currently + * defined + * @__NL80211_BAND_IFTYPE_ATTR_AFTER_LAST: internal use + */ +enum nl80211_band_iftype_attr { + __NL80211_BAND_IFTYPE_ATTR_INVALID, + + NL80211_BAND_IFTYPE_ATTR_IFTYPES, + NL80211_BAND_IFTYPE_ATTR_HE_CAP_MAC, + NL80211_BAND_IFTYPE_ATTR_HE_CAP_PHY, + NL80211_BAND_IFTYPE_ATTR_HE_CAP_MCS_SET, + NL80211_BAND_IFTYPE_ATTR_HE_CAP_PPE, + + /* keep last */ + __NL80211_BAND_IFTYPE_ATTR_AFTER_LAST, + NL80211_BAND_IFTYPE_ATTR_MAX = __NL80211_BAND_IFTYPE_ATTR_AFTER_LAST - 1 +}; + /** * enum nl80211_band_attr - band attributes * @__NL80211_BAND_ATTR_INVALID: attribute number 0 is reserved @@ -3181,6 +3263,8 @@ enum nl80211_mpath_info { * @NL80211_BAND_ATTR_VHT_MCS_SET: 32-byte attribute containing the MCS set as * defined in 802.11ac * @NL80211_BAND_ATTR_VHT_CAPA: VHT capabilities, as in the HT information IE + * @NL80211_BAND_ATTR_IFTYPE_DATA: nested array attribute, with each entry using + * attributes from &enum nl80211_band_iftype_attr * @NL80211_BAND_ATTR_MAX: highest band attribute currently defined * @__NL80211_BAND_ATTR_AFTER_LAST: internal use */ @@ -3196,6 +3280,7 @@ enum nl80211_band_attr { NL80211_BAND_ATTR_VHT_MCS_SET, NL80211_BAND_ATTR_VHT_CAPA, + NL80211_BAND_ATTR_IFTYPE_DATA, /* keep last */ __NL80211_BAND_ATTR_AFTER_LAST, diff --git a/net/wireless/core.c b/net/wireless/core.c index 5fe35aafdd9c..d23abc619e77 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -3,7 +3,7 @@ * * Copyright 2006-2010 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH - * Copyright 2015 Intel Deutschland GmbH + * Copyright 2015-2017 Intel Deutschland GmbH */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -744,6 +744,8 @@ int wiphy_register(struct wiphy *wiphy) /* sanity check supported bands/channels */ for (band = 0; band < NUM_NL80211_BANDS; band++) { + u16 types = 0; + sband = wiphy->bands[band]; if (!sband) continue; @@ -788,6 +790,23 @@ int wiphy_register(struct wiphy *wiphy) sband->channels[i].band = band; } + for (i = 0; i < sband->n_iftype_data; i++) { + const struct ieee80211_sband_iftype_data *iftd; + + iftd = &sband->iftype_data[i]; + + if (WARN_ON(!iftd->types_mask)) + return -EINVAL; + if (WARN_ON(types & iftd->types_mask)) + return -EINVAL; + + /* at least one piece of information must be present */ + if (WARN_ON(!iftd->he_cap.has_he)) + return -EINVAL; + + types |= iftd->types_mask; + } + have_band = true; } diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 7b21914ae18b..0ccce338a66e 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -428,6 +428,8 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_TXQ_LIMIT] = { .type = NLA_U32 }, [NL80211_ATTR_TXQ_MEMORY_LIMIT] = { .type = NLA_U32 }, [NL80211_ATTR_TXQ_QUANTUM] = { .type = NLA_U32 }, + [NL80211_ATTR_HE_CAPABILITY] = { .type = NLA_BINARY, + .len = NL80211_HE_MAX_CAPABILITY_LEN }, }; /* policy for the key attributes */ @@ -1324,6 +1326,34 @@ static int nl80211_send_coalesce(struct sk_buff *msg, return 0; } +static int +nl80211_send_iftype_data(struct sk_buff *msg, + const struct ieee80211_sband_iftype_data *iftdata) +{ + const struct ieee80211_sta_he_cap *he_cap = &iftdata->he_cap; + + if (nl80211_put_iftypes(msg, NL80211_BAND_IFTYPE_ATTR_IFTYPES, + iftdata->types_mask)) + return -ENOBUFS; + + if (he_cap->has_he) { + if (nla_put(msg, NL80211_BAND_IFTYPE_ATTR_HE_CAP_MAC, + sizeof(he_cap->he_cap_elem.mac_cap_info), + he_cap->he_cap_elem.mac_cap_info) || + nla_put(msg, NL80211_BAND_IFTYPE_ATTR_HE_CAP_PHY, + sizeof(he_cap->he_cap_elem.phy_cap_info), + he_cap->he_cap_elem.phy_cap_info) || + nla_put(msg, NL80211_BAND_IFTYPE_ATTR_HE_CAP_MCS_SET, + sizeof(he_cap->he_mcs_nss_supp), + &he_cap->he_mcs_nss_supp) || + nla_put(msg, NL80211_BAND_IFTYPE_ATTR_HE_CAP_PPE, + sizeof(he_cap->ppe_thres), he_cap->ppe_thres)) + return -ENOBUFS; + } + + return 0; +} + static int nl80211_send_band_rateinfo(struct sk_buff *msg, struct ieee80211_supported_band *sband) { @@ -1353,6 +1383,32 @@ static int nl80211_send_band_rateinfo(struct sk_buff *msg, sband->vht_cap.cap))) return -ENOBUFS; + if (sband->n_iftype_data) { + struct nlattr *nl_iftype_data = + nla_nest_start(msg, NL80211_BAND_ATTR_IFTYPE_DATA); + int err; + + if (!nl_iftype_data) + return -ENOBUFS; + + for (i = 0; i < sband->n_iftype_data; i++) { + struct nlattr *iftdata; + + iftdata = nla_nest_start(msg, i + 1); + if (!iftdata) + return -ENOBUFS; + + err = nl80211_send_iftype_data(msg, + &sband->iftype_data[i]); + if (err) + return err; + + nla_nest_end(msg, iftdata); + } + + nla_nest_end(msg, nl_iftype_data); + } + /* add bitrates */ nl_rates = nla_nest_start(msg, NL80211_BAND_ATTR_RATES); if (!nl_rates) @@ -4472,6 +4528,9 @@ static bool nl80211_put_sta_rate(struct sk_buff *msg, struct rate_info *info, case RATE_INFO_BW_160: rate_flg = NL80211_RATE_INFO_160_MHZ_WIDTH; break; + case RATE_INFO_BW_HE_RU: + rate_flg = 0; + WARN_ON(!(info->flags & RATE_INFO_FLAGS_HE_MCS)); } if (rate_flg && nla_put_flag(msg, rate_flg)) @@ -4491,6 +4550,19 @@ static bool nl80211_put_sta_rate(struct sk_buff *msg, struct rate_info *info, if (info->flags & RATE_INFO_FLAGS_SHORT_GI && nla_put_flag(msg, NL80211_RATE_INFO_SHORT_GI)) return false; + } else if (info->flags & RATE_INFO_FLAGS_HE_MCS) { + if (nla_put_u8(msg, NL80211_RATE_INFO_HE_MCS, info->mcs)) + return false; + if (nla_put_u8(msg, NL80211_RATE_INFO_HE_NSS, info->nss)) + return false; + if (nla_put_u8(msg, NL80211_RATE_INFO_HE_GI, info->he_gi)) + return false; + if (nla_put_u8(msg, NL80211_RATE_INFO_HE_DCM, info->he_dcm)) + return false; + if (info->bw == RATE_INFO_BW_HE_RU && + nla_put_u8(msg, NL80211_RATE_INFO_HE_RU_ALLOC, + info->he_ru_alloc)) + return false; } nla_nest_end(msg, rate); @@ -4887,7 +4959,8 @@ int cfg80211_check_station_change(struct wiphy *wiphy, return -EINVAL; if (params->supported_rates) return -EINVAL; - if (params->ext_capab || params->ht_capa || params->vht_capa) + if (params->ext_capab || params->ht_capa || params->vht_capa || + params->he_capa) return -EINVAL; } @@ -5093,6 +5166,15 @@ static int nl80211_set_station_tdls(struct genl_info *info, if (info->attrs[NL80211_ATTR_VHT_CAPABILITY]) params->vht_capa = nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY]); + if (info->attrs[NL80211_ATTR_HE_CAPABILITY]) { + params->he_capa = + nla_data(info->attrs[NL80211_ATTR_HE_CAPABILITY]); + params->he_capa_len = + nla_len(info->attrs[NL80211_ATTR_HE_CAPABILITY]); + + if (params->he_capa_len < NL80211_HE_MIN_CAPABILITY_LEN) + return -EINVAL; + } err = nl80211_parse_sta_channel_info(info, params); if (err) @@ -5320,6 +5402,17 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) params.vht_capa = nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY]); + if (info->attrs[NL80211_ATTR_HE_CAPABILITY]) { + params.he_capa = + nla_data(info->attrs[NL80211_ATTR_HE_CAPABILITY]); + params.he_capa_len = + nla_len(info->attrs[NL80211_ATTR_HE_CAPABILITY]); + + /* max len is validated in nla policy */ + if (params.he_capa_len < NL80211_HE_MIN_CAPABILITY_LEN) + return -EINVAL; + } + if (info->attrs[NL80211_ATTR_OPMODE_NOTIF]) { params.opmode_notif_used = true; params.opmode_notif = @@ -5352,6 +5445,10 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) if (!(params.sta_flags_set & BIT(NL80211_STA_FLAG_WME))) { params.ht_capa = NULL; params.vht_capa = NULL; + + /* HE requires WME */ + if (params.he_capa_len) + return -EINVAL; } /* When you run into this, adjust the code below for the new flag */ diff --git a/net/wireless/util.c b/net/wireless/util.c index b91597a8baa2..4ed06b271f32 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -4,6 +4,7 @@ * * Copyright 2007-2009 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH + * Copyright 2017 Intel Deutschland GmbH */ #include #include @@ -1142,6 +1143,85 @@ static u32 cfg80211_calculate_bitrate_vht(struct rate_info *rate) return 0; } +static u32 cfg80211_calculate_bitrate_he(struct rate_info *rate) +{ +#define SCALE 2048 + u16 mcs_divisors[12] = { + 34133, /* 16.666666... */ + 17067, /* 8.333333... */ + 11378, /* 5.555555... */ + 8533, /* 4.166666... */ + 5689, /* 2.777777... */ + 4267, /* 2.083333... */ + 3923, /* 1.851851... */ + 3413, /* 1.666666... */ + 2844, /* 1.388888... */ + 2560, /* 1.250000... */ + 2276, /* 1.111111... */ + 2048, /* 1.000000... */ + }; + u32 rates_160M[3] = { 960777777, 907400000, 816666666 }; + u32 rates_969[3] = { 480388888, 453700000, 408333333 }; + u32 rates_484[3] = { 229411111, 216666666, 195000000 }; + u32 rates_242[3] = { 114711111, 108333333, 97500000 }; + u32 rates_106[3] = { 40000000, 37777777, 34000000 }; + u32 rates_52[3] = { 18820000, 17777777, 16000000 }; + u32 rates_26[3] = { 9411111, 8888888, 8000000 }; + u64 tmp; + u32 result; + + if (WARN_ON_ONCE(rate->mcs > 11)) + return 0; + + if (WARN_ON_ONCE(rate->he_gi > NL80211_RATE_INFO_HE_GI_3_2)) + return 0; + if (WARN_ON_ONCE(rate->he_ru_alloc > + NL80211_RATE_INFO_HE_RU_ALLOC_2x996)) + return 0; + if (WARN_ON_ONCE(rate->nss < 1 || rate->nss > 8)) + return 0; + + if (rate->bw == RATE_INFO_BW_160) + result = rates_160M[rate->he_gi]; + else if (rate->bw == RATE_INFO_BW_80 || + (rate->bw == RATE_INFO_BW_HE_RU && + rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_996)) + result = rates_969[rate->he_gi]; + else if (rate->bw == RATE_INFO_BW_40 || + (rate->bw == RATE_INFO_BW_HE_RU && + rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_484)) + result = rates_484[rate->he_gi]; + else if (rate->bw == RATE_INFO_BW_20 || + (rate->bw == RATE_INFO_BW_HE_RU && + rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_242)) + result = rates_242[rate->he_gi]; + else if (rate->bw == RATE_INFO_BW_HE_RU && + rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_106) + result = rates_106[rate->he_gi]; + else if (rate->bw == RATE_INFO_BW_HE_RU && + rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_52) + result = rates_52[rate->he_gi]; + else if (rate->bw == RATE_INFO_BW_HE_RU && + rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_26) + result = rates_26[rate->he_gi]; + else if (WARN(1, "invalid HE MCS: bw:%d, ru:%d\n", + rate->bw, rate->he_ru_alloc)) + return 0; + + /* now scale to the appropriate MCS */ + tmp = result; + tmp *= SCALE; + do_div(tmp, mcs_divisors[rate->mcs]); + result = tmp; + + /* and take NSS, DCM into account */ + result = (result * rate->nss) / 8; + if (rate->he_dcm) + result /= 2; + + return result; +} + u32 cfg80211_calculate_bitrate(struct rate_info *rate) { if (rate->flags & RATE_INFO_FLAGS_MCS) @@ -1150,6 +1230,8 @@ u32 cfg80211_calculate_bitrate(struct rate_info *rate) return cfg80211_calculate_bitrate_60g(rate); if (rate->flags & RATE_INFO_FLAGS_VHT_MCS) return cfg80211_calculate_bitrate_vht(rate); + if (rate->flags & RATE_INFO_FLAGS_HE_MCS) + return cfg80211_calculate_bitrate_he(rate); return rate->legacy; } -- cgit v1.2.3-59-g8ed1b From b8042b3da925f390c1482bf9dc0898dc0b3ea7b5 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 18 Jun 2018 22:39:29 +0200 Subject: ieee80211: bump IEEE80211_MAX_AMPDU_BUF to support HE Bump the IEEE80211_MAX_AMPDU_BUF size to 0x100 for HE support and - for now - use IEEE80211_MAX_AMPDU_BUF_HT everywhere. This is derived from my internal patch, parts of which Luca had sent upstream. Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- drivers/net/wireless/realtek/rtlwifi/base.c | 2 +- drivers/staging/rtl8188eu/include/wifi.h | 1 - drivers/staging/rtl8712/wifi.h | 1 - drivers/staging/rtl8723bs/include/wifi.h | 1 - drivers/staging/rtlwifi/base.c | 2 +- include/linux/ieee80211.h | 10 ++++++---- net/mac80211/agg-rx.c | 4 ++-- net/mac80211/agg-tx.c | 2 +- net/mac80211/ht.c | 2 +- net/mac80211/main.c | 4 ++-- 10 files changed, 14 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/wireless/realtek/rtlwifi/base.c b/drivers/net/wireless/realtek/rtlwifi/base.c index 39c817eddd78..31bd6f714052 100644 --- a/drivers/net/wireless/realtek/rtlwifi/base.c +++ b/drivers/net/wireless/realtek/rtlwifi/base.c @@ -1904,7 +1904,7 @@ void rtl_rx_ampdu_apply(struct rtl_priv *rtlpriv) reject_agg, ctrl_agg_size, agg_size); rtlpriv->hw->max_rx_aggregation_subframes = - (ctrl_agg_size ? agg_size : IEEE80211_MAX_AMPDU_BUF); + (ctrl_agg_size ? agg_size : IEEE80211_MAX_AMPDU_BUF_HT); } EXPORT_SYMBOL(rtl_rx_ampdu_apply); diff --git a/drivers/staging/rtl8188eu/include/wifi.h b/drivers/staging/rtl8188eu/include/wifi.h index 084a246eec19..6790b7c8cfb1 100644 --- a/drivers/staging/rtl8188eu/include/wifi.h +++ b/drivers/staging/rtl8188eu/include/wifi.h @@ -575,7 +575,6 @@ enum ht_cap_ampdu_factor { * According to IEEE802.11n spec size varies from 8K to 64K (in powers of 2) */ #define IEEE80211_MIN_AMPDU_BUF 0x8 -#define IEEE80211_MAX_AMPDU_BUF 0x40 #define OP_MODE_PURE 0 diff --git a/drivers/staging/rtl8712/wifi.h b/drivers/staging/rtl8712/wifi.h index 0ed2f44ab4e9..00a4302e9983 100644 --- a/drivers/staging/rtl8712/wifi.h +++ b/drivers/staging/rtl8712/wifi.h @@ -574,7 +574,6 @@ struct ieee80211_ht_addt_info { * According to IEEE802.11n spec size varies from 8K to 64K (in powers of 2) */ #define IEEE80211_MIN_AMPDU_BUF 0x8 -#define IEEE80211_MAX_AMPDU_BUF 0x40 /* Spatial Multiplexing Power Save Modes */ diff --git a/drivers/staging/rtl8723bs/include/wifi.h b/drivers/staging/rtl8723bs/include/wifi.h index 08bc79840b23..559bf2606fb7 100644 --- a/drivers/staging/rtl8723bs/include/wifi.h +++ b/drivers/staging/rtl8723bs/include/wifi.h @@ -799,7 +799,6 @@ enum HT_CAP_AMPDU_FACTOR { * According to IEEE802.11n spec size varies from 8K to 64K (in powers of 2) */ #define IEEE80211_MIN_AMPDU_BUF 0x8 -#define IEEE80211_MAX_AMPDU_BUF 0x40 /* Spatial Multiplexing Power Save Modes */ diff --git a/drivers/staging/rtlwifi/base.c b/drivers/staging/rtlwifi/base.c index e46e47d93d7d..094827c1879a 100644 --- a/drivers/staging/rtlwifi/base.c +++ b/drivers/staging/rtlwifi/base.c @@ -1838,7 +1838,7 @@ void rtl_rx_ampdu_apply(struct rtl_priv *rtlpriv) reject_agg, ctrl_agg_size, agg_size); rtlpriv->hw->max_rx_aggregation_subframes = - (ctrl_agg_size ? agg_size : IEEE80211_MAX_AMPDU_BUF); + (ctrl_agg_size ? agg_size : IEEE80211_MAX_AMPDU_BUF_HT); } /********************************************************* diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index e6a6503bfa33..9c03a7d5e400 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1433,11 +1433,13 @@ struct ieee80211_ht_operation { #define IEEE80211_DELBA_PARAM_INITIATOR_MASK 0x0800 /* - * A-PMDU buffer sizes - * According to IEEE802.11n spec size varies from 8K to 64K (in powers of 2) + * A-MPDU buffer sizes + * According to HT size varies from 8 to 64 frames + * HE adds the ability to have up to 256 frames. */ -#define IEEE80211_MIN_AMPDU_BUF 0x8 -#define IEEE80211_MAX_AMPDU_BUF 0x40 +#define IEEE80211_MIN_AMPDU_BUF 0x8 +#define IEEE80211_MAX_AMPDU_BUF_HT 0x40 +#define IEEE80211_MAX_AMPDU_BUF 0x100 /* Spatial Multiplexing Power Save Modes (for capability) */ diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index e83c19d4c292..3ffd853b483f 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -274,7 +274,7 @@ void ___ieee80211_start_rx_ba_session(struct sta_info *sta, /* XXX: check own ht delayed BA capability?? */ if (((ba_policy != 1) && (!(sta->sta.ht_cap.cap & IEEE80211_HT_CAP_DELAY_BA))) || - (buf_size > IEEE80211_MAX_AMPDU_BUF)) { + (buf_size > IEEE80211_MAX_AMPDU_BUF_HT)) { status = WLAN_STATUS_INVALID_QOS_PARAM; ht_dbg_ratelimited(sta->sdata, "AddBA Req with bad params from %pM on tid %u. policy %d, buffer size %d\n", @@ -283,7 +283,7 @@ void ___ieee80211_start_rx_ba_session(struct sta_info *sta, } /* determine default buffer size */ if (buf_size == 0) - buf_size = IEEE80211_MAX_AMPDU_BUF; + buf_size = IEEE80211_MAX_AMPDU_BUF_HT; /* make sure the size doesn't exceed the maximum supported by the hw */ if (buf_size > sta->sta.max_rx_aggregation_subframes) diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index ac4295296514..86c6bc0432ba 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -514,7 +514,7 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid) /* send AddBA request */ ieee80211_send_addba_request(sdata, sta->sta.addr, tid, tid_tx->dialog_token, params.ssn, - IEEE80211_MAX_AMPDU_BUF, + IEEE80211_MAX_AMPDU_BUF_HT, tid_tx->timeout); } diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c index 26a7ba3b698f..f849ea814993 100644 --- a/net/mac80211/ht.c +++ b/net/mac80211/ht.c @@ -352,7 +352,7 @@ void ieee80211_ba_session_work(struct work_struct *work) test_and_clear_bit(tid, sta->ampdu_mlme.tid_rx_manage_offl)) ___ieee80211_start_rx_ba_session(sta, 0, 0, 0, 1, tid, - IEEE80211_MAX_AMPDU_BUF, + IEEE80211_MAX_AMPDU_BUF_HT, false, true); if (test_and_clear_bit(tid + IEEE80211_NUM_TIDS, diff --git a/net/mac80211/main.c b/net/mac80211/main.c index a6f8e3a646d4..070f77862014 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -597,8 +597,8 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, local->hw.queues = 1; local->hw.max_rates = 1; local->hw.max_report_rates = 0; - local->hw.max_rx_aggregation_subframes = IEEE80211_MAX_AMPDU_BUF; - local->hw.max_tx_aggregation_subframes = IEEE80211_MAX_AMPDU_BUF; + local->hw.max_rx_aggregation_subframes = IEEE80211_MAX_AMPDU_BUF_HT; + local->hw.max_tx_aggregation_subframes = IEEE80211_MAX_AMPDU_BUF_HT; local->hw.offchannel_tx_hw_queue = IEEE80211_INVAL_HW_QUEUE; local->hw.conf.long_frame_max_tx_count = wiphy->retry_long; local->hw.conf.short_frame_max_tx_count = wiphy->retry_short; -- cgit v1.2.3-59-g8ed1b From 38b7ca927d6a12bf4466be22a90b7d998f5ae69a Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 8 Mar 2018 14:36:27 +0200 Subject: net/mlx5: Expose DEVX specification This patch updates the mlx5_ifc structures and command interface to support DEVX. Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 4 ++ include/linux/mlx5/device.h | 3 ++ include/linux/mlx5/mlx5_ifc.h | 67 ++++++++++++++++++++++++++- 3 files changed, 73 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index b99d6df3905b..d07f24de8fa3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -310,6 +310,7 @@ static int mlx5_internal_err_ret_value(struct mlx5_core_dev *dev, u16 op, case MLX5_CMD_OP_DEALLOC_ENCAP_HEADER: case MLX5_CMD_OP_DEALLOC_MODIFY_HEADER_CONTEXT: case MLX5_CMD_OP_FPGA_DESTROY_QP: + case MLX5_CMD_OP_DESTROY_GENERAL_OBJECT: return MLX5_CMD_STAT_OK; case MLX5_CMD_OP_QUERY_HCA_CAP: @@ -427,6 +428,7 @@ static int mlx5_internal_err_ret_value(struct mlx5_core_dev *dev, u16 op, case MLX5_CMD_OP_FPGA_MODIFY_QP: case MLX5_CMD_OP_FPGA_QUERY_QP: case MLX5_CMD_OP_FPGA_QUERY_QP_COUNTERS: + case MLX5_CMD_OP_CREATE_GENERAL_OBJECT: *status = MLX5_DRIVER_STATUS_ABORTED; *synd = MLX5_DRIVER_SYND; return -EIO; @@ -599,6 +601,8 @@ const char *mlx5_command_str(int command) MLX5_COMMAND_STR_CASE(FPGA_QUERY_QP); MLX5_COMMAND_STR_CASE(FPGA_QUERY_QP_COUNTERS); MLX5_COMMAND_STR_CASE(FPGA_DESTROY_QP); + MLX5_COMMAND_STR_CASE(CREATE_GENERAL_OBJECT); + MLX5_COMMAND_STR_CASE(DESTROY_GENERAL_OBJECT); default: return "unknown command opcode"; } } diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 02f72ebf31a7..f8671c0a43aa 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1071,6 +1071,9 @@ enum mlx5_qcam_feature_groups { #define MLX5_CAP_GEN(mdev, cap) \ MLX5_GET(cmd_hca_cap, mdev->caps.hca_cur[MLX5_CAP_GENERAL], cap) +#define MLX5_CAP_GEN_64(mdev, cap) \ + MLX5_GET64(cmd_hca_cap, mdev->caps.hca_cur[MLX5_CAP_GENERAL], cap) + #define MLX5_CAP_GEN_MAX(mdev, cap) \ MLX5_GET(cmd_hca_cap, mdev->caps.hca_max[MLX5_CAP_GENERAL], cap) diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 27134c4fcb76..f810772e80c0 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -75,6 +75,15 @@ enum { MLX5_SET_HCA_CAP_OP_MOD_ATOMIC = 0x3, }; +enum { + MLX5_GENERAL_OBJ_TYPES_CAP_UCTX = (1ULL << 4), + MLX5_GENERAL_OBJ_TYPES_CAP_UMEM = (1ULL << 5), +}; + +enum { + MLX5_OBJ_TYPE_UCTX = 0x0004, +}; + enum { MLX5_CMD_OP_QUERY_HCA_CAP = 0x100, MLX5_CMD_OP_QUERY_ADAPTER = 0x101, @@ -242,6 +251,8 @@ enum { MLX5_CMD_OP_FPGA_QUERY_QP = 0x962, MLX5_CMD_OP_FPGA_DESTROY_QP = 0x963, MLX5_CMD_OP_FPGA_QUERY_QP_COUNTERS = 0x964, + MLX5_CMD_OP_CREATE_GENERAL_OBJECT = 0xa00, + MLX5_CMD_OP_DESTROY_GENERAL_OBJECT = 0xa03, MLX5_CMD_OP_MAX }; @@ -1113,7 +1124,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_3f8[0x3]; u8 log_max_current_uc_list[0x5]; - u8 reserved_at_400[0x80]; + u8 general_obj_types[0x40]; + + u8 reserved_at_440[0x40]; u8 reserved_at_480[0x3]; u8 log_max_l2_table[0x5]; @@ -9115,4 +9128,56 @@ struct mlx5_ifc_dealloc_memic_out_bits { u8 reserved_at_40[0x40]; }; +struct mlx5_ifc_general_obj_in_cmd_hdr_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x10]; + u8 obj_type[0x10]; + + u8 obj_id[0x20]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_general_obj_out_cmd_hdr_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 obj_id[0x20]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_umem_bits { + u8 modify_field_select[0x40]; + + u8 reserved_at_40[0x5b]; + u8 log_page_size[0x5]; + + u8 page_offset[0x20]; + + u8 num_of_mtt[0x40]; + + struct mlx5_ifc_mtt_bits mtt[0]; +}; + +struct mlx5_ifc_uctx_bits { + u8 modify_field_select[0x40]; + + u8 reserved_at_40[0x1c0]; +}; + +struct mlx5_ifc_create_umem_in_bits { + struct mlx5_ifc_general_obj_in_cmd_hdr_bits hdr; + struct mlx5_ifc_umem_bits umem; +}; + +struct mlx5_ifc_create_uctx_in_bits { + struct mlx5_ifc_general_obj_in_cmd_hdr_bits hdr; + struct mlx5_ifc_uctx_bits uctx; +}; + #endif /* MLX5_IFC_H */ -- cgit v1.2.3-59-g8ed1b From 0af5107cd0640ee3424e337b492e4b11b450ce28 Mon Sep 17 00:00:00 2001 From: Talat Batheesh Date: Thu, 17 May 2018 11:14:18 +0300 Subject: net/mlx5: Add RoCE RX ICRC encapsulated counter Add capability bit in PCAM register and RoCE ICRC error counter to PPCNT register. Signed-off-by: Talat Batheesh Reviewed-by: Mark Bloch Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index f810772e80c0..deb3a459a22e 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1681,7 +1681,11 @@ struct mlx5_ifc_eth_extended_cntrs_grp_data_layout_bits { u8 rx_buffer_full_low[0x20]; - u8 reserved_at_1c0[0x600]; + u8 rx_icrc_encapsulated_high[0x20]; + + u8 rx_icrc_encapsulated_low[0x20]; + + u8 reserved_at_200[0x5c0]; }; struct mlx5_ifc_eth_3635_cntrs_grp_data_layout_bits { @@ -8044,8 +8048,9 @@ struct mlx5_ifc_peir_reg_bits { }; struct mlx5_ifc_pcam_enhanced_features_bits { - u8 reserved_at_0[0x76]; - + u8 reserved_at_0[0x6d]; + u8 rx_icrc_encapsulated_counter[0x1]; + u8 reserved_at_6e[0x8]; u8 pfcc_mask[0x1]; u8 reserved_at_77[0x4]; u8 rx_buffer_fullness_counters[0x1]; -- cgit v1.2.3-59-g8ed1b From 0eb71a9da5796851fa87ddc1a534066c0fe54055 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 18 Jun 2018 12:52:50 +1000 Subject: rhashtable: split rhashtable.h Due to the use of rhashtables in net namespaces, rhashtable.h is included in lots of the kernel, so a small changes can required a large recompilation. This makes development painful. This patch splits out rhashtable-types.h which just includes the major type declarations, and does not include (non-trivial) inline code. rhashtable.h is no longer included by anything in the include/ directory. Common include files only include rhashtable-types.h so a large recompilation is only triggered when that changes. Acked-by: Herbert Xu Signed-off-by: NeilBrown Signed-off-by: David S. Miller --- MAINTAINERS | 2 + drivers/net/ethernet/chelsio/cxgb4/cxgb4.h | 1 + include/linux/ipc.h | 2 +- include/linux/ipc_namespace.h | 2 +- include/linux/mroute_base.h | 2 +- include/linux/rhashtable-types.h | 139 +++++++++++++++++++++++++++++ include/linux/rhashtable.h | 127 +------------------------- include/net/inet_frag.h | 2 +- include/net/netfilter/nf_flow_table.h | 2 +- include/net/sctp/structs.h | 2 +- include/net/seg6.h | 2 +- include/net/seg6_hmac.h | 2 +- ipc/msg.c | 1 + ipc/sem.c | 1 + ipc/shm.c | 1 + ipc/util.c | 1 + lib/rhashtable.c | 1 + net/ipv4/inet_fragment.c | 1 + net/ipv4/ipmr.c | 1 + net/ipv4/ipmr_base.c | 1 + net/ipv6/ip6mr.c | 1 + net/ipv6/seg6.c | 1 + net/ipv6/seg6_hmac.c | 1 + net/netfilter/nf_tables_api.c | 1 + net/sctp/input.c | 1 + net/sctp/socket.c | 1 + 26 files changed, 166 insertions(+), 133 deletions(-) create mode 100644 include/linux/rhashtable-types.h (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index edf3cf5ea691..99e5cef8172e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12162,7 +12162,9 @@ M: Herbert Xu L: netdev@vger.kernel.org S: Maintained F: lib/rhashtable.c +F: lib/test_rhashtable.c F: include/linux/rhashtable.h +F: include/linux/rhashtable-types.h RICOH R5C592 MEMORYSTICK DRIVER M: Maxim Levitsky diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h index 0dbe2d9e22d6..1adb968b8354 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h @@ -46,6 +46,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/ipc.h b/include/linux/ipc.h index 6cc2df7f7ac9..e1c9eea6015b 100644 --- a/include/linux/ipc.h +++ b/include/linux/ipc.h @@ -4,7 +4,7 @@ #include #include -#include +#include #include #include diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index b5630c8eb2f3..6cea726612b7 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -9,7 +9,7 @@ #include #include #include -#include +#include struct user_namespace; diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h index d633f737b3c6..fd436cdd4725 100644 --- a/include/linux/mroute_base.h +++ b/include/linux/mroute_base.h @@ -2,7 +2,7 @@ #define __LINUX_MROUTE_BASE_H #include -#include +#include #include #include #include diff --git a/include/linux/rhashtable-types.h b/include/linux/rhashtable-types.h new file mode 100644 index 000000000000..9740063ff13b --- /dev/null +++ b/include/linux/rhashtable-types.h @@ -0,0 +1,139 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Resizable, Scalable, Concurrent Hash Table + * + * Simple structures that might be needed in include + * files. + */ + +#ifndef _LINUX_RHASHTABLE_TYPES_H +#define _LINUX_RHASHTABLE_TYPES_H + +#include +#include +#include +#include + +struct rhash_head { + struct rhash_head __rcu *next; +}; + +struct rhlist_head { + struct rhash_head rhead; + struct rhlist_head __rcu *next; +}; + +struct bucket_table; + +/** + * struct rhashtable_compare_arg - Key for the function rhashtable_compare + * @ht: Hash table + * @key: Key to compare against + */ +struct rhashtable_compare_arg { + struct rhashtable *ht; + const void *key; +}; + +typedef u32 (*rht_hashfn_t)(const void *data, u32 len, u32 seed); +typedef u32 (*rht_obj_hashfn_t)(const void *data, u32 len, u32 seed); +typedef int (*rht_obj_cmpfn_t)(struct rhashtable_compare_arg *arg, + const void *obj); + +/** + * struct rhashtable_params - Hash table construction parameters + * @nelem_hint: Hint on number of elements, should be 75% of desired size + * @key_len: Length of key + * @key_offset: Offset of key in struct to be hashed + * @head_offset: Offset of rhash_head in struct to be hashed + * @max_size: Maximum size while expanding + * @min_size: Minimum size while shrinking + * @locks_mul: Number of bucket locks to allocate per cpu (default: 32) + * @automatic_shrinking: Enable automatic shrinking of tables + * @nulls_base: Base value to generate nulls marker + * @hashfn: Hash function (default: jhash2 if !(key_len % 4), or jhash) + * @obj_hashfn: Function to hash object + * @obj_cmpfn: Function to compare key with object + */ +struct rhashtable_params { + u16 nelem_hint; + u16 key_len; + u16 key_offset; + u16 head_offset; + unsigned int max_size; + u16 min_size; + bool automatic_shrinking; + u8 locks_mul; + u32 nulls_base; + rht_hashfn_t hashfn; + rht_obj_hashfn_t obj_hashfn; + rht_obj_cmpfn_t obj_cmpfn; +}; + +/** + * struct rhashtable - Hash table handle + * @tbl: Bucket table + * @key_len: Key length for hashfn + * @max_elems: Maximum number of elements in table + * @p: Configuration parameters + * @rhlist: True if this is an rhltable + * @run_work: Deferred worker to expand/shrink asynchronously + * @mutex: Mutex to protect current/future table swapping + * @lock: Spin lock to protect walker list + * @nelems: Number of elements in table + */ +struct rhashtable { + struct bucket_table __rcu *tbl; + unsigned int key_len; + unsigned int max_elems; + struct rhashtable_params p; + bool rhlist; + struct work_struct run_work; + struct mutex mutex; + spinlock_t lock; + atomic_t nelems; +}; + +/** + * struct rhltable - Hash table with duplicate objects in a list + * @ht: Underlying rhtable + */ +struct rhltable { + struct rhashtable ht; +}; + +/** + * struct rhashtable_walker - Hash table walker + * @list: List entry on list of walkers + * @tbl: The table that we were walking over + */ +struct rhashtable_walker { + struct list_head list; + struct bucket_table *tbl; +}; + +/** + * struct rhashtable_iter - Hash table iterator + * @ht: Table to iterate through + * @p: Current pointer + * @list: Current hash list pointer + * @walker: Associated rhashtable walker + * @slot: Current slot + * @skip: Number of entries to skip in slot + */ +struct rhashtable_iter { + struct rhashtable *ht; + struct rhash_head *p; + struct rhlist_head *list; + struct rhashtable_walker walker; + unsigned int slot; + unsigned int skip; + bool end_of_table; +}; + +int rhashtable_init(struct rhashtable *ht, + const struct rhashtable_params *params); +int rhltable_init(struct rhltable *hlt, + const struct rhashtable_params *params); + +#endif /* _LINUX_RHASHTABLE_TYPES_H */ diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 4e1f535c2034..48754ab07cdf 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Resizable, Scalable, Concurrent Hash Table * @@ -17,16 +18,14 @@ #ifndef _LINUX_RHASHTABLE_H #define _LINUX_RHASHTABLE_H -#include -#include #include #include #include #include #include -#include #include +#include /* * The end of the chain is marked with a special nulls marks which has * the following format: @@ -64,15 +63,6 @@ */ #define RHT_ELASTICITY 16u -struct rhash_head { - struct rhash_head __rcu *next; -}; - -struct rhlist_head { - struct rhash_head rhead; - struct rhlist_head __rcu *next; -}; - /** * struct bucket_table - Table of hash buckets * @size: Number of hash buckets @@ -102,114 +92,6 @@ struct bucket_table { struct rhash_head __rcu *buckets[] ____cacheline_aligned_in_smp; }; -/** - * struct rhashtable_compare_arg - Key for the function rhashtable_compare - * @ht: Hash table - * @key: Key to compare against - */ -struct rhashtable_compare_arg { - struct rhashtable *ht; - const void *key; -}; - -typedef u32 (*rht_hashfn_t)(const void *data, u32 len, u32 seed); -typedef u32 (*rht_obj_hashfn_t)(const void *data, u32 len, u32 seed); -typedef int (*rht_obj_cmpfn_t)(struct rhashtable_compare_arg *arg, - const void *obj); - -struct rhashtable; - -/** - * struct rhashtable_params - Hash table construction parameters - * @nelem_hint: Hint on number of elements, should be 75% of desired size - * @key_len: Length of key - * @key_offset: Offset of key in struct to be hashed - * @head_offset: Offset of rhash_head in struct to be hashed - * @max_size: Maximum size while expanding - * @min_size: Minimum size while shrinking - * @locks_mul: Number of bucket locks to allocate per cpu (default: 32) - * @automatic_shrinking: Enable automatic shrinking of tables - * @nulls_base: Base value to generate nulls marker - * @hashfn: Hash function (default: jhash2 if !(key_len % 4), or jhash) - * @obj_hashfn: Function to hash object - * @obj_cmpfn: Function to compare key with object - */ -struct rhashtable_params { - u16 nelem_hint; - u16 key_len; - u16 key_offset; - u16 head_offset; - unsigned int max_size; - u16 min_size; - bool automatic_shrinking; - u8 locks_mul; - u32 nulls_base; - rht_hashfn_t hashfn; - rht_obj_hashfn_t obj_hashfn; - rht_obj_cmpfn_t obj_cmpfn; -}; - -/** - * struct rhashtable - Hash table handle - * @tbl: Bucket table - * @key_len: Key length for hashfn - * @max_elems: Maximum number of elements in table - * @p: Configuration parameters - * @rhlist: True if this is an rhltable - * @run_work: Deferred worker to expand/shrink asynchronously - * @mutex: Mutex to protect current/future table swapping - * @lock: Spin lock to protect walker list - * @nelems: Number of elements in table - */ -struct rhashtable { - struct bucket_table __rcu *tbl; - unsigned int key_len; - unsigned int max_elems; - struct rhashtable_params p; - bool rhlist; - struct work_struct run_work; - struct mutex mutex; - spinlock_t lock; - atomic_t nelems; -}; - -/** - * struct rhltable - Hash table with duplicate objects in a list - * @ht: Underlying rhtable - */ -struct rhltable { - struct rhashtable ht; -}; - -/** - * struct rhashtable_walker - Hash table walker - * @list: List entry on list of walkers - * @tbl: The table that we were walking over - */ -struct rhashtable_walker { - struct list_head list; - struct bucket_table *tbl; -}; - -/** - * struct rhashtable_iter - Hash table iterator - * @ht: Table to iterate through - * @p: Current pointer - * @list: Current hash list pointer - * @walker: Associated rhashtable walker - * @slot: Current slot - * @skip: Number of entries to skip in slot - */ -struct rhashtable_iter { - struct rhashtable *ht; - struct rhash_head *p; - struct rhlist_head *list; - struct rhashtable_walker walker; - unsigned int slot; - unsigned int skip; - bool end_of_table; -}; - static inline unsigned long rht_marker(const struct rhashtable *ht, u32 hash) { return NULLS_MARKER(ht->p.nulls_base + hash); @@ -376,11 +258,6 @@ static inline int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, } #endif /* CONFIG_PROVE_LOCKING */ -int rhashtable_init(struct rhashtable *ht, - const struct rhashtable_params *params); -int rhltable_init(struct rhltable *hlt, - const struct rhashtable_params *params); - void *rhashtable_insert_slow(struct rhashtable *ht, const void *key, struct rhash_head *obj); diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index ed07e3786d98..f4272a29dc44 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -2,7 +2,7 @@ #ifndef __NET_FRAG_H__ #define __NET_FRAG_H__ -#include +#include struct netns_frags { /* sysctls */ diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index ba9fa4592f2b..0e355f4a3d76 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -4,7 +4,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index dbe1b911a24d..e0f962d27386 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -48,7 +48,7 @@ #define __sctp_structs_h__ #include -#include +#include #include /* linux/in.h needs this!! */ #include /* We get struct sockaddr_in. */ #include /* We get struct in6_addr */ diff --git a/include/net/seg6.h b/include/net/seg6.h index e029e301faa5..2567941a2f32 100644 --- a/include/net/seg6.h +++ b/include/net/seg6.h @@ -18,7 +18,7 @@ #include #include #include -#include +#include static inline void update_csum_diff4(struct sk_buff *skb, __be32 from, __be32 to) diff --git a/include/net/seg6_hmac.h b/include/net/seg6_hmac.h index 69c3a106056b..7fda469e2758 100644 --- a/include/net/seg6_hmac.h +++ b/include/net/seg6_hmac.h @@ -22,7 +22,7 @@ #include #include #include -#include +#include #define SEG6_HMAC_MAX_DIGESTSIZE 160 #define SEG6_HMAC_RING_SIZE 256 diff --git a/ipc/msg.c b/ipc/msg.c index 3b6545302598..203281198079 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include diff --git a/ipc/sem.c b/ipc/sem.c index 5af1943ad782..29c0347ef11d 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -86,6 +86,7 @@ #include #include #include +#include #include #include "util.h" diff --git a/ipc/shm.c b/ipc/shm.c index 051a3e1fb8df..d4daf78df6da 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -43,6 +43,7 @@ #include #include #include +#include #include diff --git a/ipc/util.c b/ipc/util.c index 4e81182fa0ac..fdffff41f65b 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -63,6 +63,7 @@ #include #include #include +#include #include diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 9427b5766134..c9fafea7dc6e 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -28,6 +28,7 @@ #include #include #include +#include #define HASH_DEFAULT_SIZE 64UL #define HASH_MIN_SIZE 4U diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index c9e35b81d093..316518f87294 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 9f79b9803a16..82f914122f1b 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -60,6 +60,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c index cafb0506c8c9..1ad9aa62a97b 100644 --- a/net/ipv4/ipmr_base.c +++ b/net/ipv4/ipmr_base.c @@ -2,6 +2,7 @@ * Common logic shared by IPv4 [ipmr] and IPv6 [ip6mr] implementation */ +#include #include /* Sets everything common except 'dev', since that is done under locking */ diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 0d0f0053bb11..d0b7e0249c13 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c index 0fdf2a55e746..8d0ba757a46c 100644 --- a/net/ipv6/seg6.c +++ b/net/ipv6/seg6.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c index 33fb35cbfac1..b1791129a875 100644 --- a/net/ipv6/seg6_hmac.c +++ b/net/ipv6/seg6_hmac.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 896d4a36081d..3f211e1025c1 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include diff --git a/net/sctp/input.c b/net/sctp/input.c index ba8a6e6c36fa..9bbc5f92c941 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -56,6 +56,7 @@ #include #include #include +#include /* Forward declarations for internal helpers. */ static int sctp_rcv_ootb(struct sk_buff *); diff --git a/net/sctp/socket.c b/net/sctp/socket.c index d20f7addee19..0e91e83eea5a 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -66,6 +66,7 @@ #include #include #include +#include #include #include -- cgit v1.2.3-59-g8ed1b From 9f9a707738aa7a8b9f78a641b83927ada256a626 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 18 Jun 2018 12:52:50 +1000 Subject: rhashtable: remove nulls_base and related code. This "feature" is unused, undocumented, and untested and so doesn't really belong. A patch is under development to properly implement support for detecting when a search gets diverted down a different chain, which the common purpose of nulls markers. This patch actually fixes a bug too. The table resizing allows a table to grow to 2^31 buckets, but the hash is truncated to 27 bits - any growth beyond 2^27 is wasteful an ineffective. This patch results in NULLS_MARKER(0) being used for all chains, and leaves the use of rht_is_a_null() to test for it. Acked-by: Herbert Xu Signed-off-by: NeilBrown Signed-off-by: David S. Miller --- include/linux/rhashtable-types.h | 2 -- include/linux/rhashtable.h | 33 +++------------------------------ lib/rhashtable.c | 8 -------- lib/test_rhashtable.c | 5 +---- net/core/xdp.c | 4 ++-- 5 files changed, 6 insertions(+), 46 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rhashtable-types.h b/include/linux/rhashtable-types.h index 9740063ff13b..763d613ce2c2 100644 --- a/include/linux/rhashtable-types.h +++ b/include/linux/rhashtable-types.h @@ -50,7 +50,6 @@ typedef int (*rht_obj_cmpfn_t)(struct rhashtable_compare_arg *arg, * @min_size: Minimum size while shrinking * @locks_mul: Number of bucket locks to allocate per cpu (default: 32) * @automatic_shrinking: Enable automatic shrinking of tables - * @nulls_base: Base value to generate nulls marker * @hashfn: Hash function (default: jhash2 if !(key_len % 4), or jhash) * @obj_hashfn: Function to hash object * @obj_cmpfn: Function to compare key with object @@ -64,7 +63,6 @@ struct rhashtable_params { u16 min_size; bool automatic_shrinking; u8 locks_mul; - u32 nulls_base; rht_hashfn_t hashfn; rht_obj_hashfn_t obj_hashfn; rht_obj_cmpfn_t obj_cmpfn; diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 48754ab07cdf..d9f719af7936 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -28,25 +28,8 @@ #include /* * The end of the chain is marked with a special nulls marks which has - * the following format: - * - * +-------+-----------------------------------------------------+-+ - * | Base | Hash |1| - * +-------+-----------------------------------------------------+-+ - * - * Base (4 bits) : Reserved to distinguish between multiple tables. - * Specified via &struct rhashtable_params.nulls_base. - * Hash (27 bits): Full hash (unmasked) of first element added to bucket - * 1 (1 bit) : Nulls marker (always set) - * - * The remaining bits of the next pointer remain unused for now. + * the least significant bit set. */ -#define RHT_BASE_BITS 4 -#define RHT_HASH_BITS 27 -#define RHT_BASE_SHIFT RHT_HASH_BITS - -/* Base bits plus 1 bit for nulls marker */ -#define RHT_HASH_RESERVED_SPACE (RHT_BASE_BITS + 1) /* Maximum chain length before rehash * @@ -92,24 +75,14 @@ struct bucket_table { struct rhash_head __rcu *buckets[] ____cacheline_aligned_in_smp; }; -static inline unsigned long rht_marker(const struct rhashtable *ht, u32 hash) -{ - return NULLS_MARKER(ht->p.nulls_base + hash); -} - #define INIT_RHT_NULLS_HEAD(ptr, ht, hash) \ - ((ptr) = (typeof(ptr)) rht_marker(ht, hash)) + ((ptr) = (typeof(ptr)) NULLS_MARKER(0)) static inline bool rht_is_a_nulls(const struct rhash_head *ptr) { return ((unsigned long) ptr & 1); } -static inline unsigned long rht_get_nulls_value(const struct rhash_head *ptr) -{ - return ((unsigned long) ptr) >> 1; -} - static inline void *rht_obj(const struct rhashtable *ht, const struct rhash_head *he) { @@ -119,7 +92,7 @@ static inline void *rht_obj(const struct rhashtable *ht, static inline unsigned int rht_bucket_index(const struct bucket_table *tbl, unsigned int hash) { - return (hash >> RHT_HASH_RESERVED_SPACE) & (tbl->size - 1); + return hash & (tbl->size - 1); } static inline unsigned int rht_key_get_hash(struct rhashtable *ht, diff --git a/lib/rhashtable.c b/lib/rhashtable.c index c9fafea7dc6e..688693c919be 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -995,7 +995,6 @@ static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed) * .key_offset = offsetof(struct test_obj, key), * .key_len = sizeof(int), * .hashfn = jhash, - * .nulls_base = (1U << RHT_BASE_SHIFT), * }; * * Configuration Example 2: Variable length keys @@ -1029,9 +1028,6 @@ int rhashtable_init(struct rhashtable *ht, (params->obj_hashfn && !params->obj_cmpfn)) return -EINVAL; - if (params->nulls_base && params->nulls_base < (1U << RHT_BASE_SHIFT)) - return -EINVAL; - memset(ht, 0, sizeof(*ht)); mutex_init(&ht->mutex); spin_lock_init(&ht->lock); @@ -1096,10 +1092,6 @@ int rhltable_init(struct rhltable *hlt, const struct rhashtable_params *params) { int err; - /* No rhlist NULLs marking for now. */ - if (params->nulls_base) - return -EINVAL; - err = rhashtable_init(&hlt->ht, params); hlt->ht.rhlist = true; return err; diff --git a/lib/test_rhashtable.c b/lib/test_rhashtable.c index 6ca59ffcacbe..82ac39ce5310 100644 --- a/lib/test_rhashtable.c +++ b/lib/test_rhashtable.c @@ -83,7 +83,7 @@ static u32 my_hashfn(const void *data, u32 len, u32 seed) { const struct test_obj_rhl *obj = data; - return (obj->value.id % 10) << RHT_HASH_RESERVED_SPACE; + return (obj->value.id % 10); } static int my_cmpfn(struct rhashtable_compare_arg *arg, const void *obj) @@ -99,7 +99,6 @@ static struct rhashtable_params test_rht_params = { .key_offset = offsetof(struct test_obj, value), .key_len = sizeof(struct test_obj_val), .hashfn = jhash, - .nulls_base = (3U << RHT_BASE_SHIFT), }; static struct rhashtable_params test_rht_params_dup = { @@ -296,8 +295,6 @@ static int __init test_rhltable(unsigned int entries) if (!obj_in_table) goto out_free; - /* nulls_base not supported in rhlist interface */ - test_rht_params.nulls_base = 0; err = rhltable_init(&rhlt, &test_rht_params); if (WARN_ON(err)) goto out_free; diff --git a/net/core/xdp.c b/net/core/xdp.c index 9d1f22072d5d..31c58719b5a9 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -45,8 +45,8 @@ static u32 xdp_mem_id_hashfn(const void *data, u32 len, u32 seed) BUILD_BUG_ON(FIELD_SIZEOF(struct xdp_mem_allocator, mem.id) != sizeof(u32)); - /* Use cyclic increasing ID as direct hash key, see rht_bucket_index */ - return key << RHT_HASH_RESERVED_SPACE; + /* Use cyclic increasing ID as direct hash key */ + return key; } static int xdp_mem_id_cmp(struct rhashtable_compare_arg *arg, -- cgit v1.2.3-59-g8ed1b From 9b4f64a227b6f462482a8cc68c7134dc6e26f1c1 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 18 Jun 2018 12:52:50 +1000 Subject: rhashtable: simplify INIT_RHT_NULLS_HEAD() The 'ht' and 'hash' arguments to INIT_RHT_NULLS_HEAD() are no longer used - so drop them. This allows us to also remove the nhash argument from nested_table_alloc(). Acked-by: Herbert Xu Signed-off-by: NeilBrown Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 2 +- lib/rhashtable.c | 15 ++++++--------- 2 files changed, 7 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index d9f719af7936..3f3a182bd0b4 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -75,7 +75,7 @@ struct bucket_table { struct rhash_head __rcu *buckets[] ____cacheline_aligned_in_smp; }; -#define INIT_RHT_NULLS_HEAD(ptr, ht, hash) \ +#define INIT_RHT_NULLS_HEAD(ptr) \ ((ptr) = (typeof(ptr)) NULLS_MARKER(0)) static inline bool rht_is_a_nulls(const struct rhash_head *ptr) diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 688693c919be..a81cd27d518c 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -116,8 +116,7 @@ static void bucket_table_free_rcu(struct rcu_head *head) static union nested_table *nested_table_alloc(struct rhashtable *ht, union nested_table __rcu **prev, - unsigned int shifted, - unsigned int nhash) + unsigned int shifted) { union nested_table *ntbl; int i; @@ -130,8 +129,7 @@ static union nested_table *nested_table_alloc(struct rhashtable *ht, if (ntbl && shifted) { for (i = 0; i < PAGE_SIZE / sizeof(ntbl[0].bucket); i++) - INIT_RHT_NULLS_HEAD(ntbl[i].bucket, ht, - (i << shifted) | nhash); + INIT_RHT_NULLS_HEAD(ntbl[i].bucket); } rcu_assign_pointer(*prev, ntbl); @@ -157,7 +155,7 @@ static struct bucket_table *nested_bucket_table_alloc(struct rhashtable *ht, return NULL; if (!nested_table_alloc(ht, (union nested_table __rcu **)tbl->buckets, - 0, 0)) { + 0)) { kfree(tbl); return NULL; } @@ -207,7 +205,7 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, tbl->hash_rnd = get_random_u32(); for (i = 0; i < nbuckets; i++) - INIT_RHT_NULLS_HEAD(tbl->buckets[i], ht, i); + INIT_RHT_NULLS_HEAD(tbl->buckets[i]); return tbl; } @@ -1217,7 +1215,7 @@ struct rhash_head __rcu **rht_bucket_nested_insert(struct rhashtable *ht, nhash = index; shifted = tbl->nest; ntbl = nested_table_alloc(ht, &ntbl[index].table, - size <= (1 << shift) ? shifted : 0, nhash); + size <= (1 << shift) ? shifted : 0); while (ntbl && size > (1 << shift)) { index = hash & ((1 << shift) - 1); @@ -1226,8 +1224,7 @@ struct rhash_head __rcu **rht_bucket_nested_insert(struct rhashtable *ht, nhash |= index << shifted; shifted += shift; ntbl = nested_table_alloc(ht, &ntbl[index].table, - size <= (1 << shift) ? shifted : 0, - nhash); + size <= (1 << shift) ? shifted : 0); } if (!ntbl) -- cgit v1.2.3-59-g8ed1b From c0690016a73fe6bd456887bbbe6e10c7f0096554 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 18 Jun 2018 12:52:50 +1000 Subject: rhashtable: clean up dereference of ->future_tbl. Using rht_dereference_bucket() to dereference ->future_tbl looks like a type error, and could be confusing. Using rht_dereference_rcu() to test a pointer for NULL adds an unnecessary barrier - rcu_access_pointer() is preferred for NULL tests when no lock is held. This uses 3 different ways to access ->future_tbl. - if we know the mutex is held, use rht_dereference() - if we don't hold the mutex, and are only testing for NULL, use rcu_access_pointer() - otherwise (using RCU protection for true dereference), use rht_dereference_rcu(). Note that this includes a simplification of the call to rhashtable_last_table() - we don't do an extra dereference before the call any more. Acked-by: Herbert Xu Signed-off-by: NeilBrown Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 2 +- lib/rhashtable.c | 9 ++++----- 2 files changed, 5 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 3f3a182bd0b4..eb7111039247 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -595,7 +595,7 @@ static inline void *__rhashtable_insert_fast( lock = rht_bucket_lock(tbl, hash); spin_lock_bh(lock); - if (unlikely(rht_dereference_bucket(tbl->future_tbl, tbl, hash))) { + if (unlikely(rcu_access_pointer(tbl->future_tbl))) { slow_path: spin_unlock_bh(lock); rcu_read_unlock(); diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 52ec83212856..0e04947b7e0c 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -226,8 +226,7 @@ static struct bucket_table *rhashtable_last_table(struct rhashtable *ht, static int rhashtable_rehash_one(struct rhashtable *ht, unsigned int old_hash) { struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); - struct bucket_table *new_tbl = rhashtable_last_table(ht, - rht_dereference_rcu(old_tbl->future_tbl, ht)); + struct bucket_table *new_tbl = rhashtable_last_table(ht, old_tbl); struct rhash_head __rcu **pprev = rht_bucket_var(old_tbl, old_hash); int err = -EAGAIN; struct rhash_head *head, *next, *entry; @@ -467,7 +466,7 @@ static int rhashtable_insert_rehash(struct rhashtable *ht, fail: /* Do not fail the insert if someone else did a rehash. */ - if (likely(rcu_dereference_raw(tbl->future_tbl))) + if (likely(rcu_access_pointer(tbl->future_tbl))) return 0; /* Schedule async rehash to retry allocation in process context. */ @@ -540,7 +539,7 @@ static struct bucket_table *rhashtable_insert_one(struct rhashtable *ht, if (PTR_ERR(data) != -EAGAIN && PTR_ERR(data) != -ENOENT) return ERR_CAST(data); - new_tbl = rcu_dereference(tbl->future_tbl); + new_tbl = rht_dereference_rcu(tbl->future_tbl, ht); if (new_tbl) return new_tbl; @@ -599,7 +598,7 @@ static void *rhashtable_try_insert(struct rhashtable *ht, const void *key, break; spin_unlock_bh(lock); - tbl = rcu_dereference(tbl->future_tbl); + tbl = rht_dereference_rcu(tbl->future_tbl, ht); } data = rhashtable_lookup_one(ht, tbl, hash, key, obj); -- cgit v1.2.3-59-g8ed1b From 3f6c65d6255a872846c44182c82c78d3dc6239f5 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Tue, 19 Jun 2018 21:42:50 -0700 Subject: tcp: ignore rcv_rtt sample with old ts ecr value When receiving multiple packets with the same ts ecr value, only try to compute rcv_rtt sample with the earliest received packet. This is because the rcv_rtt calculated by later received packets could possibly include long idle time or other types of delay. For example: (1) server sends last packet of reply with TS val V1 (2) client ACKs last packet of reply with TS ecr V1 (3) long idle time passes (4) client sends next request data packet with TS ecr V1 (again!) At this time, the rcv_rtt computed on server with TS ecr V1 will be inflated with the idle time and should get ignored. Signed-off-by: Wei Wang Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/tcp.h | 1 + net/ipv4/tcp.c | 1 + net/ipv4/tcp_input.c | 14 +++++++++++--- 3 files changed, 13 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 72705eaf4b84..3dbea6610304 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -350,6 +350,7 @@ struct tcp_sock { #endif /* Receiver side RTT estimation */ + u32 rcv_rtt_last_tsecr; struct { u32 rtt_us; u32 seq; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 141acd92e58a..47c45d5be9f9 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2563,6 +2563,7 @@ int tcp_disconnect(struct sock *sk, int flags) sk->sk_shutdown = 0; sock_reset_flag(sk, SOCK_DONE); tp->srtt_us = 0; + tp->rcv_rtt_last_tsecr = 0; tp->write_seq += tp->max_window + 2; if (tp->write_seq == 0) tp->write_seq = 1; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 355d3dffd021..76ca88f63b70 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -582,9 +582,12 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, { struct tcp_sock *tp = tcp_sk(sk); - if (tp->rx_opt.rcv_tsecr && - (TCP_SKB_CB(skb)->end_seq - - TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss)) { + if (tp->rx_opt.rcv_tsecr == tp->rcv_rtt_last_tsecr) + return; + tp->rcv_rtt_last_tsecr = tp->rx_opt.rcv_tsecr; + + if (TCP_SKB_CB(skb)->end_seq - + TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss) { u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr; u32 delta_us; @@ -5475,6 +5478,11 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) tcp_ack(sk, skb, 0); __kfree_skb(skb); tcp_data_snd_check(sk); + /* When receiving pure ack in fast path, update + * last ts ecr directly instead of calling + * tcp_rcv_rtt_measure_ts() + */ + tp->rcv_rtt_last_tsecr = tp->rx_opt.rcv_tsecr; return; } else { /* Header too small */ TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); -- cgit v1.2.3-59-g8ed1b From d4546c2509b1e9cd082e3682dcec98472e37ee5a Mon Sep 17 00:00:00 2001 From: David Miller Date: Sun, 24 Jun 2018 14:13:49 +0900 Subject: net: Convert GRO SKB handling to list_head. Manage pending per-NAPI GRO packets via list_head. Return an SKB pointer from the GRO receive handlers. When GRO receive handlers return non-NULL, it means that this SKB needs to be completed at this time and removed from the NAPI queue. Several operations are greatly simplified by this transformation, especially timing out the oldest SKB in the list when gro_count exceeds MAX_GRO_SKBS, and napi_gro_flush() which walks the queue in reverse order. Signed-off-by: David S. Miller --- drivers/net/geneve.c | 11 ++++---- drivers/net/vxlan.c | 11 ++++---- include/linux/etherdevice.h | 3 +- include/linux/netdevice.h | 32 ++++++++++----------- include/linux/skbuff.h | 3 +- include/linux/udp.h | 4 +-- include/net/inet_common.h | 2 +- include/net/tcp.h | 2 +- include/net/udp.h | 4 +-- include/net/udp_tunnel.h | 6 ++-- net/8021q/vlan.c | 13 +++++---- net/core/dev.c | 68 +++++++++++++++++++-------------------------- net/core/skbuff.c | 4 +-- net/ethernet/eth.c | 12 ++++---- net/ipv4/af_inet.c | 12 ++++---- net/ipv4/esp4_offload.c | 4 +-- net/ipv4/fou.c | 20 ++++++------- net/ipv4/gre_offload.c | 8 +++--- net/ipv4/tcp_offload.c | 14 +++++----- net/ipv4/udp_offload.c | 13 +++++---- net/ipv6/esp6_offload.c | 4 +-- net/ipv6/ip6_offload.c | 16 +++++------ net/ipv6/tcpv6_offload.c | 4 +-- net/ipv6/udp_offload.c | 4 +-- 24 files changed, 133 insertions(+), 141 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 750eaa53bf0c..3e94375b9b01 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -418,11 +418,12 @@ static int geneve_hlen(struct genevehdr *gh) return sizeof(*gh) + gh->opt_len * 4; } -static struct sk_buff **geneve_gro_receive(struct sock *sk, - struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *geneve_gro_receive(struct sock *sk, + struct list_head *head, + struct sk_buff *skb) { - struct sk_buff *p, **pp = NULL; + struct sk_buff *pp = NULL; + struct sk_buff *p; struct genevehdr *gh, *gh2; unsigned int hlen, gh_len, off_gnv; const struct packet_offload *ptype; @@ -449,7 +450,7 @@ static struct sk_buff **geneve_gro_receive(struct sock *sk, goto out; } - for (p = *head; p; p = p->next) { + list_for_each_entry(p, head, list) { if (!NAPI_GRO_CB(p)->same_flow) continue; diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index aee0e60471f1..cc14e0cd5647 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -568,11 +568,12 @@ static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb, return vh; } -static struct sk_buff **vxlan_gro_receive(struct sock *sk, - struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *vxlan_gro_receive(struct sock *sk, + struct list_head *head, + struct sk_buff *skb) { - struct sk_buff *p, **pp = NULL; + struct sk_buff *pp = NULL; + struct sk_buff *p; struct vxlanhdr *vh, *vh2; unsigned int hlen, off_vx; int flush = 1; @@ -607,7 +608,7 @@ static struct sk_buff **vxlan_gro_receive(struct sock *sk, skb_gro_pull(skb, sizeof(struct vxlanhdr)); /* pull vxlan header */ - for (p = *head; p; p = p->next) { + list_for_each_entry(p, head, list) { if (!NAPI_GRO_CB(p)->same_flow) continue; diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index 79563840c295..572e11bb8696 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -59,8 +59,7 @@ struct net_device *devm_alloc_etherdev_mqs(struct device *dev, int sizeof_priv, unsigned int rxqs); #define devm_alloc_etherdev(dev, sizeof_priv) devm_alloc_etherdev_mqs(dev, sizeof_priv, 1, 1) -struct sk_buff **eth_gro_receive(struct sk_buff **head, - struct sk_buff *skb); +struct sk_buff *eth_gro_receive(struct list_head *head, struct sk_buff *skb); int eth_gro_complete(struct sk_buff *skb, int nhoff); /* Reserved Ethernet Addresses per IEEE 802.1Q */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3ec9850c7936..f176d9873910 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -322,7 +322,7 @@ struct napi_struct { int poll_owner; #endif struct net_device *dev; - struct sk_buff *gro_list; + struct list_head gro_list; struct sk_buff *skb; struct hrtimer timer; struct list_head dev_list; @@ -2255,10 +2255,10 @@ static inline int gro_recursion_inc_test(struct sk_buff *skb) return ++NAPI_GRO_CB(skb)->recursion_counter == GRO_RECURSION_LIMIT; } -typedef struct sk_buff **(*gro_receive_t)(struct sk_buff **, struct sk_buff *); -static inline struct sk_buff **call_gro_receive(gro_receive_t cb, - struct sk_buff **head, - struct sk_buff *skb) +typedef struct sk_buff *(*gro_receive_t)(struct list_head *, struct sk_buff *); +static inline struct sk_buff *call_gro_receive(gro_receive_t cb, + struct list_head *head, + struct sk_buff *skb) { if (unlikely(gro_recursion_inc_test(skb))) { NAPI_GRO_CB(skb)->flush |= 1; @@ -2268,12 +2268,12 @@ static inline struct sk_buff **call_gro_receive(gro_receive_t cb, return cb(head, skb); } -typedef struct sk_buff **(*gro_receive_sk_t)(struct sock *, struct sk_buff **, - struct sk_buff *); -static inline struct sk_buff **call_gro_receive_sk(gro_receive_sk_t cb, - struct sock *sk, - struct sk_buff **head, - struct sk_buff *skb) +typedef struct sk_buff *(*gro_receive_sk_t)(struct sock *, struct list_head *, + struct sk_buff *); +static inline struct sk_buff *call_gro_receive_sk(gro_receive_sk_t cb, + struct sock *sk, + struct list_head *head, + struct sk_buff *skb) { if (unlikely(gro_recursion_inc_test(skb))) { NAPI_GRO_CB(skb)->flush |= 1; @@ -2299,8 +2299,8 @@ struct packet_type { struct offload_callbacks { struct sk_buff *(*gso_segment)(struct sk_buff *skb, netdev_features_t features); - struct sk_buff **(*gro_receive)(struct sk_buff **head, - struct sk_buff *skb); + struct sk_buff *(*gro_receive)(struct list_head *head, + struct sk_buff *skb); int (*gro_complete)(struct sk_buff *skb, int nhoff); }; @@ -2568,7 +2568,7 @@ struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex); struct net_device *dev_get_by_napi_id(unsigned int napi_id); int netdev_get_name(struct net *net, char *name, int ifindex); int dev_restart(struct net_device *dev); -int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb); +int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb); static inline unsigned int skb_gro_offset(const struct sk_buff *skb) { @@ -2784,13 +2784,13 @@ static inline void skb_gro_remcsum_cleanup(struct sk_buff *skb, } #ifdef CONFIG_XFRM_OFFLOAD -static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff **pp, int flush) +static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff *pp, int flush) { if (PTR_ERR(pp) != -EINPROGRESS) NAPI_GRO_CB(skb)->flush |= flush; } #else -static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff **pp, int flush) +static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff *pp, int flush) { NAPI_GRO_CB(skb)->flush |= flush; } diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index c86885954994..7ccc601b55d9 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -677,7 +677,8 @@ struct sk_buff { int ip_defrag_offset; }; }; - struct rb_node rbnode; /* used in netem & tcp stack */ + struct rb_node rbnode; /* used in netem & tcp stack */ + struct list_head list; }; struct sock *sk; diff --git a/include/linux/udp.h b/include/linux/udp.h index ca840345571b..320d49d85484 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -74,8 +74,8 @@ struct udp_sock { void (*encap_destroy)(struct sock *sk); /* GRO functions for UDP socket */ - struct sk_buff ** (*gro_receive)(struct sock *sk, - struct sk_buff **head, + struct sk_buff * (*gro_receive)(struct sock *sk, + struct list_head *head, struct sk_buff *skb); int (*gro_complete)(struct sock *sk, struct sk_buff *skb, diff --git a/include/net/inet_common.h b/include/net/inet_common.h index 384b90c62c0b..3ca969cbd161 100644 --- a/include/net/inet_common.h +++ b/include/net/inet_common.h @@ -43,7 +43,7 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family, int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len); -struct sk_buff **inet_gro_receive(struct sk_buff **head, struct sk_buff *skb); +struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb); int inet_gro_complete(struct sk_buff *skb, int nhoff); struct sk_buff *inet_gso_segment(struct sk_buff *skb, netdev_features_t features); diff --git a/include/net/tcp.h b/include/net/tcp.h index 822ee49ed0f9..402a88b0e8a8 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1788,7 +1788,7 @@ void tcp_v4_destroy_sock(struct sock *sk); struct sk_buff *tcp_gso_segment(struct sk_buff *skb, netdev_features_t features); -struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb); +struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb); int tcp_gro_complete(struct sk_buff *skb); void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr); diff --git a/include/net/udp.h b/include/net/udp.h index b1ea8b0f5e6a..5723c6128ae4 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -170,8 +170,8 @@ static inline void udp_csum_pull_header(struct sk_buff *skb) typedef struct sock *(*udp_lookup_t)(struct sk_buff *skb, __be16 sport, __be16 dport); -struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb, - struct udphdr *uh, udp_lookup_t lookup); +struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, + struct udphdr *uh, udp_lookup_t lookup); int udp_gro_complete(struct sk_buff *skb, int nhoff, udp_lookup_t lookup); struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index b95a6927c718..fe680ab6b15a 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -65,9 +65,9 @@ static inline int udp_sock_create(struct net *net, typedef int (*udp_tunnel_encap_rcv_t)(struct sock *sk, struct sk_buff *skb); typedef void (*udp_tunnel_encap_destroy_t)(struct sock *sk); -typedef struct sk_buff **(*udp_tunnel_gro_receive_t)(struct sock *sk, - struct sk_buff **head, - struct sk_buff *skb); +typedef struct sk_buff *(*udp_tunnel_gro_receive_t)(struct sock *sk, + struct list_head *head, + struct sk_buff *skb); typedef int (*udp_tunnel_gro_complete_t)(struct sock *sk, struct sk_buff *skb, int nhoff); diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 73a65789271b..99141986efa0 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -647,13 +647,14 @@ out: return err; } -static struct sk_buff **vlan_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *vlan_gro_receive(struct list_head *head, + struct sk_buff *skb) { - struct sk_buff *p, **pp = NULL; - struct vlan_hdr *vhdr; - unsigned int hlen, off_vlan; const struct packet_offload *ptype; + unsigned int hlen, off_vlan; + struct sk_buff *pp = NULL; + struct vlan_hdr *vhdr; + struct sk_buff *p; __be16 type; int flush = 1; @@ -675,7 +676,7 @@ static struct sk_buff **vlan_gro_receive(struct sk_buff **head, flush = 0; - for (p = *head; p; p = p->next) { + list_for_each_entry(p, head, list) { struct vlan_hdr *vhdr2; if (!NAPI_GRO_CB(p)->same_flow) diff --git a/net/core/dev.c b/net/core/dev.c index a5aa1c7444e6..aa61b9344b46 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4881,36 +4881,25 @@ out: */ void napi_gro_flush(struct napi_struct *napi, bool flush_old) { - struct sk_buff *skb, *prev = NULL; - - /* scan list and build reverse chain */ - for (skb = napi->gro_list; skb != NULL; skb = skb->next) { - skb->prev = prev; - prev = skb; - } - - for (skb = prev; skb; skb = prev) { - skb->next = NULL; + struct sk_buff *skb, *p; + list_for_each_entry_safe_reverse(skb, p, &napi->gro_list, list) { if (flush_old && NAPI_GRO_CB(skb)->age == jiffies) return; - - prev = skb->prev; + list_del_init(&skb->list); napi_gro_complete(skb); napi->gro_count--; } - - napi->gro_list = NULL; } EXPORT_SYMBOL(napi_gro_flush); static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb) { - struct sk_buff *p; unsigned int maclen = skb->dev->hard_header_len; u32 hash = skb_get_hash_raw(skb); + struct sk_buff *p; - for (p = napi->gro_list; p; p = p->next) { + list_for_each_entry(p, &napi->gro_list, list) { unsigned long diffs; NAPI_GRO_CB(p)->flush = 0; @@ -4977,12 +4966,12 @@ static void gro_pull_from_frag0(struct sk_buff *skb, int grow) static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { - struct sk_buff **pp = NULL; + struct list_head *head = &offload_base; struct packet_offload *ptype; __be16 type = skb->protocol; - struct list_head *head = &offload_base; - int same_flow; + struct sk_buff *pp = NULL; enum gro_result ret; + int same_flow; int grow; if (netif_elide_gro(skb->dev)) @@ -5039,11 +5028,8 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED; if (pp) { - struct sk_buff *nskb = *pp; - - *pp = nskb->next; - nskb->next = NULL; - napi_gro_complete(nskb); + list_del_init(&pp->list); + napi_gro_complete(pp); napi->gro_count--; } @@ -5054,15 +5040,10 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff goto normal; if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) { - struct sk_buff *nskb = napi->gro_list; + struct sk_buff *nskb; - /* locate the end of the list to select the 'oldest' flow */ - while (nskb->next) { - pp = &nskb->next; - nskb = *pp; - } - *pp = NULL; - nskb->next = NULL; + nskb = list_last_entry(&napi->gro_list, struct sk_buff, list); + list_del(&nskb->list); napi_gro_complete(nskb); } else { napi->gro_count++; @@ -5071,8 +5052,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff NAPI_GRO_CB(skb)->age = jiffies; NAPI_GRO_CB(skb)->last = skb; skb_shinfo(skb)->gso_size = skb_gro_len(skb); - skb->next = napi->gro_list; - napi->gro_list = skb; + list_add(&skb->list, &napi->gro_list); ret = GRO_HELD; pull: @@ -5478,7 +5458,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done) NAPIF_STATE_IN_BUSY_POLL))) return false; - if (n->gro_list) { + if (!list_empty(&n->gro_list)) { unsigned long timeout = 0; if (work_done) @@ -5687,7 +5667,7 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) /* Note : we use a relaxed variant of napi_schedule_prep() not setting * NAPI_STATE_MISSED, since we do not react to a device IRQ. */ - if (napi->gro_list && !napi_disable_pending(napi) && + if (!list_empty(&napi->gro_list) && !napi_disable_pending(napi) && !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) __napi_schedule_irqoff(napi); @@ -5701,7 +5681,7 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi, hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); napi->timer.function = napi_watchdog; napi->gro_count = 0; - napi->gro_list = NULL; + INIT_LIST_HEAD(&napi->gro_list); napi->skb = NULL; napi->poll = poll; if (weight > NAPI_POLL_WEIGHT) @@ -5734,6 +5714,14 @@ void napi_disable(struct napi_struct *n) } EXPORT_SYMBOL(napi_disable); +static void gro_list_free(struct list_head *head) +{ + struct sk_buff *skb, *p; + + list_for_each_entry_safe(skb, p, head, list) + kfree_skb(skb); +} + /* Must be called in process context */ void netif_napi_del(struct napi_struct *napi) { @@ -5743,8 +5731,8 @@ void netif_napi_del(struct napi_struct *napi) list_del_init(&napi->dev_list); napi_free_frags(napi); - kfree_skb_list(napi->gro_list); - napi->gro_list = NULL; + gro_list_free(&napi->gro_list); + INIT_LIST_HEAD(&napi->gro_list); napi->gro_count = 0; } EXPORT_SYMBOL(netif_napi_del); @@ -5787,7 +5775,7 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) goto out_unlock; } - if (n->gro_list) { + if (!list_empty(&n->gro_list)) { /* flush too old packets * If HZ < 1000, flush all packets. */ diff --git a/net/core/skbuff.c b/net/core/skbuff.c index c642304f178c..b1f274f22d85 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3815,14 +3815,14 @@ err: } EXPORT_SYMBOL_GPL(skb_segment); -int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) +int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) { struct skb_shared_info *pinfo, *skbinfo = skb_shinfo(skb); unsigned int offset = skb_gro_offset(skb); unsigned int headlen = skb_headlen(skb); unsigned int len = skb_gro_len(skb); - struct sk_buff *lp, *p = *head; unsigned int delta_truesize; + struct sk_buff *lp; if (unlikely(p->len + len >= 65536)) return -E2BIG; diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index ee28440f57c5..fd8faa0dfa61 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -427,13 +427,13 @@ ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len) } EXPORT_SYMBOL(sysfs_format_mac); -struct sk_buff **eth_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +struct sk_buff *eth_gro_receive(struct list_head *head, struct sk_buff *skb) { - struct sk_buff *p, **pp = NULL; - struct ethhdr *eh, *eh2; - unsigned int hlen, off_eth; const struct packet_offload *ptype; + unsigned int hlen, off_eth; + struct sk_buff *pp = NULL; + struct ethhdr *eh, *eh2; + struct sk_buff *p; __be16 type; int flush = 1; @@ -448,7 +448,7 @@ struct sk_buff **eth_gro_receive(struct sk_buff **head, flush = 0; - for (p = *head; p; p = p->next) { + list_for_each_entry(p, head, list) { if (!NAPI_GRO_CB(p)->same_flow) continue; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 15e125558c76..06b218a2870f 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1384,12 +1384,12 @@ out: } EXPORT_SYMBOL(inet_gso_segment); -struct sk_buff **inet_gro_receive(struct sk_buff **head, struct sk_buff *skb) +struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb) { const struct net_offload *ops; - struct sk_buff **pp = NULL; - struct sk_buff *p; + struct sk_buff *pp = NULL; const struct iphdr *iph; + struct sk_buff *p; unsigned int hlen; unsigned int off; unsigned int id; @@ -1425,7 +1425,7 @@ struct sk_buff **inet_gro_receive(struct sk_buff **head, struct sk_buff *skb) flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id & ~IP_DF)); id >>= 16; - for (p = *head; p; p = p->next) { + list_for_each_entry(p, head, list) { struct iphdr *iph2; u16 flush_id; @@ -1505,8 +1505,8 @@ out: } EXPORT_SYMBOL(inet_gro_receive); -static struct sk_buff **ipip_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *ipip_gro_receive(struct list_head *head, + struct sk_buff *skb) { if (NAPI_GRO_CB(skb)->encap_mark) { NAPI_GRO_CB(skb)->flush = 1; diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index 7cf755ef9efb..bbeecd13e534 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -28,8 +28,8 @@ #include #include -static struct sk_buff **esp4_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *esp4_gro_receive(struct list_head *head, + struct sk_buff *skb) { int offset = skb_gro_offset(skb); struct xfrm_offload *xo; diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 1540db65241a..efdc9e1f741e 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -224,14 +224,14 @@ drop: return 0; } -static struct sk_buff **fou_gro_receive(struct sock *sk, - struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *fou_gro_receive(struct sock *sk, + struct list_head *head, + struct sk_buff *skb) { - const struct net_offload *ops; - struct sk_buff **pp = NULL; u8 proto = fou_from_sock(sk)->protocol; const struct net_offload **offloads; + const struct net_offload *ops; + struct sk_buff *pp = NULL; /* We can clear the encap_mark for FOU as we are essentially doing * one of two possible things. We are either adding an L4 tunnel @@ -305,13 +305,13 @@ static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off, return guehdr; } -static struct sk_buff **gue_gro_receive(struct sock *sk, - struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *gue_gro_receive(struct sock *sk, + struct list_head *head, + struct sk_buff *skb) { const struct net_offload **offloads; const struct net_offload *ops; - struct sk_buff **pp = NULL; + struct sk_buff *pp = NULL; struct sk_buff *p; struct guehdr *guehdr; size_t len, optlen, hdrlen, off; @@ -397,7 +397,7 @@ static struct sk_buff **gue_gro_receive(struct sock *sk, skb_gro_pull(skb, hdrlen); - for (p = *head; p; p = p->next) { + list_for_each_entry(p, head, list) { const struct guehdr *guehdr2; if (!NAPI_GRO_CB(p)->same_flow) diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index 1859c473b21a..b9673c21be45 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -108,10 +108,10 @@ out: return segs; } -static struct sk_buff **gre_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *gre_gro_receive(struct list_head *head, + struct sk_buff *skb) { - struct sk_buff **pp = NULL; + struct sk_buff *pp = NULL; struct sk_buff *p; const struct gre_base_hdr *greh; unsigned int hlen, grehlen; @@ -182,7 +182,7 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head, null_compute_pseudo); } - for (p = *head; p; p = p->next) { + list_for_each_entry(p, head, list) { const struct gre_base_hdr *greh2; if (!NAPI_GRO_CB(p)->same_flow) diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 8cc7c3487330..f5aee641f825 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -180,9 +180,9 @@ out: return segs; } -struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb) +struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb) { - struct sk_buff **pp = NULL; + struct sk_buff *pp = NULL; struct sk_buff *p; struct tcphdr *th; struct tcphdr *th2; @@ -220,7 +220,7 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb) len = skb_gro_len(skb); flags = tcp_flag_word(th); - for (; (p = *head); head = &p->next) { + list_for_each_entry(p, head, list) { if (!NAPI_GRO_CB(p)->same_flow) continue; @@ -233,7 +233,7 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb) goto found; } - + p = NULL; goto out_check_final; found: @@ -263,7 +263,7 @@ found: flush |= (len - 1) >= mss; flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq); - if (flush || skb_gro_receive(head, skb)) { + if (flush || skb_gro_receive(p, skb)) { mss = 1; goto out_check_final; } @@ -277,7 +277,7 @@ out_check_final: TCP_FLAG_FIN)); if (p && (!NAPI_GRO_CB(skb)->same_flow || flush)) - pp = head; + pp = p; out: NAPI_GRO_CB(skb)->flush |= (flush != 0); @@ -302,7 +302,7 @@ int tcp_gro_complete(struct sk_buff *skb) } EXPORT_SYMBOL(tcp_gro_complete); -static struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) +static struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb) { /* Don't bother verifying checksum if we're going to flush anyway. */ if (!NAPI_GRO_CB(skb)->flush && diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 92dc9e5a7ff3..ac46c1c55c99 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -343,10 +343,11 @@ out: return segs; } -struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb, - struct udphdr *uh, udp_lookup_t lookup) +struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, + struct udphdr *uh, udp_lookup_t lookup) { - struct sk_buff *p, **pp = NULL; + struct sk_buff *pp = NULL; + struct sk_buff *p; struct udphdr *uh2; unsigned int off = skb_gro_offset(skb); int flush = 1; @@ -371,7 +372,7 @@ struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb, unflush: flush = 0; - for (p = *head; p; p = p->next) { + list_for_each_entry(p, head, list) { if (!NAPI_GRO_CB(p)->same_flow) continue; @@ -399,8 +400,8 @@ out: } EXPORT_SYMBOL(udp_gro_receive); -static struct sk_buff **udp4_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *udp4_gro_receive(struct list_head *head, + struct sk_buff *skb) { struct udphdr *uh = udp_gro_udphdr(skb); diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index 27f59b61f70f..ddfa533a84e5 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -49,8 +49,8 @@ static __u16 esp6_nexthdr_esp_offset(struct ipv6hdr *ipv6_hdr, int nhlen) return 0; } -static struct sk_buff **esp6_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *esp6_gro_receive(struct list_head *head, + struct sk_buff *skb) { int offset = skb_gro_offset(skb); struct xfrm_offload *xo; diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index 5b3f2f89ef41..37ff4805b20c 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -163,11 +163,11 @@ static int ipv6_exthdrs_len(struct ipv6hdr *iph, return len; } -static struct sk_buff **ipv6_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *ipv6_gro_receive(struct list_head *head, + struct sk_buff *skb) { const struct net_offload *ops; - struct sk_buff **pp = NULL; + struct sk_buff *pp = NULL; struct sk_buff *p; struct ipv6hdr *iph; unsigned int nlen; @@ -214,7 +214,7 @@ static struct sk_buff **ipv6_gro_receive(struct sk_buff **head, flush--; nlen = skb_network_header_len(skb); - for (p = *head; p; p = p->next) { + list_for_each_entry(p, head, list) { const struct ipv6hdr *iph2; __be32 first_word; /* */ @@ -263,8 +263,8 @@ out: return pp; } -static struct sk_buff **sit_ip6ip6_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *sit_ip6ip6_gro_receive(struct list_head *head, + struct sk_buff *skb) { /* Common GRO receive for SIT and IP6IP6 */ @@ -278,8 +278,8 @@ static struct sk_buff **sit_ip6ip6_gro_receive(struct sk_buff **head, return ipv6_gro_receive(head, skb); } -static struct sk_buff **ip4ip6_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *ip4ip6_gro_receive(struct list_head *head, + struct sk_buff *skb) { /* Common GRO receive for SIT and IP6IP6 */ diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c index 278e49cd67d4..e72947c99454 100644 --- a/net/ipv6/tcpv6_offload.c +++ b/net/ipv6/tcpv6_offload.c @@ -15,8 +15,8 @@ #include #include "ip6_offload.h" -static struct sk_buff **tcp6_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *tcp6_gro_receive(struct list_head *head, + struct sk_buff *skb) { /* Don't bother verifying checksum if we're going to flush anyway. */ if (!NAPI_GRO_CB(skb)->flush && diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 03a2ff3fe1e6..95dee9ca8d22 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -114,8 +114,8 @@ out: return segs; } -static struct sk_buff **udp6_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *udp6_gro_receive(struct list_head *head, + struct sk_buff *skb) { struct udphdr *uh = udp_gro_udphdr(skb); -- cgit v1.2.3-59-g8ed1b From 07d78363dcffd9cb1bf6f06a6cac0e0847f3c1de Mon Sep 17 00:00:00 2001 From: David Miller Date: Sun, 24 Jun 2018 14:14:02 +0900 Subject: net: Convert NAPI gro list into a small hash table. Improve the performance of GRO receive by splitting flows into multiple hash chains. Suggested-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 +- net/core/dev.c | 105 ++++++++++++++++++++++++++++++++++------------ 2 files changed, 81 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f176d9873910..c6b377a15869 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -305,6 +305,7 @@ int __init netdev_boot_setup(char *str); /* * Structure for NAPI scheduling similar to tasklet but with weighting */ +#define GRO_HASH_BUCKETS 8 struct napi_struct { /* The poll_list must only be managed by the entity which * changes the state of the NAPI_STATE_SCHED bit. This means @@ -322,7 +323,7 @@ struct napi_struct { int poll_owner; #endif struct net_device *dev; - struct list_head gro_list; + struct list_head gro_hash[GRO_HASH_BUCKETS]; struct sk_buff *skb; struct hrtimer timer; struct list_head dev_list; diff --git a/net/core/dev.c b/net/core/dev.c index aa61b9344b46..dffed642e686 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4875,15 +4875,12 @@ out: return netif_receive_skb_internal(skb); } -/* napi->gro_list contains packets ordered by age. - * youngest packets at the head of it. - * Complete skbs in reverse order to reduce latencies. - */ -void napi_gro_flush(struct napi_struct *napi, bool flush_old) +static void __napi_gro_flush_chain(struct napi_struct *napi, struct list_head *head, + bool flush_old) { struct sk_buff *skb, *p; - list_for_each_entry_safe_reverse(skb, p, &napi->gro_list, list) { + list_for_each_entry_safe_reverse(skb, p, head, list) { if (flush_old && NAPI_GRO_CB(skb)->age == jiffies) return; list_del_init(&skb->list); @@ -4891,15 +4888,33 @@ void napi_gro_flush(struct napi_struct *napi, bool flush_old) napi->gro_count--; } } + +/* napi->gro_hash contains packets ordered by age. + * youngest packets at the head of it. + * Complete skbs in reverse order to reduce latencies. + */ +void napi_gro_flush(struct napi_struct *napi, bool flush_old) +{ + int i; + + for (i = 0; i < GRO_HASH_BUCKETS; i++) { + struct list_head *head = &napi->gro_hash[i]; + + __napi_gro_flush_chain(napi, head, flush_old); + } +} EXPORT_SYMBOL(napi_gro_flush); -static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb) +static struct list_head *gro_list_prepare(struct napi_struct *napi, + struct sk_buff *skb) { unsigned int maclen = skb->dev->hard_header_len; u32 hash = skb_get_hash_raw(skb); + struct list_head *head; struct sk_buff *p; - list_for_each_entry(p, &napi->gro_list, list) { + head = &napi->gro_hash[hash & (GRO_HASH_BUCKETS - 1)]; + list_for_each_entry(p, head, list) { unsigned long diffs; NAPI_GRO_CB(p)->flush = 0; @@ -4922,6 +4937,8 @@ static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb) maclen); NAPI_GRO_CB(p)->same_flow = !diffs; } + + return head; } static void skb_gro_reset_offset(struct sk_buff *skb) @@ -4964,11 +4981,45 @@ static void gro_pull_from_frag0(struct sk_buff *skb, int grow) } } +static void gro_flush_oldest(struct napi_struct *napi) +{ + struct sk_buff *oldest = NULL; + unsigned long age = jiffies; + int i; + + for (i = 0; i < GRO_HASH_BUCKETS; i++) { + struct list_head *head = &napi->gro_hash[i]; + struct sk_buff *skb; + + if (list_empty(head)) + continue; + + skb = list_last_entry(head, struct sk_buff, list); + if (!oldest || time_before(NAPI_GRO_CB(skb)->age, age)) { + oldest = skb; + age = NAPI_GRO_CB(skb)->age; + } + } + + /* We are called with napi->gro_count >= MAX_GRO_SKBS, so this is + * impossible. + */ + if (WARN_ON_ONCE(!oldest)) + return; + + /* Do not adjust napi->gro_count, caller is adding a new SKB to + * the chain. + */ + list_del(&oldest->list); + napi_gro_complete(oldest); +} + static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { struct list_head *head = &offload_base; struct packet_offload *ptype; __be16 type = skb->protocol; + struct list_head *gro_head; struct sk_buff *pp = NULL; enum gro_result ret; int same_flow; @@ -4977,7 +5028,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff if (netif_elide_gro(skb->dev)) goto normal; - gro_list_prepare(napi, skb); + gro_head = gro_list_prepare(napi, skb); rcu_read_lock(); list_for_each_entry_rcu(ptype, head, list) { @@ -5011,7 +5062,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff NAPI_GRO_CB(skb)->csum_valid = 0; } - pp = ptype->callbacks.gro_receive(&napi->gro_list, skb); + pp = ptype->callbacks.gro_receive(gro_head, skb); break; } rcu_read_unlock(); @@ -5040,11 +5091,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff goto normal; if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) { - struct sk_buff *nskb; - - nskb = list_last_entry(&napi->gro_list, struct sk_buff, list); - list_del(&nskb->list); - napi_gro_complete(nskb); + gro_flush_oldest(napi); } else { napi->gro_count++; } @@ -5052,7 +5099,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff NAPI_GRO_CB(skb)->age = jiffies; NAPI_GRO_CB(skb)->last = skb; skb_shinfo(skb)->gso_size = skb_gro_len(skb); - list_add(&skb->list, &napi->gro_list); + list_add(&skb->list, gro_head); ret = GRO_HELD; pull: @@ -5458,7 +5505,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done) NAPIF_STATE_IN_BUSY_POLL))) return false; - if (!list_empty(&n->gro_list)) { + if (n->gro_count) { unsigned long timeout = 0; if (work_done) @@ -5667,7 +5714,7 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) /* Note : we use a relaxed variant of napi_schedule_prep() not setting * NAPI_STATE_MISSED, since we do not react to a device IRQ. */ - if (!list_empty(&napi->gro_list) && !napi_disable_pending(napi) && + if (napi->gro_count && !napi_disable_pending(napi) && !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) __napi_schedule_irqoff(napi); @@ -5677,11 +5724,14 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) void netif_napi_add(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int weight) { + int i; + INIT_LIST_HEAD(&napi->poll_list); hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); napi->timer.function = napi_watchdog; napi->gro_count = 0; - INIT_LIST_HEAD(&napi->gro_list); + for (i = 0; i < GRO_HASH_BUCKETS; i++) + INIT_LIST_HEAD(&napi->gro_hash[i]); napi->skb = NULL; napi->poll = poll; if (weight > NAPI_POLL_WEIGHT) @@ -5714,12 +5764,16 @@ void napi_disable(struct napi_struct *n) } EXPORT_SYMBOL(napi_disable); -static void gro_list_free(struct list_head *head) +static void flush_gro_hash(struct napi_struct *napi) { - struct sk_buff *skb, *p; + int i; - list_for_each_entry_safe(skb, p, head, list) - kfree_skb(skb); + for (i = 0; i < GRO_HASH_BUCKETS; i++) { + struct sk_buff *skb, *n; + + list_for_each_entry_safe(skb, n, &napi->gro_hash[i], list) + kfree_skb(skb); + } } /* Must be called in process context */ @@ -5731,8 +5785,7 @@ void netif_napi_del(struct napi_struct *napi) list_del_init(&napi->dev_list); napi_free_frags(napi); - gro_list_free(&napi->gro_list); - INIT_LIST_HEAD(&napi->gro_list); + flush_gro_hash(napi); napi->gro_count = 0; } EXPORT_SYMBOL(netif_napi_del); @@ -5775,7 +5828,7 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) goto out_unlock; } - if (!list_empty(&n->gro_list)) { + if (n->gro_count) { /* flush too old packets * If HZ < 1000, flush all packets. */ -- cgit v1.2.3-59-g8ed1b From a8f62d0c6fe533e07cd1acce7588278f9d6e7720 Mon Sep 17 00:00:00 2001 From: Yangbo Lu Date: Mon, 25 Jun 2018 20:37:08 +0800 Subject: ptp: support DPAA FMan 1588 timer in ptp_qoriq This patch is to support DPAA (Data Path Acceleration Architecture) 1588 timer by adding "fsl,fman-ptp-timer" compatible, sharing interrupt with FMan, adding FSL_DPAA_ETH dependency, and fixing up register offset. Signed-off-by: Yangbo Lu Acked-by: Richard Cochran Acked-by: Madalin Bucur Signed-off-by: David S. Miller --- drivers/ptp/Kconfig | 2 +- drivers/ptp/ptp_qoriq.c | 104 +++++++++++++++++++++++++++--------------- include/linux/fsl/ptp_qoriq.h | 38 ++++++++++++--- 3 files changed, 98 insertions(+), 46 deletions(-) (limited to 'include/linux') diff --git a/drivers/ptp/Kconfig b/drivers/ptp/Kconfig index 474c988d2e95..d137c480db46 100644 --- a/drivers/ptp/Kconfig +++ b/drivers/ptp/Kconfig @@ -43,7 +43,7 @@ config PTP_1588_CLOCK_DTE config PTP_1588_CLOCK_QORIQ tristate "Freescale QorIQ 1588 timer as PTP clock" - depends on GIANFAR + depends on GIANFAR || FSL_DPAA_ETH depends on PTP_1588_CLOCK default y help diff --git a/drivers/ptp/ptp_qoriq.c b/drivers/ptp/ptp_qoriq.c index e8652c148c52..a14c317b5a38 100644 --- a/drivers/ptp/ptp_qoriq.c +++ b/drivers/ptp/ptp_qoriq.c @@ -39,11 +39,12 @@ /* Caller must hold qoriq_ptp->lock. */ static u64 tmr_cnt_read(struct qoriq_ptp *qoriq_ptp) { + struct qoriq_ptp_registers *regs = &qoriq_ptp->regs; u64 ns; u32 lo, hi; - lo = qoriq_read(&qoriq_ptp->regs->tmr_cnt_l); - hi = qoriq_read(&qoriq_ptp->regs->tmr_cnt_h); + lo = qoriq_read(®s->ctrl_regs->tmr_cnt_l); + hi = qoriq_read(®s->ctrl_regs->tmr_cnt_h); ns = ((u64) hi) << 32; ns |= lo; return ns; @@ -52,16 +53,18 @@ static u64 tmr_cnt_read(struct qoriq_ptp *qoriq_ptp) /* Caller must hold qoriq_ptp->lock. */ static void tmr_cnt_write(struct qoriq_ptp *qoriq_ptp, u64 ns) { + struct qoriq_ptp_registers *regs = &qoriq_ptp->regs; u32 hi = ns >> 32; u32 lo = ns & 0xffffffff; - qoriq_write(&qoriq_ptp->regs->tmr_cnt_l, lo); - qoriq_write(&qoriq_ptp->regs->tmr_cnt_h, hi); + qoriq_write(®s->ctrl_regs->tmr_cnt_l, lo); + qoriq_write(®s->ctrl_regs->tmr_cnt_h, hi); } /* Caller must hold qoriq_ptp->lock. */ static void set_alarm(struct qoriq_ptp *qoriq_ptp) { + struct qoriq_ptp_registers *regs = &qoriq_ptp->regs; u64 ns; u32 lo, hi; @@ -70,16 +73,18 @@ static void set_alarm(struct qoriq_ptp *qoriq_ptp) ns -= qoriq_ptp->tclk_period; hi = ns >> 32; lo = ns & 0xffffffff; - qoriq_write(&qoriq_ptp->regs->tmr_alarm1_l, lo); - qoriq_write(&qoriq_ptp->regs->tmr_alarm1_h, hi); + qoriq_write(®s->alarm_regs->tmr_alarm1_l, lo); + qoriq_write(®s->alarm_regs->tmr_alarm1_h, hi); } /* Caller must hold qoriq_ptp->lock. */ static void set_fipers(struct qoriq_ptp *qoriq_ptp) { + struct qoriq_ptp_registers *regs = &qoriq_ptp->regs; + set_alarm(qoriq_ptp); - qoriq_write(&qoriq_ptp->regs->tmr_fiper1, qoriq_ptp->tmr_fiper1); - qoriq_write(&qoriq_ptp->regs->tmr_fiper2, qoriq_ptp->tmr_fiper2); + qoriq_write(®s->fiper_regs->tmr_fiper1, qoriq_ptp->tmr_fiper1); + qoriq_write(®s->fiper_regs->tmr_fiper2, qoriq_ptp->tmr_fiper2); } /* @@ -89,16 +94,17 @@ static void set_fipers(struct qoriq_ptp *qoriq_ptp) static irqreturn_t isr(int irq, void *priv) { struct qoriq_ptp *qoriq_ptp = priv; + struct qoriq_ptp_registers *regs = &qoriq_ptp->regs; struct ptp_clock_event event; u64 ns; u32 ack = 0, lo, hi, mask, val; - val = qoriq_read(&qoriq_ptp->regs->tmr_tevent); + val = qoriq_read(®s->ctrl_regs->tmr_tevent); if (val & ETS1) { ack |= ETS1; - hi = qoriq_read(&qoriq_ptp->regs->tmr_etts1_h); - lo = qoriq_read(&qoriq_ptp->regs->tmr_etts1_l); + hi = qoriq_read(®s->etts_regs->tmr_etts1_h); + lo = qoriq_read(®s->etts_regs->tmr_etts1_l); event.type = PTP_CLOCK_EXTTS; event.index = 0; event.timestamp = ((u64) hi) << 32; @@ -108,8 +114,8 @@ static irqreturn_t isr(int irq, void *priv) if (val & ETS2) { ack |= ETS2; - hi = qoriq_read(&qoriq_ptp->regs->tmr_etts2_h); - lo = qoriq_read(&qoriq_ptp->regs->tmr_etts2_l); + hi = qoriq_read(®s->etts_regs->tmr_etts2_h); + lo = qoriq_read(®s->etts_regs->tmr_etts2_l); event.type = PTP_CLOCK_EXTTS; event.index = 1; event.timestamp = ((u64) hi) << 32; @@ -130,16 +136,16 @@ static irqreturn_t isr(int irq, void *priv) hi = ns >> 32; lo = ns & 0xffffffff; spin_lock(&qoriq_ptp->lock); - qoriq_write(&qoriq_ptp->regs->tmr_alarm2_l, lo); - qoriq_write(&qoriq_ptp->regs->tmr_alarm2_h, hi); + qoriq_write(®s->alarm_regs->tmr_alarm2_l, lo); + qoriq_write(®s->alarm_regs->tmr_alarm2_h, hi); spin_unlock(&qoriq_ptp->lock); qoriq_ptp->alarm_value = ns; } else { - qoriq_write(&qoriq_ptp->regs->tmr_tevent, ALM2); + qoriq_write(®s->ctrl_regs->tmr_tevent, ALM2); spin_lock(&qoriq_ptp->lock); - mask = qoriq_read(&qoriq_ptp->regs->tmr_temask); + mask = qoriq_read(®s->ctrl_regs->tmr_temask); mask &= ~ALM2EN; - qoriq_write(&qoriq_ptp->regs->tmr_temask, mask); + qoriq_write(®s->ctrl_regs->tmr_temask, mask); spin_unlock(&qoriq_ptp->lock); qoriq_ptp->alarm_value = 0; qoriq_ptp->alarm_interval = 0; @@ -153,7 +159,7 @@ static irqreturn_t isr(int irq, void *priv) } if (ack) { - qoriq_write(&qoriq_ptp->regs->tmr_tevent, ack); + qoriq_write(®s->ctrl_regs->tmr_tevent, ack); return IRQ_HANDLED; } else return IRQ_NONE; @@ -169,6 +175,7 @@ static int ptp_qoriq_adjfine(struct ptp_clock_info *ptp, long scaled_ppm) u32 tmr_add; int neg_adj = 0; struct qoriq_ptp *qoriq_ptp = container_of(ptp, struct qoriq_ptp, caps); + struct qoriq_ptp_registers *regs = &qoriq_ptp->regs; if (scaled_ppm < 0) { neg_adj = 1; @@ -186,7 +193,7 @@ static int ptp_qoriq_adjfine(struct ptp_clock_info *ptp, long scaled_ppm) tmr_add = neg_adj ? tmr_add - diff : tmr_add + diff; - qoriq_write(&qoriq_ptp->regs->tmr_add, tmr_add); + qoriq_write(®s->ctrl_regs->tmr_add, tmr_add); return 0; } @@ -250,6 +257,7 @@ static int ptp_qoriq_enable(struct ptp_clock_info *ptp, struct ptp_clock_request *rq, int on) { struct qoriq_ptp *qoriq_ptp = container_of(ptp, struct qoriq_ptp, caps); + struct qoriq_ptp_registers *regs = &qoriq_ptp->regs; unsigned long flags; u32 bit, mask; @@ -266,23 +274,23 @@ static int ptp_qoriq_enable(struct ptp_clock_info *ptp, return -EINVAL; } spin_lock_irqsave(&qoriq_ptp->lock, flags); - mask = qoriq_read(&qoriq_ptp->regs->tmr_temask); + mask = qoriq_read(®s->ctrl_regs->tmr_temask); if (on) mask |= bit; else mask &= ~bit; - qoriq_write(&qoriq_ptp->regs->tmr_temask, mask); + qoriq_write(®s->ctrl_regs->tmr_temask, mask); spin_unlock_irqrestore(&qoriq_ptp->lock, flags); return 0; case PTP_CLK_REQ_PPS: spin_lock_irqsave(&qoriq_ptp->lock, flags); - mask = qoriq_read(&qoriq_ptp->regs->tmr_temask); + mask = qoriq_read(®s->ctrl_regs->tmr_temask); if (on) mask |= PP1EN; else mask &= ~PP1EN; - qoriq_write(&qoriq_ptp->regs->tmr_temask, mask); + qoriq_write(®s->ctrl_regs->tmr_temask, mask); spin_unlock_irqrestore(&qoriq_ptp->lock, flags); return 0; @@ -313,10 +321,12 @@ static int qoriq_ptp_probe(struct platform_device *dev) { struct device_node *node = dev->dev.of_node; struct qoriq_ptp *qoriq_ptp; + struct qoriq_ptp_registers *regs; struct timespec64 now; int err = -ENOMEM; u32 tmr_ctrl; unsigned long flags; + void __iomem *base; qoriq_ptp = kzalloc(sizeof(*qoriq_ptp), GFP_KERNEL); if (!qoriq_ptp) @@ -351,7 +361,7 @@ static int qoriq_ptp_probe(struct platform_device *dev) pr_err("irq not in device tree\n"); goto no_node; } - if (request_irq(qoriq_ptp->irq, isr, 0, DRIVER, qoriq_ptp)) { + if (request_irq(qoriq_ptp->irq, isr, IRQF_SHARED, DRIVER, qoriq_ptp)) { pr_err("request_irq failed\n"); goto no_node; } @@ -368,12 +378,27 @@ static int qoriq_ptp_probe(struct platform_device *dev) spin_lock_init(&qoriq_ptp->lock); - qoriq_ptp->regs = ioremap(qoriq_ptp->rsrc->start, - resource_size(qoriq_ptp->rsrc)); - if (!qoriq_ptp->regs) { + base = ioremap(qoriq_ptp->rsrc->start, + resource_size(qoriq_ptp->rsrc)); + if (!base) { pr_err("ioremap ptp registers failed\n"); goto no_ioremap; } + + qoriq_ptp->base = base; + + if (of_device_is_compatible(node, "fsl,fman-ptp-timer")) { + qoriq_ptp->regs.ctrl_regs = base + FMAN_CTRL_REGS_OFFSET; + qoriq_ptp->regs.alarm_regs = base + FMAN_ALARM_REGS_OFFSET; + qoriq_ptp->regs.fiper_regs = base + FMAN_FIPER_REGS_OFFSET; + qoriq_ptp->regs.etts_regs = base + FMAN_ETTS_REGS_OFFSET; + } else { + qoriq_ptp->regs.ctrl_regs = base + CTRL_REGS_OFFSET; + qoriq_ptp->regs.alarm_regs = base + ALARM_REGS_OFFSET; + qoriq_ptp->regs.fiper_regs = base + FIPER_REGS_OFFSET; + qoriq_ptp->regs.etts_regs = base + ETTS_REGS_OFFSET; + } + ktime_get_real_ts64(&now); ptp_qoriq_settime(&qoriq_ptp->caps, &now); @@ -383,13 +408,14 @@ static int qoriq_ptp_probe(struct platform_device *dev) spin_lock_irqsave(&qoriq_ptp->lock, flags); - qoriq_write(&qoriq_ptp->regs->tmr_ctrl, tmr_ctrl); - qoriq_write(&qoriq_ptp->regs->tmr_add, qoriq_ptp->tmr_add); - qoriq_write(&qoriq_ptp->regs->tmr_prsc, qoriq_ptp->tmr_prsc); - qoriq_write(&qoriq_ptp->regs->tmr_fiper1, qoriq_ptp->tmr_fiper1); - qoriq_write(&qoriq_ptp->regs->tmr_fiper2, qoriq_ptp->tmr_fiper2); + regs = &qoriq_ptp->regs; + qoriq_write(®s->ctrl_regs->tmr_ctrl, tmr_ctrl); + qoriq_write(®s->ctrl_regs->tmr_add, qoriq_ptp->tmr_add); + qoriq_write(®s->ctrl_regs->tmr_prsc, qoriq_ptp->tmr_prsc); + qoriq_write(®s->fiper_regs->tmr_fiper1, qoriq_ptp->tmr_fiper1); + qoriq_write(®s->fiper_regs->tmr_fiper2, qoriq_ptp->tmr_fiper2); set_alarm(qoriq_ptp); - qoriq_write(&qoriq_ptp->regs->tmr_ctrl, tmr_ctrl|FIPERST|RTPE|TE|FRD); + qoriq_write(®s->ctrl_regs->tmr_ctrl, tmr_ctrl|FIPERST|RTPE|TE|FRD); spin_unlock_irqrestore(&qoriq_ptp->lock, flags); @@ -405,7 +431,7 @@ static int qoriq_ptp_probe(struct platform_device *dev) return 0; no_clock: - iounmap(qoriq_ptp->regs); + iounmap(qoriq_ptp->base); no_ioremap: release_resource(qoriq_ptp->rsrc); no_resource: @@ -419,12 +445,13 @@ no_memory: static int qoriq_ptp_remove(struct platform_device *dev) { struct qoriq_ptp *qoriq_ptp = platform_get_drvdata(dev); + struct qoriq_ptp_registers *regs = &qoriq_ptp->regs; - qoriq_write(&qoriq_ptp->regs->tmr_temask, 0); - qoriq_write(&qoriq_ptp->regs->tmr_ctrl, 0); + qoriq_write(®s->ctrl_regs->tmr_temask, 0); + qoriq_write(®s->ctrl_regs->tmr_ctrl, 0); ptp_clock_unregister(qoriq_ptp->clock); - iounmap(qoriq_ptp->regs); + iounmap(qoriq_ptp->base); release_resource(qoriq_ptp->rsrc); free_irq(qoriq_ptp->irq, qoriq_ptp); kfree(qoriq_ptp); @@ -434,6 +461,7 @@ static int qoriq_ptp_remove(struct platform_device *dev) static const struct of_device_id match_table[] = { { .compatible = "fsl,etsec-ptp" }, + { .compatible = "fsl,fman-ptp-timer" }, {}, }; MODULE_DEVICE_TABLE(of, match_table); diff --git a/include/linux/fsl/ptp_qoriq.h b/include/linux/fsl/ptp_qoriq.h index b462d9ea8007..dc3dac40f069 100644 --- a/include/linux/fsl/ptp_qoriq.h +++ b/include/linux/fsl/ptp_qoriq.h @@ -11,9 +11,8 @@ /* * qoriq ptp registers - * Generated by regen.tcl on Thu May 13 01:38:57 PM CEST 2010 */ -struct qoriq_ptp_registers { +struct ctrl_regs { u32 tmr_ctrl; /* Timer control register */ u32 tmr_tevent; /* Timestamp event register */ u32 tmr_temask; /* Timer event mask register */ @@ -28,22 +27,47 @@ struct qoriq_ptp_registers { u8 res1[4]; u32 tmroff_h; /* Timer offset high */ u32 tmroff_l; /* Timer offset low */ - u8 res2[8]; +}; + +struct alarm_regs { u32 tmr_alarm1_h; /* Timer alarm 1 high register */ u32 tmr_alarm1_l; /* Timer alarm 1 high register */ u32 tmr_alarm2_h; /* Timer alarm 2 high register */ u32 tmr_alarm2_l; /* Timer alarm 2 high register */ - u8 res3[48]; +}; + +struct fiper_regs { u32 tmr_fiper1; /* Timer fixed period interval */ u32 tmr_fiper2; /* Timer fixed period interval */ u32 tmr_fiper3; /* Timer fixed period interval */ - u8 res4[20]; +}; + +struct etts_regs { u32 tmr_etts1_h; /* Timestamp of general purpose external trigger */ u32 tmr_etts1_l; /* Timestamp of general purpose external trigger */ u32 tmr_etts2_h; /* Timestamp of general purpose external trigger */ u32 tmr_etts2_l; /* Timestamp of general purpose external trigger */ }; +struct qoriq_ptp_registers { + struct ctrl_regs __iomem *ctrl_regs; + struct alarm_regs __iomem *alarm_regs; + struct fiper_regs __iomem *fiper_regs; + struct etts_regs __iomem *etts_regs; +}; + +/* Offset definitions for the four register groups */ +#define CTRL_REGS_OFFSET 0x0 +#define ALARM_REGS_OFFSET 0x40 +#define FIPER_REGS_OFFSET 0x80 +#define ETTS_REGS_OFFSET 0xa0 + +#define FMAN_CTRL_REGS_OFFSET 0x80 +#define FMAN_ALARM_REGS_OFFSET 0xb8 +#define FMAN_FIPER_REGS_OFFSET 0xd0 +#define FMAN_ETTS_REGS_OFFSET 0xe0 + + /* Bit definitions for the TMR_CTRL register */ #define ALM1P (1<<31) /* Alarm1 output polarity */ #define ALM2P (1<<30) /* Alarm2 output polarity */ @@ -105,10 +129,10 @@ struct qoriq_ptp_registers { #define DRIVER "ptp_qoriq" #define DEFAULT_CKSEL 1 #define N_EXT_TS 2 -#define REG_SIZE sizeof(struct qoriq_ptp_registers) struct qoriq_ptp { - struct qoriq_ptp_registers __iomem *regs; + void __iomem *base; + struct qoriq_ptp_registers regs; spinlock_t lock; /* protects regs */ struct ptp_clock *clock; struct ptp_clock_info caps; -- cgit v1.2.3-59-g8ed1b From e7d4a95da86e0b048702765bbdcdc968aaf312e7 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 20 Jun 2018 08:58:28 +0200 Subject: bitfield: fix *_encode_bits() There's a bug in *_encode_bits() in using ~field_multiplier() for the check whether or not the constant value fits into the field, this is wrong and clearly ~field_mask() was intended. This was triggering for me for both constant and non-constant values. Additionally, make this case actually into an compile error. Declaring the extern function that will never exist with just a warning is pointless as then later we'll just get a link error. While at it, also fix the indentation in those lines I'm touching. Finally, as suggested by Andy Shevchenko, add some tests and for that introduce also u8 helpers. The tests don't compile without the fix, showing that it's necessary. Fixes: 00b0c9b82663 ("Add primitives for manipulating bitfields both in host- and fixed-endian.") Reviewed-by: Andy Shevchenko Signed-off-by: Johannes Berg Signed-off-by: Kalle Valo --- include/linux/bitfield.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bitfield.h b/include/linux/bitfield.h index cf2588d81148..147a7bb341dd 100644 --- a/include/linux/bitfield.h +++ b/include/linux/bitfield.h @@ -104,7 +104,7 @@ (typeof(_mask))(((_reg) & (_mask)) >> __bf_shf(_mask)); \ }) -extern void __compiletime_warning("value doesn't fit into mask") +extern void __compiletime_error("value doesn't fit into mask") __field_overflow(void); extern void __compiletime_error("bad bitfield mask") __bad_mask(void); @@ -121,8 +121,8 @@ static __always_inline u64 field_mask(u64 field) #define ____MAKE_OP(type,base,to,from) \ static __always_inline __##type type##_encode_bits(base v, base field) \ { \ - if (__builtin_constant_p(v) && (v & ~field_multiplier(field))) \ - __field_overflow(); \ + if (__builtin_constant_p(v) && (v & ~field_mask(field))) \ + __field_overflow(); \ return to((v & field_mask(field)) * field_multiplier(field)); \ } \ static __always_inline __##type type##_replace_bits(__##type old, \ -- cgit v1.2.3-59-g8ed1b From 37a3862e1238262e9866d5d66e5f5f9069cee3a1 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 20 Jun 2018 08:58:29 +0200 Subject: bitfield: add u8 helpers There's no reason why we shouldn't pack/unpack bits into/from u8 values/registers/etc., so add u8 helpers. Use the ____MAKE_OP() macro directly to avoid having nonsense le8_encode_bits() and similar functions. Reviewed-by: Andy Shevchenko Signed-off-by: Johannes Berg Signed-off-by: Kalle Valo --- include/linux/bitfield.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bitfield.h b/include/linux/bitfield.h index 147a7bb341dd..65a6981eef7b 100644 --- a/include/linux/bitfield.h +++ b/include/linux/bitfield.h @@ -143,6 +143,7 @@ static __always_inline base type##_get_bits(__##type v, base field) \ ____MAKE_OP(le##size,u##size,cpu_to_le##size,le##size##_to_cpu) \ ____MAKE_OP(be##size,u##size,cpu_to_be##size,be##size##_to_cpu) \ ____MAKE_OP(u##size,u##size,,) +____MAKE_OP(u8,u8,,) __MAKE_OP(16) __MAKE_OP(32) __MAKE_OP(64) -- cgit v1.2.3-59-g8ed1b From 80d19669ecd34423e85ca04f2210b0e42a47cb16 Mon Sep 17 00:00:00 2001 From: Amritha Nambiar Date: Fri, 29 Jun 2018 21:26:41 -0700 Subject: net: Refactor XPS for CPUs and Rx queues Refactor XPS code to support Tx queue selection based on CPU(s) map or Rx queue(s) map. Signed-off-by: Amritha Nambiar Signed-off-by: David S. Miller --- include/linux/cpumask.h | 11 ++- include/linux/netdevice.h | 98 ++++++++++++++++++++- net/core/dev.c | 211 ++++++++++++++++++++++++++++++---------------- net/core/net-sysfs.c | 4 +- 4 files changed, 244 insertions(+), 80 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index bf53d893ad02..57f20a0a7794 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -115,12 +115,17 @@ extern struct cpumask __cpu_active_mask; #define cpu_active(cpu) ((cpu) == 0) #endif -/* verify cpu argument to cpumask_* operators */ -static inline unsigned int cpumask_check(unsigned int cpu) +static inline void cpu_max_bits_warn(unsigned int cpu, unsigned int bits) { #ifdef CONFIG_DEBUG_PER_CPU_MAPS - WARN_ON_ONCE(cpu >= nr_cpumask_bits); + WARN_ON_ONCE(cpu >= bits); #endif /* CONFIG_DEBUG_PER_CPU_MAPS */ +} + +/* verify cpu argument to cpumask_* operators */ +static inline unsigned int cpumask_check(unsigned int cpu) +{ + cpu_max_bits_warn(cpu, nr_cpumask_bits); return cpu; } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c6b377a15869..8bf8d6149f79 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -731,10 +731,15 @@ struct xps_map { */ struct xps_dev_maps { struct rcu_head rcu; - struct xps_map __rcu *cpu_map[0]; + struct xps_map __rcu *attr_map[0]; /* Either CPUs map or RXQs map */ }; -#define XPS_DEV_MAPS_SIZE(_tcs) (sizeof(struct xps_dev_maps) + \ + +#define XPS_CPU_DEV_MAPS_SIZE(_tcs) (sizeof(struct xps_dev_maps) + \ (nr_cpu_ids * (_tcs) * sizeof(struct xps_map *))) + +#define XPS_RXQ_DEV_MAPS_SIZE(_tcs, _rxqs) (sizeof(struct xps_dev_maps) +\ + (_rxqs * (_tcs) * sizeof(struct xps_map *))) + #endif /* CONFIG_XPS */ #define TC_MAX_QUEUE 16 @@ -1910,7 +1915,8 @@ struct net_device { int watchdog_timeo; #ifdef CONFIG_XPS - struct xps_dev_maps __rcu *xps_maps; + struct xps_dev_maps __rcu *xps_cpus_map; + struct xps_dev_maps __rcu *xps_rxqs_map; #endif #ifdef CONFIG_NET_CLS_ACT struct mini_Qdisc __rcu *miniq_egress; @@ -3259,6 +3265,92 @@ static inline void netif_wake_subqueue(struct net_device *dev, u16 queue_index) #ifdef CONFIG_XPS int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, u16 index); +int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, + u16 index, bool is_rxqs_map); + +/** + * netif_attr_test_mask - Test a CPU or Rx queue set in a mask + * @j: CPU/Rx queue index + * @mask: bitmask of all cpus/rx queues + * @nr_bits: number of bits in the bitmask + * + * Test if a CPU or Rx queue index is set in a mask of all CPU/Rx queues. + */ +static inline bool netif_attr_test_mask(unsigned long j, + const unsigned long *mask, + unsigned int nr_bits) +{ + cpu_max_bits_warn(j, nr_bits); + return test_bit(j, mask); +} + +/** + * netif_attr_test_online - Test for online CPU/Rx queue + * @j: CPU/Rx queue index + * @online_mask: bitmask for CPUs/Rx queues that are online + * @nr_bits: number of bits in the bitmask + * + * Returns true if a CPU/Rx queue is online. + */ +static inline bool netif_attr_test_online(unsigned long j, + const unsigned long *online_mask, + unsigned int nr_bits) +{ + cpu_max_bits_warn(j, nr_bits); + + if (online_mask) + return test_bit(j, online_mask); + + return (j < nr_bits); +} + +/** + * netif_attrmask_next - get the next CPU/Rx queue in a cpu/Rx queues mask + * @n: CPU/Rx queue index + * @srcp: the cpumask/Rx queue mask pointer + * @nr_bits: number of bits in the bitmask + * + * Returns >= nr_bits if no further CPUs/Rx queues set. + */ +static inline unsigned int netif_attrmask_next(int n, const unsigned long *srcp, + unsigned int nr_bits) +{ + /* -1 is a legal arg here. */ + if (n != -1) + cpu_max_bits_warn(n, nr_bits); + + if (srcp) + return find_next_bit(srcp, nr_bits, n + 1); + + return n + 1; +} + +/** + * netif_attrmask_next_and - get the next CPU/Rx queue in *src1p & *src2p + * @n: CPU/Rx queue index + * @src1p: the first CPUs/Rx queues mask pointer + * @src2p: the second CPUs/Rx queues mask pointer + * @nr_bits: number of bits in the bitmask + * + * Returns >= nr_bits if no further CPUs/Rx queues set in both. + */ +static inline int netif_attrmask_next_and(int n, const unsigned long *src1p, + const unsigned long *src2p, + unsigned int nr_bits) +{ + /* -1 is a legal arg here. */ + if (n != -1) + cpu_max_bits_warn(n, nr_bits); + + if (src1p && src2p) + return find_next_and_bit(src1p, src2p, nr_bits, n + 1); + else if (src1p) + return find_next_bit(src1p, nr_bits, n + 1); + else if (src2p) + return find_next_bit(src2p, nr_bits, n + 1); + + return n + 1; +} #else static inline int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, diff --git a/net/core/dev.c b/net/core/dev.c index dffed642e686..71059558dc39 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2092,7 +2092,7 @@ static bool remove_xps_queue(struct xps_dev_maps *dev_maps, int pos; if (dev_maps) - map = xmap_dereference(dev_maps->cpu_map[tci]); + map = xmap_dereference(dev_maps->attr_map[tci]); if (!map) return false; @@ -2105,7 +2105,7 @@ static bool remove_xps_queue(struct xps_dev_maps *dev_maps, break; } - RCU_INIT_POINTER(dev_maps->cpu_map[tci], NULL); + RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL); kfree_rcu(map, rcu); return false; } @@ -2135,31 +2135,58 @@ static bool remove_xps_queue_cpu(struct net_device *dev, return active; } +static void clean_xps_maps(struct net_device *dev, const unsigned long *mask, + struct xps_dev_maps *dev_maps, unsigned int nr_ids, + u16 offset, u16 count, bool is_rxqs_map) +{ + bool active = false; + int i, j; + + for (j = -1; j = netif_attrmask_next(j, mask, nr_ids), + j < nr_ids;) + active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, + count); + if (!active) { + if (is_rxqs_map) { + RCU_INIT_POINTER(dev->xps_rxqs_map, NULL); + } else { + RCU_INIT_POINTER(dev->xps_cpus_map, NULL); + + for (i = offset + (count - 1); count--; i--) + netdev_queue_numa_node_write( + netdev_get_tx_queue(dev, i), + NUMA_NO_NODE); + } + kfree_rcu(dev_maps, rcu); + } +} + static void netif_reset_xps_queues(struct net_device *dev, u16 offset, u16 count) { + const unsigned long *possible_mask = NULL; struct xps_dev_maps *dev_maps; - int cpu, i; - bool active = false; + unsigned int nr_ids; mutex_lock(&xps_map_mutex); - dev_maps = xmap_dereference(dev->xps_maps); - if (!dev_maps) - goto out_no_maps; - - for_each_possible_cpu(cpu) - active |= remove_xps_queue_cpu(dev, dev_maps, cpu, - offset, count); + dev_maps = xmap_dereference(dev->xps_rxqs_map); + if (dev_maps) { + nr_ids = dev->num_rx_queues; + clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, offset, + count, true); - if (!active) { - RCU_INIT_POINTER(dev->xps_maps, NULL); - kfree_rcu(dev_maps, rcu); } - for (i = offset + (count - 1); count--; i--) - netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i), - NUMA_NO_NODE); + dev_maps = xmap_dereference(dev->xps_cpus_map); + if (!dev_maps) + goto out_no_maps; + + if (num_possible_cpus() > 1) + possible_mask = cpumask_bits(cpu_possible_mask); + nr_ids = nr_cpu_ids; + clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, offset, count, + false); out_no_maps: mutex_unlock(&xps_map_mutex); @@ -2170,8 +2197,8 @@ static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index) netif_reset_xps_queues(dev, index, dev->num_tx_queues - index); } -static struct xps_map *expand_xps_map(struct xps_map *map, - int cpu, u16 index) +static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index, + u16 index, bool is_rxqs_map) { struct xps_map *new_map; int alloc_len = XPS_MIN_MAP_ALLOC; @@ -2183,7 +2210,7 @@ static struct xps_map *expand_xps_map(struct xps_map *map, return map; } - /* Need to add queue to this CPU's existing map */ + /* Need to add tx-queue to this CPU's/rx-queue's existing map */ if (map) { if (pos < map->alloc_len) return map; @@ -2191,9 +2218,14 @@ static struct xps_map *expand_xps_map(struct xps_map *map, alloc_len = map->alloc_len * 2; } - /* Need to allocate new map to store queue on this CPU's map */ - new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL, - cpu_to_node(cpu)); + /* Need to allocate new map to store tx-queue on this CPU's/rx-queue's + * map + */ + if (is_rxqs_map) + new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL); + else + new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL, + cpu_to_node(attr_index)); if (!new_map) return NULL; @@ -2205,14 +2237,16 @@ static struct xps_map *expand_xps_map(struct xps_map *map, return new_map; } -int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, - u16 index) +int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, + u16 index, bool is_rxqs_map) { + const unsigned long *online_mask = NULL, *possible_mask = NULL; struct xps_dev_maps *dev_maps, *new_dev_maps = NULL; - int i, cpu, tci, numa_node_id = -2; + int i, j, tci, numa_node_id = -2; int maps_sz, num_tc = 1, tc = 0; struct xps_map *map, *new_map; bool active = false; + unsigned int nr_ids; if (dev->num_tc) { num_tc = dev->num_tc; @@ -2221,16 +2255,27 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, return -EINVAL; } - maps_sz = XPS_DEV_MAPS_SIZE(num_tc); - if (maps_sz < L1_CACHE_BYTES) - maps_sz = L1_CACHE_BYTES; - mutex_lock(&xps_map_mutex); + if (is_rxqs_map) { + maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues); + dev_maps = xmap_dereference(dev->xps_rxqs_map); + nr_ids = dev->num_rx_queues; + } else { + maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc); + if (num_possible_cpus() > 1) { + online_mask = cpumask_bits(cpu_online_mask); + possible_mask = cpumask_bits(cpu_possible_mask); + } + dev_maps = xmap_dereference(dev->xps_cpus_map); + nr_ids = nr_cpu_ids; + } - dev_maps = xmap_dereference(dev->xps_maps); + if (maps_sz < L1_CACHE_BYTES) + maps_sz = L1_CACHE_BYTES; /* allocate memory for queue storage */ - for_each_cpu_and(cpu, cpu_online_mask, mask) { + for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids), + j < nr_ids;) { if (!new_dev_maps) new_dev_maps = kzalloc(maps_sz, GFP_KERNEL); if (!new_dev_maps) { @@ -2238,73 +2283,81 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, return -ENOMEM; } - tci = cpu * num_tc + tc; - map = dev_maps ? xmap_dereference(dev_maps->cpu_map[tci]) : + tci = j * num_tc + tc; + map = dev_maps ? xmap_dereference(dev_maps->attr_map[tci]) : NULL; - map = expand_xps_map(map, cpu, index); + map = expand_xps_map(map, j, index, is_rxqs_map); if (!map) goto error; - RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map); + RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); } if (!new_dev_maps) goto out_no_new_maps; - for_each_possible_cpu(cpu) { + for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids), + j < nr_ids;) { /* copy maps belonging to foreign traffic classes */ - for (i = tc, tci = cpu * num_tc; dev_maps && i--; tci++) { + for (i = tc, tci = j * num_tc; dev_maps && i--; tci++) { /* fill in the new device map from the old device map */ - map = xmap_dereference(dev_maps->cpu_map[tci]); - RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map); + map = xmap_dereference(dev_maps->attr_map[tci]); + RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); } /* We need to explicitly update tci as prevous loop * could break out early if dev_maps is NULL. */ - tci = cpu * num_tc + tc; + tci = j * num_tc + tc; - if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) { - /* add queue to CPU maps */ + if (netif_attr_test_mask(j, mask, nr_ids) && + netif_attr_test_online(j, online_mask, nr_ids)) { + /* add tx-queue to CPU/rx-queue maps */ int pos = 0; - map = xmap_dereference(new_dev_maps->cpu_map[tci]); + map = xmap_dereference(new_dev_maps->attr_map[tci]); while ((pos < map->len) && (map->queues[pos] != index)) pos++; if (pos == map->len) map->queues[map->len++] = index; #ifdef CONFIG_NUMA - if (numa_node_id == -2) - numa_node_id = cpu_to_node(cpu); - else if (numa_node_id != cpu_to_node(cpu)) - numa_node_id = -1; + if (!is_rxqs_map) { + if (numa_node_id == -2) + numa_node_id = cpu_to_node(j); + else if (numa_node_id != cpu_to_node(j)) + numa_node_id = -1; + } #endif } else if (dev_maps) { /* fill in the new device map from the old device map */ - map = xmap_dereference(dev_maps->cpu_map[tci]); - RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map); + map = xmap_dereference(dev_maps->attr_map[tci]); + RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); } /* copy maps belonging to foreign traffic classes */ for (i = num_tc - tc, tci++; dev_maps && --i; tci++) { /* fill in the new device map from the old device map */ - map = xmap_dereference(dev_maps->cpu_map[tci]); - RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map); + map = xmap_dereference(dev_maps->attr_map[tci]); + RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); } } - rcu_assign_pointer(dev->xps_maps, new_dev_maps); + if (is_rxqs_map) + rcu_assign_pointer(dev->xps_rxqs_map, new_dev_maps); + else + rcu_assign_pointer(dev->xps_cpus_map, new_dev_maps); /* Cleanup old maps */ if (!dev_maps) goto out_no_old_maps; - for_each_possible_cpu(cpu) { - for (i = num_tc, tci = cpu * num_tc; i--; tci++) { - new_map = xmap_dereference(new_dev_maps->cpu_map[tci]); - map = xmap_dereference(dev_maps->cpu_map[tci]); + for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids), + j < nr_ids;) { + for (i = num_tc, tci = j * num_tc; i--; tci++) { + new_map = xmap_dereference(new_dev_maps->attr_map[tci]); + map = xmap_dereference(dev_maps->attr_map[tci]); if (map && map != new_map) kfree_rcu(map, rcu); } @@ -2317,19 +2370,23 @@ out_no_old_maps: active = true; out_no_new_maps: - /* update Tx queue numa node */ - netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index), - (numa_node_id >= 0) ? numa_node_id : - NUMA_NO_NODE); + if (!is_rxqs_map) { + /* update Tx queue numa node */ + netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index), + (numa_node_id >= 0) ? + numa_node_id : NUMA_NO_NODE); + } if (!dev_maps) goto out_no_maps; - /* removes queue from unused CPUs */ - for_each_possible_cpu(cpu) { - for (i = tc, tci = cpu * num_tc; i--; tci++) + /* removes tx-queue from unused CPUs/rx-queues */ + for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids), + j < nr_ids;) { + for (i = tc, tci = j * num_tc; i--; tci++) active |= remove_xps_queue(dev_maps, tci, index); - if (!cpumask_test_cpu(cpu, mask) || !cpu_online(cpu)) + if (!netif_attr_test_mask(j, mask, nr_ids) || + !netif_attr_test_online(j, online_mask, nr_ids)) active |= remove_xps_queue(dev_maps, tci, index); for (i = num_tc - tc, tci++; --i; tci++) active |= remove_xps_queue(dev_maps, tci, index); @@ -2337,7 +2394,10 @@ out_no_new_maps: /* free map if not active */ if (!active) { - RCU_INIT_POINTER(dev->xps_maps, NULL); + if (is_rxqs_map) + RCU_INIT_POINTER(dev->xps_rxqs_map, NULL); + else + RCU_INIT_POINTER(dev->xps_cpus_map, NULL); kfree_rcu(dev_maps, rcu); } @@ -2347,11 +2407,12 @@ out_no_maps: return 0; error: /* remove any maps that we added */ - for_each_possible_cpu(cpu) { - for (i = num_tc, tci = cpu * num_tc; i--; tci++) { - new_map = xmap_dereference(new_dev_maps->cpu_map[tci]); + for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids), + j < nr_ids;) { + for (i = num_tc, tci = j * num_tc; i--; tci++) { + new_map = xmap_dereference(new_dev_maps->attr_map[tci]); map = dev_maps ? - xmap_dereference(dev_maps->cpu_map[tci]) : + xmap_dereference(dev_maps->attr_map[tci]) : NULL; if (new_map && new_map != map) kfree(new_map); @@ -2363,6 +2424,12 @@ error: kfree(new_dev_maps); return -ENOMEM; } + +int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, + u16 index) +{ + return __netif_set_xps_queue(dev, cpumask_bits(mask), index, false); +} EXPORT_SYMBOL(netif_set_xps_queue); #endif @@ -3384,7 +3451,7 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) int queue_index = -1; rcu_read_lock(); - dev_maps = rcu_dereference(dev->xps_maps); + dev_maps = rcu_dereference(dev->xps_cpus_map); if (dev_maps) { unsigned int tci = skb->sender_cpu - 1; @@ -3393,7 +3460,7 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) tci += netdev_get_prio_tc_map(dev, skb->priority); } - map = rcu_dereference(dev_maps->cpu_map[tci]); + map = rcu_dereference(dev_maps->attr_map[tci]); if (map) { if (map->len == 1) queue_index = map->queues[0]; diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index bb7e80f4ced3..b39987c81d53 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1227,13 +1227,13 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, return -ENOMEM; rcu_read_lock(); - dev_maps = rcu_dereference(dev->xps_maps); + dev_maps = rcu_dereference(dev->xps_cpus_map); if (dev_maps) { for_each_possible_cpu(cpu) { int i, tci = cpu * num_tc + tc; struct xps_map *map; - map = rcu_dereference(dev_maps->cpu_map[tci]); + map = rcu_dereference(dev_maps->attr_map[tci]); if (!map) continue; -- cgit v1.2.3-59-g8ed1b From 8a9c58d28d0f66569737a3295116710ed24573cd Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 2 Jul 2018 18:21:12 +0800 Subject: sctp: add support for dscp and flowlabel per transport Like some other per transport params, flowlabel and dscp are added in transport, asoc and sctp_sock. By default, transport sets its value from asoc's, and asoc does it from sctp_sock. flowlabel only works for ipv6 transport. Other than that they need to be passed down in sctp_xmit, flow4/6 also needs to set them before looking up route in get_dst. Note that it uses '& 0x100000' to check if flowlabel is set and '& 0x1' (tos 1st bit is unused) to check if dscp is set by users, so that they could be set to 0 by sockopt in next patch. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/linux/sctp.h | 7 +++++++ include/net/sctp/structs.h | 9 +++++++++ net/sctp/associola.c | 7 +++++++ net/sctp/ipv6.c | 11 +++++++++-- net/sctp/protocol.c | 16 ++++++++++++---- 5 files changed, 44 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sctp.h b/include/linux/sctp.h index b36c76635f18..83d94341e003 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -801,4 +801,11 @@ struct sctp_strreset_resptsn { __be32 receivers_next_tsn; }; +enum { + SCTP_DSCP_SET_MASK = 0x1, + SCTP_DSCP_VAL_MASK = 0xfc, + SCTP_FLOWLABEL_SET_MASK = 0x100000, + SCTP_FLOWLABEL_VAL_MASK = 0xfffff +}; + #endif /* __LINUX_SCTP_H__ */ diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 701a51736fa5..ab869e0d8326 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -193,6 +193,9 @@ struct sctp_sock { /* This is the max_retrans value for new associations. */ __u16 pathmaxrxt; + __u32 flowlabel; + __u8 dscp; + /* The initial Path MTU to use for new associations. */ __u32 pathmtu; @@ -895,6 +898,9 @@ struct sctp_transport { */ __u16 pathmaxrxt; + __u32 flowlabel; + __u8 dscp; + /* This is the partially failed retrans value for the transport * and will be initialized from the assocs value. This can be changed * using the SCTP_PEER_ADDR_THLDS socket option @@ -1772,6 +1778,9 @@ struct sctp_association { */ __u16 pathmaxrxt; + __u32 flowlabel; + __u8 dscp; + /* Flag that path mtu update is pending */ __u8 pmtu_pending; diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 5d5a16204d50..16ecfbc95614 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -115,6 +115,9 @@ static struct sctp_association *sctp_association_init( /* Initialize path max retrans value. */ asoc->pathmaxrxt = sp->pathmaxrxt; + asoc->flowlabel = sp->flowlabel; + asoc->dscp = sp->dscp; + /* Initialize default path MTU. */ asoc->pathmtu = sp->pathmtu; @@ -647,6 +650,10 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc, peer->sackdelay = asoc->sackdelay; peer->sackfreq = asoc->sackfreq; + if (addr->sa.sa_family == AF_INET6) + peer->flowlabel = asoc->flowlabel; + peer->dscp = asoc->dscp; + /* Enable/disable heartbeat, SACK delay, and path MTU discovery * based on association setting. */ diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 0cd2e764f47f..38102bf7f13e 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -209,12 +209,17 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport) struct sock *sk = skb->sk; struct ipv6_pinfo *np = inet6_sk(sk); struct flowi6 *fl6 = &transport->fl.u.ip6; + __u8 tclass = np->tclass; int res; pr_debug("%s: skb:%p, len:%d, src:%pI6 dst:%pI6\n", __func__, skb, skb->len, &fl6->saddr, &fl6->daddr); - IP6_ECN_flow_xmit(sk, fl6->flowlabel); + if (transport->dscp & SCTP_DSCP_SET_MASK) + tclass = transport->dscp & SCTP_DSCP_VAL_MASK; + + if (INET_ECN_is_capable(tclass)) + IP6_ECN_flow_xmit(sk, fl6->flowlabel); if (!(transport->param_flags & SPP_PMTUD_ENABLE)) skb->ignore_df = 1; @@ -223,7 +228,7 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport) rcu_read_lock(); res = ip6_xmit(sk, skb, fl6, sk->sk_mark, rcu_dereference(np->opt), - np->tclass); + tclass); rcu_read_unlock(); return res; } @@ -254,6 +259,8 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr, fl6->flowi6_oif = daddr->v6.sin6_scope_id; else if (asoc) fl6->flowi6_oif = asoc->base.sk->sk_bound_dev_if; + if (t->flowlabel & SCTP_FLOWLABEL_SET_MASK) + fl6->flowlabel = htonl(t->flowlabel & SCTP_FLOWLABEL_VAL_MASK); pr_debug("%s: dst=%pI6 ", __func__, &fl6->daddr); diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 67f73d3a1356..e948db29ab53 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -426,13 +426,16 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr, struct dst_entry *dst = NULL; union sctp_addr *daddr = &t->ipaddr; union sctp_addr dst_saddr; + __u8 tos = inet_sk(sk)->tos; + if (t->dscp & SCTP_DSCP_SET_MASK) + tos = t->dscp & SCTP_DSCP_VAL_MASK; memset(fl4, 0x0, sizeof(struct flowi4)); fl4->daddr = daddr->v4.sin_addr.s_addr; fl4->fl4_dport = daddr->v4.sin_port; fl4->flowi4_proto = IPPROTO_SCTP; if (asoc) { - fl4->flowi4_tos = RT_CONN_FLAGS(asoc->base.sk); + fl4->flowi4_tos = RT_CONN_FLAGS_TOS(asoc->base.sk, tos); fl4->flowi4_oif = asoc->base.sk->sk_bound_dev_if; fl4->fl4_sport = htons(asoc->base.bind_addr.port); } @@ -495,7 +498,7 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr, fl4->fl4_sport = laddr->a.v4.sin_port; flowi4_update_output(fl4, asoc->base.sk->sk_bound_dev_if, - RT_CONN_FLAGS(asoc->base.sk), + RT_CONN_FLAGS_TOS(asoc->base.sk, tos), daddr->v4.sin_addr.s_addr, laddr->a.v4.sin_addr.s_addr); @@ -971,16 +974,21 @@ static inline int sctp_v4_xmit(struct sk_buff *skb, struct sctp_transport *transport) { struct inet_sock *inet = inet_sk(skb->sk); + __u8 dscp = inet->tos; pr_debug("%s: skb:%p, len:%d, src:%pI4, dst:%pI4\n", __func__, skb, - skb->len, &transport->fl.u.ip4.saddr, &transport->fl.u.ip4.daddr); + skb->len, &transport->fl.u.ip4.saddr, + &transport->fl.u.ip4.daddr); + + if (transport->dscp & SCTP_DSCP_SET_MASK) + dscp = transport->dscp & SCTP_DSCP_VAL_MASK; inet->pmtudisc = transport->param_flags & SPP_PMTUD_ENABLE ? IP_PMTUDISC_DO : IP_PMTUDISC_DONT; SCTP_INC_STATS(sock_net(&inet->sk), SCTP_MIB_OUTSCTPPACKS); - return ip_queue_xmit(&inet->sk, skb, &transport->fl); + return __ip_queue_xmit(&inet->sk, skb, &transport->fl, dscp); } static struct sctp_af sctp_af_inet; -- cgit v1.2.3-59-g8ed1b From f6ad8c1bcdf014272d08c55b9469536952a0a771 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Mon, 2 Jul 2018 16:12:45 +0100 Subject: net: core: trivial netif_receive_skb_list() entry point Just calls netif_receive_skb() in a loop. Signed-off-by: Edward Cree Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + net/core/dev.c | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 64480a0f2c16..f67258f057ca 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3477,6 +3477,7 @@ int netif_rx(struct sk_buff *skb); int netif_rx_ni(struct sk_buff *skb); int netif_receive_skb(struct sk_buff *skb); int netif_receive_skb_core(struct sk_buff *skb); +void netif_receive_skb_list(struct list_head *head); gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb); void napi_gro_flush(struct napi_struct *napi, bool flush_old); struct sk_buff *napi_get_frags(struct napi_struct *napi); diff --git a/net/core/dev.c b/net/core/dev.c index 08d58e0debe5..85c456a4b551 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4906,6 +4906,25 @@ int netif_receive_skb(struct sk_buff *skb) } EXPORT_SYMBOL(netif_receive_skb); +/** + * netif_receive_skb_list - process many receive buffers from network + * @head: list of skbs to process. + * + * For now, just calls netif_receive_skb() in a loop, ignoring the + * return value. + * + * This function may only be called from softirq context and interrupts + * should be enabled. + */ +void netif_receive_skb_list(struct list_head *head) +{ + struct sk_buff *skb, *next; + + list_for_each_entry_safe(skb, next, head, list) + netif_receive_skb(skb); +} +EXPORT_SYMBOL(netif_receive_skb_list); + DEFINE_PER_CPU(struct work_struct, flush_works); /* Network device is going away, flush any packets still pending */ -- cgit v1.2.3-59-g8ed1b From 4ce0017a373afaaa9ef17614d8fa4f6fde261d18 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Mon, 2 Jul 2018 16:13:40 +0100 Subject: net: core: another layer of lists, around PF_MEMALLOC skb handling First example of a layer splitting the list (rather than merely taking individual packets off it). Involves new list.h function, list_cut_before(), like list_cut_position() but cuts on the other side of the given entry. Signed-off-by: Edward Cree Signed-off-by: David S. Miller --- include/linux/list.h | 30 ++++++++++++++++++++++++++++++ net/core/dev.c | 44 ++++++++++++++++++++++++++++++++++++-------- 2 files changed, 66 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/list.h b/include/linux/list.h index 4b129df4d46b..de04cc5ed536 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -285,6 +285,36 @@ static inline void list_cut_position(struct list_head *list, __list_cut_position(list, head, entry); } +/** + * list_cut_before - cut a list into two, before given entry + * @list: a new list to add all removed entries + * @head: a list with entries + * @entry: an entry within head, could be the head itself + * + * This helper moves the initial part of @head, up to but + * excluding @entry, from @head to @list. You should pass + * in @entry an element you know is on @head. @list should + * be an empty list or a list you do not care about losing + * its data. + * If @entry == @head, all entries on @head are moved to + * @list. + */ +static inline void list_cut_before(struct list_head *list, + struct list_head *head, + struct list_head *entry) +{ + if (head->next == entry) { + INIT_LIST_HEAD(list); + return; + } + list->next = head->next; + list->next->prev = list; + list->prev = entry->prev; + list->prev->next = list; + head->next = entry; + entry->prev = head; +} + static inline void __list_splice(const struct list_head *list, struct list_head *prev, struct list_head *next) diff --git a/net/core/dev.c b/net/core/dev.c index 1e87361df2ab..9aadef976e8c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4784,6 +4784,14 @@ int netif_receive_skb_core(struct sk_buff *skb) } EXPORT_SYMBOL(netif_receive_skb_core); +static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc) +{ + struct sk_buff *skb, *next; + + list_for_each_entry_safe(skb, next, head, list) + __netif_receive_skb_core(skb, pfmemalloc); +} + static int __netif_receive_skb(struct sk_buff *skb) { int ret; @@ -4809,6 +4817,34 @@ static int __netif_receive_skb(struct sk_buff *skb) return ret; } +static void __netif_receive_skb_list(struct list_head *head) +{ + unsigned long noreclaim_flag = 0; + struct sk_buff *skb, *next; + bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? */ + + list_for_each_entry_safe(skb, next, head, list) { + if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) { + struct list_head sublist; + + /* Handle the previous sublist */ + list_cut_before(&sublist, head, &skb->list); + __netif_receive_skb_list_core(&sublist, pfmemalloc); + pfmemalloc = !pfmemalloc; + /* See comments in __netif_receive_skb */ + if (pfmemalloc) + noreclaim_flag = memalloc_noreclaim_save(); + else + memalloc_noreclaim_restore(noreclaim_flag); + } + } + /* Handle the remaining sublist */ + __netif_receive_skb_list_core(head, pfmemalloc); + /* Restore pflags */ + if (pfmemalloc) + memalloc_noreclaim_restore(noreclaim_flag); +} + static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) { struct bpf_prog *old = rtnl_dereference(dev->xdp_prog); @@ -4843,14 +4879,6 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) return ret; } -static void __netif_receive_skb_list(struct list_head *head) -{ - struct sk_buff *skb, *next; - - list_for_each_entry_safe(skb, next, head, list) - __netif_receive_skb(skb); -} - static int netif_receive_skb_internal(struct sk_buff *skb) { int ret; -- cgit v1.2.3-59-g8ed1b From 17266ee939849cb095ed7dd9edbec4162172226b Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Mon, 2 Jul 2018 16:14:12 +0100 Subject: net: ipv4: listified version of ip_rcv Also involved adding a way to run a netfilter hook over a list of packets. Rather than attempting to make netfilter know about lists (which would be a major project in itself) we just let it call the regular okfn (in this case ip_rcv_finish()) for any packets it steals, and have it give us back a list of packets it's synchronously accepted (which normally NF_HOOK would automatically call okfn() on, but we want to be able to potentially pass the list to a listified version of okfn().) The netfilter hooks themselves are indirect calls that still happen per- packet (see nf_hook_entry_hookfn()), but again, changing that can be left for future work. There is potential for out-of-order receives if the netfilter hook ends up synchronously stealing packets, as they will be processed before any accepts earlier in the list. However, it was already possible for an asynchronous accept to cause out-of-order receives, so presumably this is considered OK. Signed-off-by: Edward Cree Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 +++ include/linux/netfilter.h | 22 +++++++++++++++ include/net/ip.h | 2 ++ net/core/dev.c | 8 +++--- net/ipv4/af_inet.c | 1 + net/ipv4/ip_input.c | 68 ++++++++++++++++++++++++++++++++++++++++++----- 6 files changed, 94 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f67258f057ca..c1ef749b6f9f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2297,6 +2297,9 @@ struct packet_type { struct net_device *, struct packet_type *, struct net_device *); + void (*list_func) (struct list_head *, + struct packet_type *, + struct net_device *); bool (*id_match)(struct packet_type *ptype, struct sock *sk); void *af_packet_priv; diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index dd2052f0efb7..5a5e0a2ab2a3 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -288,6 +288,20 @@ NF_HOOK(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct return ret; } +static inline void +NF_HOOK_LIST(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, + struct list_head *head, struct net_device *in, struct net_device *out, + int (*okfn)(struct net *, struct sock *, struct sk_buff *)) +{ + struct sk_buff *skb, *next; + + list_for_each_entry_safe(skb, next, head, list) { + int ret = nf_hook(pf, hook, net, sk, skb, in, out, okfn); + if (ret != 1) + list_del(&skb->list); + } +} + /* Call setsockopt() */ int nf_setsockopt(struct sock *sk, u_int8_t pf, int optval, char __user *opt, unsigned int len); @@ -369,6 +383,14 @@ NF_HOOK(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, return okfn(net, sk, skb); } +static inline void +NF_HOOK_LIST(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, + struct list_head *head, struct net_device *in, struct net_device *out, + int (*okfn)(struct net *, struct sock *, struct sk_buff *)) +{ + /* nothing to do */ +} + static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct sk_buff *skb, struct net_device *indev, struct net_device *outdev, diff --git a/include/net/ip.h b/include/net/ip.h index 09da79d8ceea..99d1b835d2aa 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -138,6 +138,8 @@ int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk, struct ip_options_rcu *opt); int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); +void ip_list_rcv(struct list_head *head, struct packet_type *pt, + struct net_device *orig_dev); int ip_local_deliver(struct sk_buff *skb); int ip_mr_input(struct sk_buff *skb); int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb); diff --git a/net/core/dev.c b/net/core/dev.c index 1bc485bb0678..5e22719ce71d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4806,9 +4806,11 @@ static inline void __netif_receive_skb_list_ptype(struct list_head *head, return; if (list_empty(head)) return; - - list_for_each_entry_safe(skb, next, head, list) - pt_prev->func(skb, skb->dev, pt_prev, orig_dev); + if (pt_prev->list_func != NULL) + pt_prev->list_func(head, pt_prev, orig_dev); + else + list_for_each_entry_safe(skb, next, head, list) + pt_prev->func(skb, skb->dev, pt_prev, orig_dev); } static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc) diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 9263a2c114e0..c716be13d58c 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1882,6 +1882,7 @@ fs_initcall(ipv4_offload_init); static struct packet_type ip_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_IP), .func = ip_rcv, + .list_func = ip_list_rcv, }; static int __init inet_init(void) diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 7582713dd18f..914240830bdf 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -408,10 +408,9 @@ drop_error: /* * Main IP Receive routine. */ -int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) +static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net) { const struct iphdr *iph; - struct net *net; u32 len; /* When the interface is in promisc. mode, drop all the crap @@ -421,7 +420,6 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, goto drop; - net = dev_net(dev); __IP_UPD_PO_STATS(net, IPSTATS_MIB_IN, skb->len); skb = skb_share_check(skb, GFP_ATOMIC); @@ -489,9 +487,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, /* Must drop socket now because of tproxy. */ skb_orphan(skb); - return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, - net, NULL, skb, dev, NULL, - ip_rcv_finish); + return skb; csum_error: __IP_INC_STATS(net, IPSTATS_MIB_CSUMERRORS); @@ -500,5 +496,63 @@ inhdr_error: drop: kfree_skb(skb); out: - return NET_RX_DROP; + return NULL; +} + +/* + * IP receive entry point + */ +int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, + struct net_device *orig_dev) +{ + struct net *net = dev_net(dev); + + skb = ip_rcv_core(skb, net); + if (skb == NULL) + return NET_RX_DROP; + return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, + net, NULL, skb, dev, NULL, + ip_rcv_finish); +} + +static void ip_sublist_rcv(struct list_head *head, struct net_device *dev, + struct net *net) +{ + struct sk_buff *skb, *next; + + NF_HOOK_LIST(NFPROTO_IPV4, NF_INET_PRE_ROUTING, net, NULL, + head, dev, NULL, ip_rcv_finish); + list_for_each_entry_safe(skb, next, head, list) + ip_rcv_finish(net, NULL, skb); +} + +/* Receive a list of IP packets */ +void ip_list_rcv(struct list_head *head, struct packet_type *pt, + struct net_device *orig_dev) +{ + struct net_device *curr_dev = NULL; + struct net *curr_net = NULL; + struct sk_buff *skb, *next; + struct list_head sublist; + + list_for_each_entry_safe(skb, next, head, list) { + struct net_device *dev = skb->dev; + struct net *net = dev_net(dev); + + skb = ip_rcv_core(skb, net); + if (skb == NULL) + continue; + + if (curr_dev != dev || curr_net != net) { + /* dispatch old sublist */ + list_cut_before(&sublist, head, &skb->list); + if (!list_empty(&sublist)) + ip_sublist_rcv(&sublist, dev, net); + /* start new sublist */ + curr_dev = dev; + curr_net = net; + } + } + /* dispatch final sublist */ + ip_sublist_rcv(head, curr_dev, curr_net); } -- cgit v1.2.3-59-g8ed1b From 25db26a91364db00f5a30da2fea8e9afe14a163c Mon Sep 17 00:00:00 2001 From: Vinicius Costa Gomes Date: Tue, 3 Jul 2018 15:42:53 -0700 Subject: net/sched: Introduce the ETF Qdisc The ETF (Earliest TxTime First) qdisc uses the information added earlier in this series (the socket option SO_TXTIME and the new role of sk_buff->tstamp) to schedule packets transmission based on absolute time. For some workloads, just bandwidth enforcement is not enough, and precise control of the transmission of packets is necessary. Example: $ tc qdisc replace dev enp2s0 parent root handle 100 mqprio num_tc 3 \ map 2 2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 queues 1@0 1@1 2@2 hw 0 $ tc qdisc add dev enp2s0 parent 100:1 etf delta 100000 \ clockid CLOCK_TAI In this example, the Qdisc will provide SW best-effort for the control of the transmission time to the network adapter, the time stamp in the socket will be in reference to the clockid CLOCK_TAI and packets will leave the qdisc "delta" (100000) nanoseconds before its transmission time. The ETF qdisc will buffer packets sorted by their txtime. It will drop packets on enqueue() if their skbuff clockid does not match the clock reference of the Qdisc. Moreover, on dequeue(), a packet will be dropped if it expires while being enqueued. The qdisc also supports the SO_TXTIME deadline mode. For this mode, it will dequeue a packet as soon as possible and change the skb timestamp to 'now' during etf_dequeue(). Note that both the qdisc's and the SO_TXTIME ABIs allow for a clockid to be configured, but it's been decided that usage of CLOCK_TAI should be enforced until we decide to allow for other clockids to be used. The rationale here is that PTP times are usually in the TAI scale, thus no other clocks should be necessary. For now, the qdisc will return EINVAL if any clocks other than CLOCK_TAI are used. Signed-off-by: Jesus Sanchez-Palencia Signed-off-by: Vinicius Costa Gomes Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + include/uapi/linux/pkt_sched.h | 17 ++ net/sched/Kconfig | 11 ++ net/sched/Makefile | 1 + net/sched/sch_etf.c | 384 +++++++++++++++++++++++++++++++++++++++++ 5 files changed, 414 insertions(+) create mode 100644 net/sched/sch_etf.c (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c1ef749b6f9f..f06ee8f91e74 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -798,6 +798,7 @@ enum tc_setup_type { TC_SETUP_QDISC_RED, TC_SETUP_QDISC_PRIO, TC_SETUP_QDISC_MQ, + TC_SETUP_QDISC_ETF, }; /* These structures hold the attributes of bpf state that are being passed diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index bad3c03bcf43..d5e933ce1447 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -937,4 +937,21 @@ enum { #define TCA_CBS_MAX (__TCA_CBS_MAX - 1) + +/* ETF */ +struct tc_etf_qopt { + __s32 delta; + __s32 clockid; + __u32 flags; +#define TC_ETF_DEADLINE_MODE_ON BIT(0) +}; + +enum { + TCA_ETF_UNSPEC, + TCA_ETF_PARMS, + __TCA_ETF_MAX, +}; + +#define TCA_ETF_MAX (__TCA_ETF_MAX - 1) + #endif diff --git a/net/sched/Kconfig b/net/sched/Kconfig index a01169fb5325..fcc89706745b 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -183,6 +183,17 @@ config NET_SCH_CBS To compile this code as a module, choose M here: the module will be called sch_cbs. +config NET_SCH_ETF + tristate "Earliest TxTime First (ETF)" + help + Say Y here if you want to use the Earliest TxTime First (ETF) packet + scheduling algorithm. + + See the top of for more details. + + To compile this code as a module, choose M here: the + module will be called sch_etf. + config NET_SCH_GRED tristate "Generic Random Early Detection (GRED)" ---help--- diff --git a/net/sched/Makefile b/net/sched/Makefile index 8811d3804878..9a5a7077d217 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -54,6 +54,7 @@ obj-$(CONFIG_NET_SCH_FQ) += sch_fq.o obj-$(CONFIG_NET_SCH_HHF) += sch_hhf.o obj-$(CONFIG_NET_SCH_PIE) += sch_pie.o obj-$(CONFIG_NET_SCH_CBS) += sch_cbs.o +obj-$(CONFIG_NET_SCH_ETF) += sch_etf.o obj-$(CONFIG_NET_CLS_U32) += cls_u32.o obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o diff --git a/net/sched/sch_etf.c b/net/sched/sch_etf.c new file mode 100644 index 000000000000..4b7f4903ac17 --- /dev/null +++ b/net/sched/sch_etf.c @@ -0,0 +1,384 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* net/sched/sch_etf.c Earliest TxTime First queueing discipline. + * + * Authors: Jesus Sanchez-Palencia + * Vinicius Costa Gomes + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DEADLINE_MODE_IS_ON(x) ((x)->flags & TC_ETF_DEADLINE_MODE_ON) + +struct etf_sched_data { + bool deadline_mode; + int clockid; + int queue; + s32 delta; /* in ns */ + ktime_t last; /* The txtime of the last skb sent to the netdevice. */ + struct rb_root head; + struct qdisc_watchdog watchdog; + ktime_t (*get_time)(void); +}; + +static const struct nla_policy etf_policy[TCA_ETF_MAX + 1] = { + [TCA_ETF_PARMS] = { .len = sizeof(struct tc_etf_qopt) }, +}; + +static inline int validate_input_params(struct tc_etf_qopt *qopt, + struct netlink_ext_ack *extack) +{ + /* Check if params comply to the following rules: + * * Clockid and delta must be valid. + * + * * Dynamic clockids are not supported. + * + * * Delta must be a positive integer. + */ + if (qopt->clockid < 0) { + NL_SET_ERR_MSG(extack, "Dynamic clockids are not supported"); + return -ENOTSUPP; + } + + if (qopt->clockid != CLOCK_TAI) { + NL_SET_ERR_MSG(extack, "Invalid clockid. CLOCK_TAI must be used"); + return -EINVAL; + } + + if (qopt->delta < 0) { + NL_SET_ERR_MSG(extack, "Delta must be positive"); + return -EINVAL; + } + + return 0; +} + +static bool is_packet_valid(struct Qdisc *sch, struct sk_buff *nskb) +{ + struct etf_sched_data *q = qdisc_priv(sch); + ktime_t txtime = nskb->tstamp; + struct sock *sk = nskb->sk; + ktime_t now; + + if (!sk) + return false; + + if (!sock_flag(sk, SOCK_TXTIME)) + return false; + + /* We don't perform crosstimestamping. + * Drop if packet's clockid differs from qdisc's. + */ + if (sk->sk_clockid != q->clockid) + return false; + + if (sk->sk_txtime_deadline_mode != q->deadline_mode) + return false; + + now = q->get_time(); + if (ktime_before(txtime, now) || ktime_before(txtime, q->last)) + return false; + + return true; +} + +static struct sk_buff *etf_peek_timesortedlist(struct Qdisc *sch) +{ + struct etf_sched_data *q = qdisc_priv(sch); + struct rb_node *p; + + p = rb_first(&q->head); + if (!p) + return NULL; + + return rb_to_skb(p); +} + +static void reset_watchdog(struct Qdisc *sch) +{ + struct etf_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb = etf_peek_timesortedlist(sch); + ktime_t next; + + if (!skb) + return; + + next = ktime_sub_ns(skb->tstamp, q->delta); + qdisc_watchdog_schedule_ns(&q->watchdog, ktime_to_ns(next)); +} + +static int etf_enqueue_timesortedlist(struct sk_buff *nskb, struct Qdisc *sch, + struct sk_buff **to_free) +{ + struct etf_sched_data *q = qdisc_priv(sch); + struct rb_node **p = &q->head.rb_node, *parent = NULL; + ktime_t txtime = nskb->tstamp; + + if (!is_packet_valid(sch, nskb)) + return qdisc_drop(nskb, sch, to_free); + + while (*p) { + struct sk_buff *skb; + + parent = *p; + skb = rb_to_skb(parent); + if (ktime_after(txtime, skb->tstamp)) + p = &parent->rb_right; + else + p = &parent->rb_left; + } + rb_link_node(&nskb->rbnode, parent, p); + rb_insert_color(&nskb->rbnode, &q->head); + + qdisc_qstats_backlog_inc(sch, nskb); + sch->q.qlen++; + + /* Now we may need to re-arm the qdisc watchdog for the next packet. */ + reset_watchdog(sch); + + return NET_XMIT_SUCCESS; +} + +static void timesortedlist_erase(struct Qdisc *sch, struct sk_buff *skb, + bool drop) +{ + struct etf_sched_data *q = qdisc_priv(sch); + + rb_erase(&skb->rbnode, &q->head); + + /* The rbnode field in the skb re-uses these fields, now that + * we are done with the rbnode, reset them. + */ + skb->next = NULL; + skb->prev = NULL; + skb->dev = qdisc_dev(sch); + + qdisc_qstats_backlog_dec(sch, skb); + + if (drop) { + struct sk_buff *to_free = NULL; + + qdisc_drop(skb, sch, &to_free); + kfree_skb_list(to_free); + qdisc_qstats_overlimit(sch); + } else { + qdisc_bstats_update(sch, skb); + + q->last = skb->tstamp; + } + + sch->q.qlen--; +} + +static struct sk_buff *etf_dequeue_timesortedlist(struct Qdisc *sch) +{ + struct etf_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb; + ktime_t now, next; + + skb = etf_peek_timesortedlist(sch); + if (!skb) + return NULL; + + now = q->get_time(); + + /* Drop if packet has expired while in queue. */ + /* FIXME: Must return error on the socket's error queue */ + if (ktime_before(skb->tstamp, now)) { + timesortedlist_erase(sch, skb, true); + skb = NULL; + goto out; + } + + /* When in deadline mode, dequeue as soon as possible and change the + * txtime from deadline to (now + delta). + */ + if (q->deadline_mode) { + timesortedlist_erase(sch, skb, false); + skb->tstamp = now; + goto out; + } + + next = ktime_sub_ns(skb->tstamp, q->delta); + + /* Dequeue only if now is within the [txtime - delta, txtime] range. */ + if (ktime_after(now, next)) + timesortedlist_erase(sch, skb, false); + else + skb = NULL; + +out: + /* Now we may need to re-arm the qdisc watchdog for the next packet. */ + reset_watchdog(sch); + + return skb; +} + +static int etf_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct etf_sched_data *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + struct nlattr *tb[TCA_ETF_MAX + 1]; + struct tc_etf_qopt *qopt; + int err; + + if (!opt) { + NL_SET_ERR_MSG(extack, + "Missing ETF qdisc options which are mandatory"); + return -EINVAL; + } + + err = nla_parse_nested(tb, TCA_ETF_MAX, opt, etf_policy, extack); + if (err < 0) + return err; + + if (!tb[TCA_ETF_PARMS]) { + NL_SET_ERR_MSG(extack, "Missing mandatory ETF parameters"); + return -EINVAL; + } + + qopt = nla_data(tb[TCA_ETF_PARMS]); + + pr_debug("delta %d clockid %d deadline %s\n", + qopt->delta, qopt->clockid, + DEADLINE_MODE_IS_ON(qopt) ? "on" : "off"); + + err = validate_input_params(qopt, extack); + if (err < 0) + return err; + + q->queue = sch->dev_queue - netdev_get_tx_queue(dev, 0); + + /* Everything went OK, save the parameters used. */ + q->delta = qopt->delta; + q->clockid = qopt->clockid; + q->deadline_mode = DEADLINE_MODE_IS_ON(qopt); + + switch (q->clockid) { + case CLOCK_REALTIME: + q->get_time = ktime_get_real; + break; + case CLOCK_MONOTONIC: + q->get_time = ktime_get; + break; + case CLOCK_BOOTTIME: + q->get_time = ktime_get_boottime; + break; + case CLOCK_TAI: + q->get_time = ktime_get_clocktai; + break; + default: + NL_SET_ERR_MSG(extack, "Clockid is not supported"); + return -ENOTSUPP; + } + + qdisc_watchdog_init_clockid(&q->watchdog, sch, q->clockid); + + return 0; +} + +static void timesortedlist_clear(struct Qdisc *sch) +{ + struct etf_sched_data *q = qdisc_priv(sch); + struct rb_node *p = rb_first(&q->head); + + while (p) { + struct sk_buff *skb = rb_to_skb(p); + + p = rb_next(p); + + rb_erase(&skb->rbnode, &q->head); + rtnl_kfree_skbs(skb, skb); + sch->q.qlen--; + } +} + +static void etf_reset(struct Qdisc *sch) +{ + struct etf_sched_data *q = qdisc_priv(sch); + + /* Only cancel watchdog if it's been initialized. */ + if (q->watchdog.qdisc == sch) + qdisc_watchdog_cancel(&q->watchdog); + + /* No matter which mode we are on, it's safe to clear both lists. */ + timesortedlist_clear(sch); + __qdisc_reset_queue(&sch->q); + + sch->qstats.backlog = 0; + sch->q.qlen = 0; + + q->last = 0; +} + +static void etf_destroy(struct Qdisc *sch) +{ + struct etf_sched_data *q = qdisc_priv(sch); + + /* Only cancel watchdog if it's been initialized. */ + if (q->watchdog.qdisc == sch) + qdisc_watchdog_cancel(&q->watchdog); +} + +static int etf_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct etf_sched_data *q = qdisc_priv(sch); + struct tc_etf_qopt opt = { }; + struct nlattr *nest; + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (!nest) + goto nla_put_failure; + + opt.delta = q->delta; + opt.clockid = q->clockid; + if (q->deadline_mode) + opt.flags |= TC_ETF_DEADLINE_MODE_ON; + + if (nla_put(skb, TCA_ETF_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; + + return nla_nest_end(skb, nest); + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -1; +} + +static struct Qdisc_ops etf_qdisc_ops __read_mostly = { + .id = "etf", + .priv_size = sizeof(struct etf_sched_data), + .enqueue = etf_enqueue_timesortedlist, + .dequeue = etf_dequeue_timesortedlist, + .peek = etf_peek_timesortedlist, + .init = etf_init, + .reset = etf_reset, + .destroy = etf_destroy, + .dump = etf_dump, + .owner = THIS_MODULE, +}; + +static int __init etf_module_init(void) +{ + return register_qdisc(&etf_qdisc_ops); +} + +static void __exit etf_module_exit(void) +{ + unregister_qdisc(&etf_qdisc_ops); +} +module_init(etf_module_init) +module_exit(etf_module_exit) +MODULE_LICENSE("GPL"); -- cgit v1.2.3-59-g8ed1b From 4d4fb5dc988a36307711be292bde6e39b8bdbceb Mon Sep 17 00:00:00 2001 From: Yonatan Cohen Date: Tue, 19 Jun 2018 08:47:22 +0300 Subject: net/mlx5: Limit scope of dump_fill_mkey function mlx5_core_dump_fill_mkey() is going to be used in next patch in IB and doesn't need to be visible to whole mlx5_core. Move that command to mlx5_ib. Signed-off-by: Yonatan Cohen Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/cmd.c | 15 +++++++++++++++ drivers/infiniband/hw/mlx5/cmd.h | 1 + drivers/net/ethernet/mellanox/mlx5/core/mr.c | 17 ----------------- include/linux/mlx5/driver.h | 2 -- 4 files changed, 16 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx5/cmd.c b/drivers/infiniband/hw/mlx5/cmd.c index 188512bf46e6..ccc0b5d06a7d 100644 --- a/drivers/infiniband/hw/mlx5/cmd.c +++ b/drivers/infiniband/hw/mlx5/cmd.c @@ -32,6 +32,21 @@ #include "cmd.h" +int mlx5_cmd_dump_fill_mkey(struct mlx5_core_dev *dev, u32 *mkey) +{ + u32 out[MLX5_ST_SZ_DW(query_special_contexts_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(query_special_contexts_in)] = {0}; + int err; + + MLX5_SET(query_special_contexts_in, in, opcode, + MLX5_CMD_OP_QUERY_SPECIAL_CONTEXTS); + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + if (!err) + *mkey = MLX5_GET(query_special_contexts_out, out, + dump_fill_mkey); + return err; +} + int mlx5_cmd_null_mkey(struct mlx5_core_dev *dev, u32 *null_mkey) { u32 out[MLX5_ST_SZ_DW(query_special_contexts_out)] = {}; diff --git a/drivers/infiniband/hw/mlx5/cmd.h b/drivers/infiniband/hw/mlx5/cmd.h index e7206c8a8011..98ea4648c655 100644 --- a/drivers/infiniband/hw/mlx5/cmd.h +++ b/drivers/infiniband/hw/mlx5/cmd.h @@ -37,6 +37,7 @@ #include #include +int mlx5_cmd_dump_fill_mkey(struct mlx5_core_dev *dev, u32 *mkey); int mlx5_cmd_null_mkey(struct mlx5_core_dev *dev, u32 *null_mkey); int mlx5_cmd_query_cong_params(struct mlx5_core_dev *dev, int cong_point, void *out, int out_size); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mr.c b/drivers/net/ethernet/mellanox/mlx5/core/mr.c index f4f02f775c93..0670165afd5f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mr.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/mr.c @@ -146,23 +146,6 @@ int mlx5_core_query_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mkey *mkey, } EXPORT_SYMBOL(mlx5_core_query_mkey); -int mlx5_core_dump_fill_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mkey *_mkey, - u32 *mkey) -{ - u32 out[MLX5_ST_SZ_DW(query_special_contexts_out)] = {0}; - u32 in[MLX5_ST_SZ_DW(query_special_contexts_in)] = {0}; - int err; - - MLX5_SET(query_special_contexts_in, in, opcode, - MLX5_CMD_OP_QUERY_SPECIAL_CONTEXTS); - err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); - if (!err) - *mkey = MLX5_GET(query_special_contexts_out, out, - dump_fill_mkey); - return err; -} -EXPORT_SYMBOL(mlx5_core_dump_fill_mkey); - static inline u32 mlx5_get_psv(u32 *out, int psv_index) { switch (psv_index) { diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 80cbb7fdce4a..1cb1c0317b77 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1067,8 +1067,6 @@ int mlx5_core_destroy_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mkey *mkey); int mlx5_core_query_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mkey *mkey, u32 *out, int outlen); -int mlx5_core_dump_fill_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mkey *_mkey, - u32 *mkey); int mlx5_core_alloc_pd(struct mlx5_core_dev *dev, u32 *pdn); int mlx5_core_dealloc_pd(struct mlx5_core_dev *dev, u32 pdn); int mlx5_core_mad_ifc(struct mlx5_core_dev *dev, const void *inb, void *outb, -- cgit v1.2.3-59-g8ed1b From b183ee27f5fb07c8428e2fe45d5f35dac611c45d Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 19 Jun 2018 08:47:23 +0300 Subject: net/mlx5: Add hardware definitions for dump_fill_mkey MLX5 IB HCA offers the memory key, dump_fill_mkey to boost performance by forcing local HCA operations to skip the PCI bus access, This patch adds needed hardware definitions. Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index deb3a459a22e..1853e7fd6924 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -885,7 +885,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 log_max_eq_sz[0x8]; u8 reserved_at_e8[0x2]; u8 log_max_mkey[0x6]; - u8 reserved_at_f0[0xc]; + u8 reserved_at_f0[0x8]; + u8 dump_fill_mkey[0x1]; + u8 reserved_at_f9[0x3]; u8 log_max_eq[0x4]; u8 max_indirection[0x8]; -- cgit v1.2.3-59-g8ed1b From 6312fe77751f57d4fa2b28abeef84c6a95c28136 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Thu, 5 Jul 2018 14:34:32 +0800 Subject: net: limit each hash list length to MAX_GRO_SKBS After commit 07d78363dcff ("net: Convert NAPI gro list into a small hash table.")' there is 8 hash buckets, which allows more flows to be held for merging. but MAX_GRO_SKBS, the total held skb for merging, is 8 skb still, limit the hash table performance. keep MAX_GRO_SKBS as 8 skb, but limit each hash list length to 8 skb, not the total 8 skb Signed-off-by: Li RongQing Signed-off-by: David S. Miller --- include/linux/netdevice.h | 7 +++++- net/core/dev.c | 56 +++++++++++++++++++---------------------------- 2 files changed, 29 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f06ee8f91e74..b683971e500d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -302,6 +302,11 @@ struct netdev_boot_setup { int __init netdev_boot_setup(char *str); +struct gro_list { + struct list_head list; + int count; +}; + /* * Structure for NAPI scheduling similar to tasklet but with weighting */ @@ -323,7 +328,7 @@ struct napi_struct { int poll_owner; #endif struct net_device *dev; - struct list_head gro_hash[GRO_HASH_BUCKETS]; + struct gro_list gro_hash[GRO_HASH_BUCKETS]; struct sk_buff *skb; struct hrtimer timer; struct list_head dev_list; diff --git a/net/core/dev.c b/net/core/dev.c index 7e6a2f66db5c..89825c1eccdc 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -149,7 +149,6 @@ #include "net-sysfs.h" -/* Instead of increasing this, you should create a hash table. */ #define MAX_GRO_SKBS 8 /* This should be increased if a protocol with a bigger head is added. */ @@ -5151,9 +5150,10 @@ out: return netif_receive_skb_internal(skb); } -static void __napi_gro_flush_chain(struct napi_struct *napi, struct list_head *head, +static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index, bool flush_old) { + struct list_head *head = &napi->gro_hash[index].list; struct sk_buff *skb, *p; list_for_each_entry_safe_reverse(skb, p, head, list) { @@ -5162,22 +5162,20 @@ static void __napi_gro_flush_chain(struct napi_struct *napi, struct list_head *h list_del_init(&skb->list); napi_gro_complete(skb); napi->gro_count--; + napi->gro_hash[index].count--; } } -/* napi->gro_hash contains packets ordered by age. +/* napi->gro_hash[].list contains packets ordered by age. * youngest packets at the head of it. * Complete skbs in reverse order to reduce latencies. */ void napi_gro_flush(struct napi_struct *napi, bool flush_old) { - int i; - - for (i = 0; i < GRO_HASH_BUCKETS; i++) { - struct list_head *head = &napi->gro_hash[i]; + u32 i; - __napi_gro_flush_chain(napi, head, flush_old); - } + for (i = 0; i < GRO_HASH_BUCKETS; i++) + __napi_gro_flush_chain(napi, i, flush_old); } EXPORT_SYMBOL(napi_gro_flush); @@ -5189,7 +5187,7 @@ static struct list_head *gro_list_prepare(struct napi_struct *napi, struct list_head *head; struct sk_buff *p; - head = &napi->gro_hash[hash & (GRO_HASH_BUCKETS - 1)]; + head = &napi->gro_hash[hash & (GRO_HASH_BUCKETS - 1)].list; list_for_each_entry(p, head, list) { unsigned long diffs; @@ -5257,27 +5255,13 @@ static void gro_pull_from_frag0(struct sk_buff *skb, int grow) } } -static void gro_flush_oldest(struct napi_struct *napi) +static void gro_flush_oldest(struct list_head *head) { - struct sk_buff *oldest = NULL; - unsigned long age = jiffies; - int i; - - for (i = 0; i < GRO_HASH_BUCKETS; i++) { - struct list_head *head = &napi->gro_hash[i]; - struct sk_buff *skb; - - if (list_empty(head)) - continue; + struct sk_buff *oldest; - skb = list_last_entry(head, struct sk_buff, list); - if (!oldest || time_before(NAPI_GRO_CB(skb)->age, age)) { - oldest = skb; - age = NAPI_GRO_CB(skb)->age; - } - } + oldest = list_last_entry(head, struct sk_buff, list); - /* We are called with napi->gro_count >= MAX_GRO_SKBS, so this is + /* We are called with head length >= MAX_GRO_SKBS, so this is * impossible. */ if (WARN_ON_ONCE(!oldest)) @@ -5292,6 +5276,7 @@ static void gro_flush_oldest(struct napi_struct *napi) static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { + u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1); struct list_head *head = &offload_base; struct packet_offload *ptype; __be16 type = skb->protocol; @@ -5358,6 +5343,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff list_del_init(&pp->list); napi_gro_complete(pp); napi->gro_count--; + napi->gro_hash[hash].count--; } if (same_flow) @@ -5366,10 +5352,11 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff if (NAPI_GRO_CB(skb)->flush) goto normal; - if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) { - gro_flush_oldest(napi); + if (unlikely(napi->gro_hash[hash].count >= MAX_GRO_SKBS)) { + gro_flush_oldest(gro_head); } else { napi->gro_count++; + napi->gro_hash[hash].count++; } NAPI_GRO_CB(skb)->count = 1; NAPI_GRO_CB(skb)->age = jiffies; @@ -6006,8 +5993,10 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi, hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); napi->timer.function = napi_watchdog; napi->gro_count = 0; - for (i = 0; i < GRO_HASH_BUCKETS; i++) - INIT_LIST_HEAD(&napi->gro_hash[i]); + for (i = 0; i < GRO_HASH_BUCKETS; i++) { + INIT_LIST_HEAD(&napi->gro_hash[i].list); + napi->gro_hash[i].count = 0; + } napi->skb = NULL; napi->poll = poll; if (weight > NAPI_POLL_WEIGHT) @@ -6047,8 +6036,9 @@ static void flush_gro_hash(struct napi_struct *napi) for (i = 0; i < GRO_HASH_BUCKETS; i++) { struct sk_buff *skb, *n; - list_for_each_entry_safe(skb, n, &napi->gro_hash[i], list) + list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list) kfree_skb(skb); + napi->gro_hash[i].count = 0; } } -- cgit v1.2.3-59-g8ed1b From 1b0707a781eeaeeaacb464e18bb3f049b99b496b Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Thu, 7 Jun 2018 19:50:38 +0000 Subject: Bluetooth: remove unused bt-nokia-h4p.h header Nothing in tree use this header which seems a remains of a staging driver. This patch remove it. Signed-off-by: Corentin Labbe Reviewed-by: Sebastian Reichel Signed-off-by: Marcel Holtmann --- include/linux/platform_data/bt-nokia-h4p.h | 38 ------------------------------ 1 file changed, 38 deletions(-) delete mode 100644 include/linux/platform_data/bt-nokia-h4p.h (limited to 'include/linux') diff --git a/include/linux/platform_data/bt-nokia-h4p.h b/include/linux/platform_data/bt-nokia-h4p.h deleted file mode 100644 index 30d169dfadf3..000000000000 --- a/include/linux/platform_data/bt-nokia-h4p.h +++ /dev/null @@ -1,38 +0,0 @@ -/* - * This file is part of Nokia H4P bluetooth driver - * - * Copyright (C) 2010 Nokia Corporation. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * version 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA - * 02110-1301 USA - * - */ - - -/** - * struct hci_h4p_platform data - hci_h4p Platform data structure - */ -struct hci_h4p_platform_data { - int chip_type; - int bt_sysclk; - unsigned int bt_wakeup_gpio; - unsigned int host_wakeup_gpio; - unsigned int reset_gpio; - int reset_gpio_shared; - unsigned int uart_irq; - phys_addr_t uart_base; - const char *uart_iclk; - const char *uart_fclk; - void (*set_pm_limits)(struct device *dev, bool set); -}; -- cgit v1.2.3-59-g8ed1b From 06ae48269d1e0324d806fca30fe77112f4a4a14a Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Fri, 6 Jul 2018 15:13:18 -0700 Subject: lib: reciprocal_div: implement the improved algorithm on the paper mentioned The new added "reciprocal_value_adv" implements the advanced version of the algorithm described in Figure 4.2 of the paper except when "divisor > (1U << 31)" whose ceil(log2(d)) result will be 32 which then requires u128 divide on host. The exception case could be easily handled before calling "reciprocal_value_adv". The advanced version requires more complex calculation to get the reciprocal multiplier and other control variables, but then could reduce the required emulation operations. It makes no sense to use this advanced version for host divide emulation, those extra complexities for calculating multiplier etc could completely waive our saving on emulation operations. However, it makes sense to use it for JIT divide code generation (for example eBPF JIT backends) for which we are willing to trade performance of JITed code with that of host. As shown by the following pseudo code, the required emulation operations could go down from 6 (the basic version) to 3 or 4. To use the result of "reciprocal_value_adv", suppose we want to calculate n/d, the C-style pseudo code will be the following, it could be easily changed to real code generation for other JIT targets. struct reciprocal_value_adv rvalue; u8 pre_shift, exp; // handle exception case. if (d >= (1U << 31)) { result = n >= d; return; } rvalue = reciprocal_value_adv(d, 32) exp = rvalue.exp; if (rvalue.is_wide_m && !(d & 1)) { // floor(log2(d & (2^32 -d))) pre_shift = fls(d & -d) - 1; rvalue = reciprocal_value_adv(d >> pre_shift, 32 - pre_shift); } else { pre_shift = 0; } // code generation starts. if (imm == 1U << exp) { result = n >> exp; } else if (rvalue.is_wide_m) { // pre_shift must be zero when reached here. t = (n * rvalue.m) >> 32; result = n - t; result >>= 1; result += t; result >>= rvalue.sh - 1; } else { if (pre_shift) result = n >> pre_shift; result = ((u64)result * rvalue.m) >> 32; result >>= rvalue.sh; } Signed-off-by: Jiong Wang Reviewed-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- include/linux/reciprocal_div.h | 68 ++++++++++++++++++++++++++++++++++++++++++ lib/reciprocal_div.c | 41 +++++++++++++++++++++++++ 2 files changed, 109 insertions(+) (limited to 'include/linux') diff --git a/include/linux/reciprocal_div.h b/include/linux/reciprocal_div.h index e031e9f2f9d8..585ce89c0f33 100644 --- a/include/linux/reciprocal_div.h +++ b/include/linux/reciprocal_div.h @@ -25,6 +25,9 @@ struct reciprocal_value { u8 sh1, sh2; }; +/* "reciprocal_value" and "reciprocal_divide" together implement the basic + * version of the algorithm described in Figure 4.1 of the paper. + */ struct reciprocal_value reciprocal_value(u32 d); static inline u32 reciprocal_divide(u32 a, struct reciprocal_value R) @@ -33,4 +36,69 @@ static inline u32 reciprocal_divide(u32 a, struct reciprocal_value R) return (t + ((a - t) >> R.sh1)) >> R.sh2; } +struct reciprocal_value_adv { + u32 m; + u8 sh, exp; + bool is_wide_m; +}; + +/* "reciprocal_value_adv" implements the advanced version of the algorithm + * described in Figure 4.2 of the paper except when "divisor > (1U << 31)" whose + * ceil(log2(d)) result will be 32 which then requires u128 divide on host. The + * exception case could be easily handled before calling "reciprocal_value_adv". + * + * The advanced version requires more complex calculation to get the reciprocal + * multiplier and other control variables, but then could reduce the required + * emulation operations. + * + * It makes no sense to use this advanced version for host divide emulation, + * those extra complexities for calculating multiplier etc could completely + * waive our saving on emulation operations. + * + * However, it makes sense to use it for JIT divide code generation for which + * we are willing to trade performance of JITed code with that of host. As shown + * by the following pseudo code, the required emulation operations could go down + * from 6 (the basic version) to 3 or 4. + * + * To use the result of "reciprocal_value_adv", suppose we want to calculate + * n/d, the pseudo C code will be: + * + * struct reciprocal_value_adv rvalue; + * u8 pre_shift, exp; + * + * // handle exception case. + * if (d >= (1U << 31)) { + * result = n >= d; + * return; + * } + * + * rvalue = reciprocal_value_adv(d, 32) + * exp = rvalue.exp; + * if (rvalue.is_wide_m && !(d & 1)) { + * // floor(log2(d & (2^32 -d))) + * pre_shift = fls(d & -d) - 1; + * rvalue = reciprocal_value_adv(d >> pre_shift, 32 - pre_shift); + * } else { + * pre_shift = 0; + * } + * + * // code generation starts. + * if (imm == 1U << exp) { + * result = n >> exp; + * } else if (rvalue.is_wide_m) { + * // pre_shift must be zero when reached here. + * t = (n * rvalue.m) >> 32; + * result = n - t; + * result >>= 1; + * result += t; + * result >>= rvalue.sh - 1; + * } else { + * if (pre_shift) + * result = n >> pre_shift; + * result = ((u64)result * rvalue.m) >> 32; + * result >>= rvalue.sh; + * } + */ +struct reciprocal_value_adv reciprocal_value_adv(u32 d, u8 prec); + #endif /* _LINUX_RECIPROCAL_DIV_H */ diff --git a/lib/reciprocal_div.c b/lib/reciprocal_div.c index fcb4ce682c6f..bf043258fa00 100644 --- a/lib/reciprocal_div.c +++ b/lib/reciprocal_div.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#include #include #include #include @@ -26,3 +27,43 @@ struct reciprocal_value reciprocal_value(u32 d) return R; } EXPORT_SYMBOL(reciprocal_value); + +struct reciprocal_value_adv reciprocal_value_adv(u32 d, u8 prec) +{ + struct reciprocal_value_adv R; + u32 l, post_shift; + u64 mhigh, mlow; + + /* ceil(log2(d)) */ + l = fls(d - 1); + /* NOTE: mlow/mhigh could overflow u64 when l == 32. This case needs to + * be handled before calling "reciprocal_value_adv", please see the + * comment at include/linux/reciprocal_div.h. + */ + WARN(l == 32, + "ceil(log2(0x%08x)) == 32, %s doesn't support such divisor", + d, __func__); + post_shift = l; + mlow = 1ULL << (32 + l); + do_div(mlow, d); + mhigh = (1ULL << (32 + l)) + (1ULL << (32 + l - prec)); + do_div(mhigh, d); + + for (; post_shift > 0; post_shift--) { + u64 lo = mlow >> 1, hi = mhigh >> 1; + + if (lo >= hi) + break; + + mlow = lo; + mhigh = hi; + } + + R.m = (u32)mhigh; + R.sh = post_shift; + R.exp = l; + R.is_wide_m = mhigh > U32_MAX; + + return R; +} +EXPORT_SYMBOL(reciprocal_value_adv); -- cgit v1.2.3-59-g8ed1b From b233504033dbd65740e59681820ccfd0a2a8ec53 Mon Sep 17 00:00:00 2001 From: Yifeng Sun Date: Mon, 2 Jul 2018 08:18:03 -0700 Subject: openvswitch: kernel datapath clone action Add 'clone' action to kernel datapath by using existing functions. When actions within clone don't modify the current flow, the flow key is not cloned before executing clone actions. This is a follow up patch for this incomplete work: https://patchwork.ozlabs.org/patch/722096/ v1 -> v2: Refactor as advised by reviewer. Signed-off-by: Yifeng Sun Signed-off-by: Andy Zhou Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/linux/openvswitch.h | 5 +++ include/uapi/linux/openvswitch.h | 3 ++ net/openvswitch/actions.c | 33 ++++++++++++++++++ net/openvswitch/flow_netlink.c | 73 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 114 insertions(+) (limited to 'include/linux') diff --git a/include/linux/openvswitch.h b/include/linux/openvswitch.h index e6b240b6196c..379affc63e24 100644 --- a/include/linux/openvswitch.h +++ b/include/linux/openvswitch.h @@ -21,4 +21,9 @@ #include +#define OVS_CLONE_ATTR_EXEC 0 /* Specify an u32 value. When nonzero, + * actions in clone will not change flow + * keys. False otherwise. + */ + #endif /* _LINUX_OPENVSWITCH_H */ diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 863aabaa5cc9..dbe0cbe4f1b7 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -840,6 +840,8 @@ struct ovs_action_push_eth { * @OVS_ACTION_ATTR_POP_NSH: pop the outermost NSH header off the packet. * @OVS_ACTION_ATTR_METER: Run packet through a meter, which may drop the * packet, or modify the packet (e.g., change the DSCP field). + * @OVS_ACTION_ATTR_CLONE: make a copy of the packet and execute a list of + * actions without affecting the original packet and key. * * Only a single header can be set with a single %OVS_ACTION_ATTR_SET. Not all * fields within a header are modifiable, e.g. the IPv4 protocol and fragment @@ -873,6 +875,7 @@ enum ovs_action_attr { OVS_ACTION_ATTR_PUSH_NSH, /* Nested OVS_NSH_KEY_ATTR_*. */ OVS_ACTION_ATTR_POP_NSH, /* No argument. */ OVS_ACTION_ATTR_METER, /* u32 meter ID. */ + OVS_ACTION_ATTR_CLONE, /* Nested OVS_CLONE_ATTR_*. */ __OVS_ACTION_ATTR_MAX, /* Nothing past this will be accepted * from userspace. */ diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 30a5df27116e..85ae53d8fd09 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -1057,6 +1057,28 @@ static int sample(struct datapath *dp, struct sk_buff *skb, clone_flow_key); } +/* When 'last' is true, clone() should always consume the 'skb'. + * Otherwise, clone() should keep 'skb' intact regardless what + * actions are executed within clone(). + */ +static int clone(struct datapath *dp, struct sk_buff *skb, + struct sw_flow_key *key, const struct nlattr *attr, + bool last) +{ + struct nlattr *actions; + struct nlattr *clone_arg; + int rem = nla_len(attr); + bool dont_clone_flow_key; + + /* The first action is always 'OVS_CLONE_ATTR_ARG'. */ + clone_arg = nla_data(attr); + dont_clone_flow_key = nla_get_u32(clone_arg); + actions = nla_next(clone_arg, &rem); + + return clone_execute(dp, skb, key, 0, actions, rem, last, + !dont_clone_flow_key); +} + static void execute_hash(struct sk_buff *skb, struct sw_flow_key *key, const struct nlattr *attr) { @@ -1336,6 +1358,17 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, consume_skb(skb); return 0; } + break; + + case OVS_ACTION_ATTR_CLONE: { + bool last = nla_is_last(a, rem); + + err = clone(dp, skb, key, a, last); + if (last) + return err; + + break; + } } if (unlikely(err)) { diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 391c4073a6dc..a70097ecf33c 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -2460,6 +2460,40 @@ static int validate_and_copy_sample(struct net *net, const struct nlattr *attr, return 0; } +static int validate_and_copy_clone(struct net *net, + const struct nlattr *attr, + const struct sw_flow_key *key, + struct sw_flow_actions **sfa, + __be16 eth_type, __be16 vlan_tci, + bool log, bool last) +{ + int start, err; + u32 exec; + + if (nla_len(attr) && nla_len(attr) < NLA_HDRLEN) + return -EINVAL; + + start = add_nested_action_start(sfa, OVS_ACTION_ATTR_CLONE, log); + if (start < 0) + return start; + + exec = last || !actions_may_change_flow(attr); + + err = ovs_nla_add_action(sfa, OVS_CLONE_ATTR_EXEC, &exec, + sizeof(exec), log); + if (err) + return err; + + err = __ovs_nla_copy_actions(net, attr, key, sfa, + eth_type, vlan_tci, log); + if (err) + return err; + + add_nested_action_end(*sfa, start); + + return 0; +} + void ovs_match_init(struct sw_flow_match *match, struct sw_flow_key *key, bool reset_key, @@ -2849,6 +2883,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, [OVS_ACTION_ATTR_PUSH_NSH] = (u32)-1, [OVS_ACTION_ATTR_POP_NSH] = 0, [OVS_ACTION_ATTR_METER] = sizeof(u32), + [OVS_ACTION_ATTR_CLONE] = (u32)-1, }; const struct ovs_action_push_vlan *vlan; int type = nla_type(a); @@ -3038,6 +3073,18 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, /* Non-existent meters are simply ignored. */ break; + case OVS_ACTION_ATTR_CLONE: { + bool last = nla_is_last(a, rem); + + err = validate_and_copy_clone(net, a, key, sfa, + eth_type, vlan_tci, + log, last); + if (err) + return err; + skip_copy = true; + break; + } + default: OVS_NLERR(log, "Unknown Action type %d", type); return -EINVAL; @@ -3116,6 +3163,26 @@ out: return err; } +static int clone_action_to_attr(const struct nlattr *attr, + struct sk_buff *skb) +{ + struct nlattr *start; + int err = 0, rem = nla_len(attr); + + start = nla_nest_start(skb, OVS_ACTION_ATTR_CLONE); + if (!start) + return -EMSGSIZE; + + err = ovs_nla_put_actions(nla_data(attr), rem, skb); + + if (err) + nla_nest_cancel(skb, start); + else + nla_nest_end(skb, start); + + return err; +} + static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb) { const struct nlattr *ovs_key = nla_data(a); @@ -3204,6 +3271,12 @@ int ovs_nla_put_actions(const struct nlattr *attr, int len, struct sk_buff *skb) return err; break; + case OVS_ACTION_ATTR_CLONE: + err = clone_action_to_attr(a, skb); + if (err) + return err; + break; + default: if (nla_put(skb, type, nla_len(a), nla_data(a))) return -EMSGSIZE; -- cgit v1.2.3-59-g8ed1b From ffcfe25bb50f27395e15fa999f1a7eb769f55360 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 9 Jul 2018 12:19:38 -0400 Subject: net: Add support for subordinate device traffic classes This patch is meant to provide the basic tools needed to allow us to create subordinate device traffic classes. The general idea here is to allow subdividing the queues of a device into queue groups accessible through an upper device such as a macvlan. The idea here is to enforce the idea that an upper device has to be a single queue device, ideally with IFF_NO_QUQUE set. With that being the case we can pretty much guarantee that the tc_to_txq mappings and XPS maps for the upper device are unused. As such we could reuse those in order to support subdividing the lower device and distributing those queues between the subordinate devices. In order to distinguish between a regular set of traffic classes and if a device is carrying subordinate traffic classes I changed num_tc from a u8 to a s16 value and use the negative values to represent the subordinate pool values. So starting at -1 and running to -32768 we can encode those as pool values, and the existing values of 0 to 15 can be maintained. Signed-off-by: Alexander Duyck Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- include/linux/netdevice.h | 16 ++++++++- net/core/dev.c | 89 +++++++++++++++++++++++++++++++++++++++++++++++ net/core/net-sysfs.c | 21 ++++++++++- 3 files changed, 124 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b683971e500d..b1ff77276bc4 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -575,6 +575,9 @@ struct netdev_queue { * (/sys/class/net/DEV/Q/trans_timeout) */ unsigned long trans_timeout; + + /* Subordinate device that the queue has been assigned to */ + struct net_device *sb_dev; /* * write-mostly part */ @@ -1991,7 +1994,7 @@ struct net_device { #ifdef CONFIG_DCB const struct dcbnl_rtnl_ops *dcbnl_ops; #endif - u8 num_tc; + s16 num_tc; struct netdev_tc_txq tc_to_txq[TC_MAX_QUEUE]; u8 prio_tc_map[TC_BITMASK + 1]; @@ -2045,6 +2048,17 @@ int netdev_get_num_tc(struct net_device *dev) return dev->num_tc; } +void netdev_unbind_sb_channel(struct net_device *dev, + struct net_device *sb_dev); +int netdev_bind_sb_channel_queue(struct net_device *dev, + struct net_device *sb_dev, + u8 tc, u16 count, u16 offset); +int netdev_set_sb_channel(struct net_device *dev, u16 channel); +static inline int netdev_get_sb_channel(struct net_device *dev) +{ + return max_t(int, -dev->num_tc, 0); +} + static inline struct netdev_queue *netdev_get_tx_queue(const struct net_device *dev, unsigned int index) diff --git a/net/core/dev.c b/net/core/dev.c index 89825c1eccdc..cc1d6bba017a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2067,11 +2067,13 @@ int netdev_txq_to_tc(struct net_device *dev, unsigned int txq) struct netdev_tc_txq *tc = &dev->tc_to_txq[0]; int i; + /* walk through the TCs and see if it falls into any of them */ for (i = 0; i < TC_MAX_QUEUE; i++, tc++) { if ((txq - tc->offset) < tc->count) return i; } + /* didn't find it, just return -1 to indicate no match */ return -1; } @@ -2260,7 +2262,14 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, unsigned int nr_ids; if (dev->num_tc) { + /* Do not allow XPS on subordinate device directly */ num_tc = dev->num_tc; + if (num_tc < 0) + return -EINVAL; + + /* If queue belongs to subordinate dev use its map */ + dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev; + tc = netdev_txq_to_tc(dev, index); if (tc < 0) return -EINVAL; @@ -2448,11 +2457,25 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, EXPORT_SYMBOL(netif_set_xps_queue); #endif +static void netdev_unbind_all_sb_channels(struct net_device *dev) +{ + struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues]; + + /* Unbind any subordinate channels */ + while (txq-- != &dev->_tx[0]) { + if (txq->sb_dev) + netdev_unbind_sb_channel(dev, txq->sb_dev); + } +} + void netdev_reset_tc(struct net_device *dev) { #ifdef CONFIG_XPS netif_reset_xps_queues_gt(dev, 0); #endif + netdev_unbind_all_sb_channels(dev); + + /* Reset TC configuration of device */ dev->num_tc = 0; memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq)); memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map)); @@ -2481,11 +2504,77 @@ int netdev_set_num_tc(struct net_device *dev, u8 num_tc) #ifdef CONFIG_XPS netif_reset_xps_queues_gt(dev, 0); #endif + netdev_unbind_all_sb_channels(dev); + dev->num_tc = num_tc; return 0; } EXPORT_SYMBOL(netdev_set_num_tc); +void netdev_unbind_sb_channel(struct net_device *dev, + struct net_device *sb_dev) +{ + struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues]; + +#ifdef CONFIG_XPS + netif_reset_xps_queues_gt(sb_dev, 0); +#endif + memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq)); + memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map)); + + while (txq-- != &dev->_tx[0]) { + if (txq->sb_dev == sb_dev) + txq->sb_dev = NULL; + } +} +EXPORT_SYMBOL(netdev_unbind_sb_channel); + +int netdev_bind_sb_channel_queue(struct net_device *dev, + struct net_device *sb_dev, + u8 tc, u16 count, u16 offset) +{ + /* Make certain the sb_dev and dev are already configured */ + if (sb_dev->num_tc >= 0 || tc >= dev->num_tc) + return -EINVAL; + + /* We cannot hand out queues we don't have */ + if ((offset + count) > dev->real_num_tx_queues) + return -EINVAL; + + /* Record the mapping */ + sb_dev->tc_to_txq[tc].count = count; + sb_dev->tc_to_txq[tc].offset = offset; + + /* Provide a way for Tx queue to find the tc_to_txq map or + * XPS map for itself. + */ + while (count--) + netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev; + + return 0; +} +EXPORT_SYMBOL(netdev_bind_sb_channel_queue); + +int netdev_set_sb_channel(struct net_device *dev, u16 channel) +{ + /* Do not use a multiqueue device to represent a subordinate channel */ + if (netif_is_multiqueue(dev)) + return -ENODEV; + + /* We allow channels 1 - 32767 to be used for subordinate channels. + * Channel 0 is meant to be "native" mode and used only to represent + * the main root device. We allow writing 0 to reset the device back + * to normal mode after being used as a subordinate channel. + */ + if (channel > S16_MAX) + return -EINVAL; + + dev->num_tc = -channel; + + return 0; +} +EXPORT_SYMBOL(netdev_set_sb_channel); + /* * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues * greater than real_num_tx_queues stale skbs on the qdisc must be flushed. diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index dce3ae0fbca2..ffa1d18f2c2c 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1054,11 +1054,23 @@ static ssize_t traffic_class_show(struct netdev_queue *queue, return -ENOENT; index = get_netdev_queue_index(queue); + + /* If queue belongs to subordinate dev use its TC mapping */ + dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev; + tc = netdev_txq_to_tc(dev, index); if (tc < 0) return -EINVAL; - return sprintf(buf, "%u\n", tc); + /* We can report the traffic class one of two ways: + * Subordinate device traffic classes are reported with the traffic + * class first, and then the subordinate class so for example TC0 on + * subordinate device 2 will be reported as "0-2". If the queue + * belongs to the root device it will be reported with just the + * traffic class, so just "0" for TC 0 for example. + */ + return dev->num_tc < 0 ? sprintf(buf, "%u%d\n", tc, dev->num_tc) : + sprintf(buf, "%u\n", tc); } #ifdef CONFIG_XPS @@ -1225,7 +1237,14 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, index = get_netdev_queue_index(queue); if (dev->num_tc) { + /* Do not allow XPS on subordinate device directly */ num_tc = dev->num_tc; + if (num_tc < 0) + return -EINVAL; + + /* If queue belongs to subordinate dev use its map */ + dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev; + tc = netdev_txq_to_tc(dev, index); if (tc < 0) return -EINVAL; -- cgit v1.2.3-59-g8ed1b From eadec877ce9ca46a94e9036b5a44e7941d4fc501 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 9 Jul 2018 12:19:48 -0400 Subject: net: Add support for subordinate traffic classes to netdev_pick_tx This change makes it so that we can support the concept of subordinate device traffic classes to the core networking code. In doing this we can start pulling out the driver specific bits needed to support selecting a queue based on an upper device. The solution at is currently stands is only partially implemented. I have the start of some XPS bits in here, but I would still need to allow for configuration of the XPS maps on the queues reserved for the subordinate devices. For now I am using the reference to the sb_dev XPS map as just a way to skip the lookup of the lower device XPS map for now as that would result in the wrong queue being picked. Signed-off-by: Alexander Duyck Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 19 +++------ drivers/net/macvlan.c | 10 +---- include/linux/netdevice.h | 4 +- net/core/dev.c | 58 ++++++++++++++++----------- 4 files changed, 45 insertions(+), 46 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 80225af2acb1..abb176df2e7f 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -8208,20 +8208,17 @@ static void ixgbe_atr(struct ixgbe_ring *ring, input, common, ring->queue_index); } +#ifdef IXGBE_FCOE static u16 ixgbe_select_queue(struct net_device *dev, struct sk_buff *skb, void *accel_priv, select_queue_fallback_t fallback) { - struct ixgbe_fwd_adapter *fwd_adapter = accel_priv; -#ifdef IXGBE_FCOE struct ixgbe_adapter *adapter; struct ixgbe_ring_feature *f; -#endif int txq; - if (fwd_adapter) { - u8 tc = netdev_get_num_tc(dev) ? - netdev_get_prio_tc_map(dev, skb->priority) : 0; - struct net_device *vdev = fwd_adapter->netdev; + if (accel_priv) { + u8 tc = netdev_get_prio_tc_map(dev, skb->priority); + struct net_device *vdev = accel_priv; txq = vdev->tc_to_txq[tc].offset; txq += reciprocal_scale(skb_get_hash(skb), @@ -8230,8 +8227,6 @@ static u16 ixgbe_select_queue(struct net_device *dev, struct sk_buff *skb, return txq; } -#ifdef IXGBE_FCOE - /* * only execute the code below if protocol is FCoE * or FIP and we have FCoE enabled on the adapter @@ -8257,11 +8252,9 @@ static u16 ixgbe_select_queue(struct net_device *dev, struct sk_buff *skb, txq -= f->indices; return txq + f->offset; -#else - return fallback(dev, skb); -#endif } +#endif static int ixgbe_xmit_xdp_ring(struct ixgbe_adapter *adapter, struct xdp_frame *xdpf) { @@ -10058,7 +10051,6 @@ static const struct net_device_ops ixgbe_netdev_ops = { .ndo_open = ixgbe_open, .ndo_stop = ixgbe_close, .ndo_start_xmit = ixgbe_xmit_frame, - .ndo_select_queue = ixgbe_select_queue, .ndo_set_rx_mode = ixgbe_set_rx_mode, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = ixgbe_set_mac, @@ -10081,6 +10073,7 @@ static const struct net_device_ops ixgbe_netdev_ops = { .ndo_poll_controller = ixgbe_netpoll, #endif #ifdef IXGBE_FCOE + .ndo_select_queue = ixgbe_select_queue, .ndo_fcoe_ddp_setup = ixgbe_fcoe_ddp_get, .ndo_fcoe_ddp_target = ixgbe_fcoe_ddp_target, .ndo_fcoe_ddp_done = ixgbe_fcoe_ddp_put, diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index adde8fc45588..401e1d1ce1ec 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -514,7 +514,6 @@ static int macvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev) const struct macvlan_dev *vlan = netdev_priv(dev); const struct macvlan_port *port = vlan->port; const struct macvlan_dev *dest; - void *accel_priv = NULL; if (vlan->mode == MACVLAN_MODE_BRIDGE) { const struct ethhdr *eth = (void *)skb->data; @@ -533,15 +532,10 @@ static int macvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev) return NET_XMIT_SUCCESS; } } - - /* For packets that are non-multicast and not bridged we will pass - * the necessary information so that the lowerdev can distinguish - * the source of the packets via the accel_priv value. - */ - accel_priv = vlan->accel_priv; xmit_world: skb->dev = vlan->lowerdev; - return dev_queue_xmit_accel(skb, accel_priv); + return dev_queue_xmit_accel(skb, + netdev_get_sb_channel(dev) ? dev : NULL); } static inline netdev_tx_t macvlan_netpoll_send_skb(struct macvlan_dev *vlan, struct sk_buff *skb) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b1ff77276bc4..fda0bcda7a42 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2103,7 +2103,7 @@ static inline void netdev_for_each_tx_queue(struct net_device *dev, struct netdev_queue *netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, - void *accel_priv); + struct net_device *sb_dev); /* returns the headroom that the master device needs to take in account * when forwarding to this dev @@ -2568,7 +2568,7 @@ void dev_close_many(struct list_head *head, bool unlink); void dev_disable_lro(struct net_device *dev); int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *newskb); int dev_queue_xmit(struct sk_buff *skb); -int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv); +int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev); int dev_direct_xmit(struct sk_buff *skb, u16 queue_id); int register_netdevice(struct net_device *dev); void unregister_netdevice_queue(struct net_device *dev, struct list_head *head); diff --git a/net/core/dev.c b/net/core/dev.c index cc1d6bba017a..09a7cc2f3c55 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2786,24 +2786,26 @@ EXPORT_SYMBOL(netif_device_attach); * Returns a Tx hash based on the given packet descriptor a Tx queues' number * to be used as a distribution range. */ -static u16 skb_tx_hash(const struct net_device *dev, struct sk_buff *skb) +static u16 skb_tx_hash(const struct net_device *dev, + const struct net_device *sb_dev, + struct sk_buff *skb) { u32 hash; u16 qoffset = 0; u16 qcount = dev->real_num_tx_queues; + if (dev->num_tc) { + u8 tc = netdev_get_prio_tc_map(dev, skb->priority); + + qoffset = sb_dev->tc_to_txq[tc].offset; + qcount = sb_dev->tc_to_txq[tc].count; + } + if (skb_rx_queue_recorded(skb)) { hash = skb_get_rx_queue(skb); while (unlikely(hash >= qcount)) hash -= qcount; - return hash; - } - - if (dev->num_tc) { - u8 tc = netdev_get_prio_tc_map(dev, skb->priority); - - qoffset = dev->tc_to_txq[tc].offset; - qcount = dev->tc_to_txq[tc].count; + return hash + qoffset; } return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset; @@ -3573,7 +3575,8 @@ static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb, } #endif -static int get_xps_queue(struct net_device *dev, struct sk_buff *skb) +static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev, + struct sk_buff *skb) { #ifdef CONFIG_XPS struct xps_dev_maps *dev_maps; @@ -3587,7 +3590,7 @@ static int get_xps_queue(struct net_device *dev, struct sk_buff *skb) if (!static_key_false(&xps_rxqs_needed)) goto get_cpus_map; - dev_maps = rcu_dereference(dev->xps_rxqs_map); + dev_maps = rcu_dereference(sb_dev->xps_rxqs_map); if (dev_maps) { int tci = sk_rx_queue_get(sk); @@ -3598,7 +3601,7 @@ static int get_xps_queue(struct net_device *dev, struct sk_buff *skb) get_cpus_map: if (queue_index < 0) { - dev_maps = rcu_dereference(dev->xps_cpus_map); + dev_maps = rcu_dereference(sb_dev->xps_cpus_map); if (dev_maps) { unsigned int tci = skb->sender_cpu - 1; @@ -3614,17 +3617,20 @@ get_cpus_map: #endif } -static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb) +static u16 ___netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, + struct net_device *sb_dev) { struct sock *sk = skb->sk; int queue_index = sk_tx_queue_get(sk); + sb_dev = sb_dev ? : dev; + if (queue_index < 0 || skb->ooo_okay || queue_index >= dev->real_num_tx_queues) { - int new_index = get_xps_queue(dev, skb); + int new_index = get_xps_queue(dev, sb_dev, skb); if (new_index < 0) - new_index = skb_tx_hash(dev, skb); + new_index = skb_tx_hash(dev, sb_dev, skb); if (queue_index != new_index && sk && sk_fullsock(sk) && @@ -3637,9 +3643,15 @@ static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb) return queue_index; } +static u16 __netdev_pick_tx(struct net_device *dev, + struct sk_buff *skb) +{ + return ___netdev_pick_tx(dev, skb, NULL); +} + struct netdev_queue *netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, - void *accel_priv) + struct net_device *sb_dev) { int queue_index = 0; @@ -3654,10 +3666,10 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev, const struct net_device_ops *ops = dev->netdev_ops; if (ops->ndo_select_queue) - queue_index = ops->ndo_select_queue(dev, skb, accel_priv, + queue_index = ops->ndo_select_queue(dev, skb, sb_dev, __netdev_pick_tx); else - queue_index = __netdev_pick_tx(dev, skb); + queue_index = ___netdev_pick_tx(dev, skb, sb_dev); queue_index = netdev_cap_txqueue(dev, queue_index); } @@ -3669,7 +3681,7 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev, /** * __dev_queue_xmit - transmit a buffer * @skb: buffer to transmit - * @accel_priv: private data used for L2 forwarding offload + * @sb_dev: suboordinate device used for L2 forwarding offload * * Queue a buffer for transmission to a network device. The caller must * have set the device and priority and built the buffer before calling @@ -3692,7 +3704,7 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev, * the BH enable code must have IRQs enabled so that it will not deadlock. * --BLG */ -static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) +static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) { struct net_device *dev = skb->dev; struct netdev_queue *txq; @@ -3731,7 +3743,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) else skb_dst_force(skb); - txq = netdev_pick_tx(dev, skb, accel_priv); + txq = netdev_pick_tx(dev, skb, sb_dev); q = rcu_dereference_bh(txq->qdisc); trace_net_dev_queue(skb); @@ -3805,9 +3817,9 @@ int dev_queue_xmit(struct sk_buff *skb) } EXPORT_SYMBOL(dev_queue_xmit); -int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv) +int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev) { - return __dev_queue_xmit(skb, accel_priv); + return __dev_queue_xmit(skb, sb_dev); } EXPORT_SYMBOL(dev_queue_xmit_accel); -- cgit v1.2.3-59-g8ed1b From a4ea8a3dacc312c3402c78f6e4843afdda9b43a0 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 9 Jul 2018 12:19:54 -0400 Subject: net: Add generic ndo_select_queue functions This patch adds a generic version of the ndo_select_queue functions for either returning 0 or selecting a queue based on the processor ID. This is generally meant to just reduce the number of functions we have to change in the future when we have to deal with ndo_select_queue changes. Signed-off-by: Alexander Duyck Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/lantiq_etop.c | 10 +--------- drivers/net/ethernet/ti/netcp_core.c | 9 +-------- drivers/staging/netlogic/xlr_net.c | 9 +-------- include/linux/netdevice.h | 4 ++++ net/core/dev.c | 14 ++++++++++++++ net/packet/af_packet.c | 2 +- 6 files changed, 22 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/lantiq_etop.c b/drivers/net/ethernet/lantiq_etop.c index afc810069440..7a637b51c7d2 100644 --- a/drivers/net/ethernet/lantiq_etop.c +++ b/drivers/net/ethernet/lantiq_etop.c @@ -563,14 +563,6 @@ ltq_etop_set_multicast_list(struct net_device *dev) spin_unlock_irqrestore(&priv->lock, flags); } -static u16 -ltq_etop_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, select_queue_fallback_t fallback) -{ - /* we are currently only using the first queue */ - return 0; -} - static int ltq_etop_init(struct net_device *dev) { @@ -641,7 +633,7 @@ static const struct net_device_ops ltq_eth_netdev_ops = { .ndo_set_mac_address = ltq_etop_set_mac_address, .ndo_validate_addr = eth_validate_addr, .ndo_set_rx_mode = ltq_etop_set_multicast_list, - .ndo_select_queue = ltq_etop_select_queue, + .ndo_select_queue = dev_pick_tx_zero, .ndo_init = ltq_etop_init, .ndo_tx_timeout = ltq_etop_tx_timeout, }; diff --git a/drivers/net/ethernet/ti/netcp_core.c b/drivers/net/ethernet/ti/netcp_core.c index 6ebf110cd594..a1d335a3c5e4 100644 --- a/drivers/net/ethernet/ti/netcp_core.c +++ b/drivers/net/ethernet/ti/netcp_core.c @@ -1889,13 +1889,6 @@ static int netcp_rx_kill_vid(struct net_device *ndev, __be16 proto, u16 vid) return err; } -static u16 netcp_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, - select_queue_fallback_t fallback) -{ - return 0; -} - static int netcp_setup_tc(struct net_device *dev, enum tc_setup_type type, void *type_data) { @@ -1972,7 +1965,7 @@ static const struct net_device_ops netcp_netdev_ops = { .ndo_vlan_rx_add_vid = netcp_rx_add_vid, .ndo_vlan_rx_kill_vid = netcp_rx_kill_vid, .ndo_tx_timeout = netcp_ndo_tx_timeout, - .ndo_select_queue = netcp_select_queue, + .ndo_select_queue = dev_pick_tx_zero, .ndo_setup_tc = netcp_setup_tc, }; diff --git a/drivers/staging/netlogic/xlr_net.c b/drivers/staging/netlogic/xlr_net.c index e461168313bf..4e6611e4c59b 100644 --- a/drivers/staging/netlogic/xlr_net.c +++ b/drivers/staging/netlogic/xlr_net.c @@ -290,13 +290,6 @@ static netdev_tx_t xlr_net_start_xmit(struct sk_buff *skb, return NETDEV_TX_OK; } -static u16 xlr_net_select_queue(struct net_device *ndev, struct sk_buff *skb, - void *accel_priv, - select_queue_fallback_t fallback) -{ - return (u16)smp_processor_id(); -} - static void xlr_hw_set_mac_addr(struct net_device *ndev) { struct xlr_net_priv *priv = netdev_priv(ndev); @@ -403,7 +396,7 @@ static const struct net_device_ops xlr_netdev_ops = { .ndo_open = xlr_net_open, .ndo_stop = xlr_net_stop, .ndo_start_xmit = xlr_net_start_xmit, - .ndo_select_queue = xlr_net_select_queue, + .ndo_select_queue = dev_pick_tx_cpu_id, .ndo_set_mac_address = xlr_net_set_mac_addr, .ndo_set_rx_mode = xlr_set_rx_mode, .ndo_get_stats64 = xlr_stats, diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index fda0bcda7a42..46f4c44ce3e4 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2567,6 +2567,10 @@ void dev_close(struct net_device *dev); void dev_close_many(struct list_head *head, bool unlink); void dev_disable_lro(struct net_device *dev); int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *newskb); +u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb, + void *accel_priv, select_queue_fallback_t fallback); +u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb, + void *accel_priv, select_queue_fallback_t fallback); int dev_queue_xmit(struct sk_buff *skb); int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev); int dev_direct_xmit(struct sk_buff *skb, u16 queue_id); diff --git a/net/core/dev.c b/net/core/dev.c index 09a7cc2f3c55..b5e538032d5e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3617,6 +3617,20 @@ get_cpus_map: #endif } +u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb, + void *accel_priv, select_queue_fallback_t fallback) +{ + return 0; +} +EXPORT_SYMBOL(dev_pick_tx_zero); + +u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb, + void *accel_priv, select_queue_fallback_t fallback) +{ + return (u16)raw_smp_processor_id() % dev->real_num_tx_queues; +} +EXPORT_SYMBOL(dev_pick_tx_cpu_id); + static u16 ___netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev) { diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 47931ebfaef3..f37d087ae652 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -277,7 +277,7 @@ static bool packet_use_direct_xmit(const struct packet_sock *po) static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb) { - return (u16) raw_smp_processor_id() % dev->real_num_tx_queues; + return dev_pick_tx_cpu_id(dev, skb, NULL, NULL); } static u16 packet_pick_tx_queue(struct sk_buff *skb) -- cgit v1.2.3-59-g8ed1b From 4f49dec9075aa0277b8c9c657ec31e6361f88724 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 9 Jul 2018 12:19:59 -0400 Subject: net: allow ndo_select_queue to pass netdev This patch makes it so that instead of passing a void pointer as the accel_priv we instead pass a net_device pointer as sb_dev. Making this change allows us to pass the subordinate device through to the fallback function eventually so that we can keep the actual code in the ndo_select_queue call as focused on possible on the exception cases. Signed-off-by: Alexander Duyck Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/infiniband/hw/hfi1/vnic_main.c | 2 +- drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c | 4 ++-- drivers/net/bonding/bond_main.c | 3 ++- drivers/net/ethernet/amazon/ena/ena_netdev.c | 3 ++- drivers/net/ethernet/broadcom/bcmsysport.c | 2 +- drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c | 3 ++- drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h | 3 ++- drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 3 ++- drivers/net/ethernet/hisilicon/hns/hns_enet.c | 3 ++- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 7 ++++--- drivers/net/ethernet/mellanox/mlx4/en_tx.c | 3 ++- drivers/net/ethernet/mellanox/mlx4/mlx4_en.h | 3 ++- drivers/net/ethernet/mellanox/mlx5/core/en.h | 3 ++- drivers/net/ethernet/mellanox/mlx5/core/en_tx.c | 3 ++- drivers/net/ethernet/renesas/ravb_main.c | 3 ++- drivers/net/ethernet/sun/ldmvsw.c | 3 ++- drivers/net/ethernet/sun/sunvnet.c | 3 ++- drivers/net/hyperv/netvsc_drv.c | 4 ++-- drivers/net/net_failover.c | 5 +++-- drivers/net/team/team.c | 3 ++- drivers/net/tun.c | 3 ++- drivers/net/wireless/marvell/mwifiex/main.c | 3 ++- drivers/net/xen-netback/interface.c | 2 +- drivers/net/xen-netfront.c | 3 ++- drivers/staging/rtl8188eu/os_dep/os_intfs.c | 3 ++- drivers/staging/rtl8723bs/os_dep/os_intfs.c | 7 +++---- include/linux/netdevice.h | 11 +++++++---- net/core/dev.c | 6 ++++-- net/mac80211/iface.c | 4 ++-- 29 files changed, 66 insertions(+), 42 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/hfi1/vnic_main.c b/drivers/infiniband/hw/hfi1/vnic_main.c index 5d65582fe4d9..616fc9b6fad8 100644 --- a/drivers/infiniband/hw/hfi1/vnic_main.c +++ b/drivers/infiniband/hw/hfi1/vnic_main.c @@ -423,7 +423,7 @@ tx_finish: static u16 hfi1_vnic_select_queue(struct net_device *netdev, struct sk_buff *skb, - void *accel_priv, + struct net_device *sb_dev, select_queue_fallback_t fallback) { struct hfi1_vnic_vport_info *vinfo = opa_vnic_dev_priv(netdev); diff --git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c b/drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c index 0c8aec62a425..61558788b3fa 100644 --- a/drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c +++ b/drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c @@ -95,7 +95,7 @@ static netdev_tx_t opa_netdev_start_xmit(struct sk_buff *skb, } static u16 opa_vnic_select_queue(struct net_device *netdev, struct sk_buff *skb, - void *accel_priv, + struct net_device *sb_dev, select_queue_fallback_t fallback) { struct opa_vnic_adapter *adapter = opa_vnic_priv(netdev); @@ -107,7 +107,7 @@ static u16 opa_vnic_select_queue(struct net_device *netdev, struct sk_buff *skb, mdata->entropy = opa_vnic_calc_entropy(skb); mdata->vl = opa_vnic_get_vl(adapter, skb); rc = adapter->rn_ops->ndo_select_queue(netdev, skb, - accel_priv, fallback); + sb_dev, fallback); skb_pull(skb, sizeof(*mdata)); return rc; } diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 63e3844c5bec..9a2ea3c1f949 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -4094,7 +4094,8 @@ static inline int bond_slave_override(struct bonding *bond, static u16 bond_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, select_queue_fallback_t fallback) + struct net_device *sb_dev, + select_queue_fallback_t fallback) { /* This helper function exists to help dev_pick_tx get the correct * destination queue. Using a helper function skips a call to diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c index f2af87d70594..e3befb1f9204 100644 --- a/drivers/net/ethernet/amazon/ena/ena_netdev.c +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c @@ -2213,7 +2213,8 @@ static void ena_netpoll(struct net_device *netdev) #endif /* CONFIG_NET_POLL_CONTROLLER */ static u16 ena_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, select_queue_fallback_t fallback) + struct net_device *sb_dev, + select_queue_fallback_t fallback) { u16 qid; /* we suspect that this is good for in--kernel network services that diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c index d5fca2e5a9bc..32f548e6431d 100644 --- a/drivers/net/ethernet/broadcom/bcmsysport.c +++ b/drivers/net/ethernet/broadcom/bcmsysport.c @@ -2107,7 +2107,7 @@ static const struct ethtool_ops bcm_sysport_ethtool_ops = { }; static u16 bcm_sysport_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, + struct net_device *sb_dev, select_queue_fallback_t fallback) { struct bcm_sysport_priv *priv = netdev_priv(dev); diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c index af7b5a4d8ba0..e4e1cf907ac6 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c @@ -1910,7 +1910,8 @@ void bnx2x_netif_stop(struct bnx2x *bp, int disable_hw) } u16 bnx2x_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, select_queue_fallback_t fallback) + struct net_device *sb_dev, + select_queue_fallback_t fallback) { struct bnx2x *bp = netdev_priv(dev); diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h index a8ce5c55bbb0..0e508e5defce 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h @@ -497,7 +497,8 @@ int bnx2x_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, u8 qos, /* select_queue callback */ u16 bnx2x_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, select_queue_fallback_t fallback); + struct net_device *sb_dev, + select_queue_fallback_t fallback); static inline void bnx2x_update_rx_prod(struct bnx2x *bp, struct bnx2x_fastpath *fp, diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c index 0d91716a2566..5dc5e5604f05 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c @@ -930,7 +930,8 @@ freeout: } static u16 cxgb_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, select_queue_fallback_t fallback) + struct net_device *sb_dev, + select_queue_fallback_t fallback) { int txq; diff --git a/drivers/net/ethernet/hisilicon/hns/hns_enet.c b/drivers/net/ethernet/hisilicon/hns/hns_enet.c index ef9ef703d13a..ff7a74ec8f11 100644 --- a/drivers/net/ethernet/hisilicon/hns/hns_enet.c +++ b/drivers/net/ethernet/hisilicon/hns/hns_enet.c @@ -2022,7 +2022,8 @@ static void hns_nic_get_stats64(struct net_device *ndev, static u16 hns_nic_select_queue(struct net_device *ndev, struct sk_buff *skb, - void *accel_priv, select_queue_fallback_t fallback) + struct net_device *sb_dev, + select_queue_fallback_t fallback) { struct ethhdr *eth_hdr = (struct ethhdr *)skb->data; struct hns_nic_priv *priv = netdev_priv(ndev); diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index abb176df2e7f..8c7a68c57afa 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -8210,15 +8210,16 @@ static void ixgbe_atr(struct ixgbe_ring *ring, #ifdef IXGBE_FCOE static u16 ixgbe_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, select_queue_fallback_t fallback) + struct net_device *sb_dev, + select_queue_fallback_t fallback) { struct ixgbe_adapter *adapter; struct ixgbe_ring_feature *f; int txq; - if (accel_priv) { + if (sb_dev) { u8 tc = netdev_get_prio_tc_map(dev, skb->priority); - struct net_device *vdev = accel_priv; + struct net_device *vdev = sb_dev; txq = vdev->tc_to_txq[tc].offset; txq += reciprocal_scale(skb_get_hash(skb), diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c index 0227786308af..df2996618cd1 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c @@ -688,7 +688,8 @@ static void build_inline_wqe(struct mlx4_en_tx_desc *tx_desc, } u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, select_queue_fallback_t fallback) + struct net_device *sb_dev, + select_queue_fallback_t fallback) { struct mlx4_en_priv *priv = netdev_priv(dev); u16 rings_p_up = priv->num_tx_rings_p_up; diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h index ace6545f82e6..c3228b89df46 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h @@ -699,7 +699,8 @@ void mlx4_en_arm_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq); void mlx4_en_tx_irq(struct mlx4_cq *mcq); u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, select_queue_fallback_t fallback); + struct net_device *sb_dev, + select_queue_fallback_t fallback); netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev); netdev_tx_t mlx4_en_xmit_frame(struct mlx4_en_rx_ring *rx_ring, struct mlx4_en_rx_alloc *frame, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index e2b7586ed7a0..e1b237ccdf56 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -865,7 +865,8 @@ struct mlx5e_profile { void mlx5e_build_ptys2ethtool_map(void); u16 mlx5e_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, select_queue_fallback_t fallback); + struct net_device *sb_dev, + select_queue_fallback_t fallback); netdev_tx_t mlx5e_xmit(struct sk_buff *skb, struct net_device *dev); netdev_tx_t mlx5e_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb, struct mlx5e_tx_wqe *wqe, u16 pi); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c index f0739dae7b56..dfcc3710b65f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c @@ -111,7 +111,8 @@ static inline int mlx5e_get_dscp_up(struct mlx5e_priv *priv, struct sk_buff *skb #endif u16 mlx5e_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, select_queue_fallback_t fallback) + struct net_device *sb_dev, + select_queue_fallback_t fallback) { struct mlx5e_priv *priv = netdev_priv(dev); int channel_ix = fallback(dev, skb); diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c index 68f122140966..4a7f54c8e7aa 100644 --- a/drivers/net/ethernet/renesas/ravb_main.c +++ b/drivers/net/ethernet/renesas/ravb_main.c @@ -1656,7 +1656,8 @@ drop: } static u16 ravb_select_queue(struct net_device *ndev, struct sk_buff *skb, - void *accel_priv, select_queue_fallback_t fallback) + struct net_device *sb_dev, + select_queue_fallback_t fallback) { /* If skb needs TX timestamp, it is handled in network control queue */ return (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP) ? RAVB_NC : diff --git a/drivers/net/ethernet/sun/ldmvsw.c b/drivers/net/ethernet/sun/ldmvsw.c index a5dd627fe2f9..d42f47f6c632 100644 --- a/drivers/net/ethernet/sun/ldmvsw.c +++ b/drivers/net/ethernet/sun/ldmvsw.c @@ -101,7 +101,8 @@ static struct vnet_port *vsw_tx_port_find(struct sk_buff *skb, } static u16 vsw_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, select_queue_fallback_t fallback) + struct net_device *sb_dev, + select_queue_fallback_t fallback) { struct vnet_port *port = netdev_priv(dev); diff --git a/drivers/net/ethernet/sun/sunvnet.c b/drivers/net/ethernet/sun/sunvnet.c index a94f50442613..12539b357a78 100644 --- a/drivers/net/ethernet/sun/sunvnet.c +++ b/drivers/net/ethernet/sun/sunvnet.c @@ -234,7 +234,8 @@ static struct vnet_port *vnet_tx_port_find(struct sk_buff *skb, } static u16 vnet_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, select_queue_fallback_t fallback) + struct net_device *sb_dev, + select_queue_fallback_t fallback) { struct vnet *vp = netdev_priv(dev); struct vnet_port *port = __tx_port_find(vp, skb); diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index dd1d6e115145..98c0107d6ca1 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -329,7 +329,7 @@ static u16 netvsc_pick_tx(struct net_device *ndev, struct sk_buff *skb) } static u16 netvsc_select_queue(struct net_device *ndev, struct sk_buff *skb, - void *accel_priv, + struct net_device *sb_dev, select_queue_fallback_t fallback) { struct net_device_context *ndc = netdev_priv(ndev); @@ -343,7 +343,7 @@ static u16 netvsc_select_queue(struct net_device *ndev, struct sk_buff *skb, if (vf_ops->ndo_select_queue) txq = vf_ops->ndo_select_queue(vf_netdev, skb, - accel_priv, fallback); + sb_dev, fallback); else txq = fallback(vf_netdev, skb); diff --git a/drivers/net/net_failover.c b/drivers/net/net_failover.c index 4f390fa557e4..78b549698b7b 100644 --- a/drivers/net/net_failover.c +++ b/drivers/net/net_failover.c @@ -115,7 +115,8 @@ static netdev_tx_t net_failover_start_xmit(struct sk_buff *skb, } static u16 net_failover_select_queue(struct net_device *dev, - struct sk_buff *skb, void *accel_priv, + struct sk_buff *skb, + struct net_device *sb_dev, select_queue_fallback_t fallback) { struct net_failover_info *nfo_info = netdev_priv(dev); @@ -128,7 +129,7 @@ static u16 net_failover_select_queue(struct net_device *dev, if (ops->ndo_select_queue) txq = ops->ndo_select_queue(primary_dev, skb, - accel_priv, fallback); + sb_dev, fallback); else txq = fallback(primary_dev, skb); diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index b070959737ff..3a95eaae0c98 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -1707,7 +1707,8 @@ static netdev_tx_t team_xmit(struct sk_buff *skb, struct net_device *dev) } static u16 team_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, select_queue_fallback_t fallback) + struct net_device *sb_dev, + select_queue_fallback_t fallback) { /* * This helper function exists to help dev_pick_tx get the correct diff --git a/drivers/net/tun.c b/drivers/net/tun.c index a192a017cc68..76f0f4131197 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -607,7 +607,8 @@ static u16 tun_ebpf_select_queue(struct tun_struct *tun, struct sk_buff *skb) } static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, select_queue_fallback_t fallback) + struct net_device *sb_dev, + select_queue_fallback_t fallback) { struct tun_struct *tun = netdev_priv(dev); u16 ret; diff --git a/drivers/net/wireless/marvell/mwifiex/main.c b/drivers/net/wireless/marvell/mwifiex/main.c index 510f6b8e717d..fa3e8ddfe9a9 100644 --- a/drivers/net/wireless/marvell/mwifiex/main.c +++ b/drivers/net/wireless/marvell/mwifiex/main.c @@ -1279,7 +1279,8 @@ static struct net_device_stats *mwifiex_get_stats(struct net_device *dev) static u16 mwifiex_netdev_select_wmm_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, select_queue_fallback_t fallback) + struct net_device *sb_dev, + select_queue_fallback_t fallback) { skb->priority = cfg80211_classify8021d(skb, NULL); return mwifiex_1d_to_wmm_queue[skb->priority]; diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c index 78ebe494fef0..19c4c585f472 100644 --- a/drivers/net/xen-netback/interface.c +++ b/drivers/net/xen-netback/interface.c @@ -148,7 +148,7 @@ void xenvif_wake_queue(struct xenvif_queue *queue) } static u16 xenvif_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, + struct net_device *sb_dev, select_queue_fallback_t fallback) { struct xenvif *vif = netdev_priv(dev); diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index a57daecf1d57..d67cd379d156 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -545,7 +545,8 @@ static int xennet_count_skb_slots(struct sk_buff *skb) } static u16 xennet_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, select_queue_fallback_t fallback) + struct net_device *sb_dev, + select_queue_fallback_t fallback) { unsigned int num_queues = dev->real_num_tx_queues; u32 hash; diff --git a/drivers/staging/rtl8188eu/os_dep/os_intfs.c b/drivers/staging/rtl8188eu/os_dep/os_intfs.c index add1ba00f3e9..38e85c8a85c8 100644 --- a/drivers/staging/rtl8188eu/os_dep/os_intfs.c +++ b/drivers/staging/rtl8188eu/os_dep/os_intfs.c @@ -253,7 +253,8 @@ static unsigned int rtw_classify8021d(struct sk_buff *skb) } static u16 rtw_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, select_queue_fallback_t fallback) + struct net_device *sb_dev, + select_queue_fallback_t fallback) { struct adapter *padapter = rtw_netdev_priv(dev); struct mlme_priv *pmlmepriv = &padapter->mlmepriv; diff --git a/drivers/staging/rtl8723bs/os_dep/os_intfs.c b/drivers/staging/rtl8723bs/os_dep/os_intfs.c index ace68f023b49..181642358e3f 100644 --- a/drivers/staging/rtl8723bs/os_dep/os_intfs.c +++ b/drivers/staging/rtl8723bs/os_dep/os_intfs.c @@ -403,10 +403,9 @@ static unsigned int rtw_classify8021d(struct sk_buff *skb) } -static u16 rtw_select_queue(struct net_device *dev, struct sk_buff *skb - , void *accel_priv - , select_queue_fallback_t fallback -) +static u16 rtw_select_queue(struct net_device *dev, struct sk_buff *skb, + struct net_device *sb_dev, + select_queue_fallback_t fallback) { struct adapter *padapter = rtw_netdev_priv(dev); struct mlme_priv *pmlmepriv = &padapter->mlmepriv; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 46f4c44ce3e4..bbf062c1ca8a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -957,7 +957,8 @@ struct dev_ifalias { * those the driver believes to be appropriate. * * u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb, - * void *accel_priv, select_queue_fallback_t fallback); + * struct net_device *sb_dev, + * select_queue_fallback_t fallback); * Called to decide which queue to use when device supports multiple * transmit queues. * @@ -1229,7 +1230,7 @@ struct net_device_ops { netdev_features_t features); u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, + struct net_device *sb_dev, select_queue_fallback_t fallback); void (*ndo_change_rx_flags)(struct net_device *dev, int flags); @@ -2568,9 +2569,11 @@ void dev_close_many(struct list_head *head, bool unlink); void dev_disable_lro(struct net_device *dev); int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *newskb); u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, select_queue_fallback_t fallback); + struct net_device *sb_dev, + select_queue_fallback_t fallback); u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, select_queue_fallback_t fallback); + struct net_device *sb_dev, + select_queue_fallback_t fallback); int dev_queue_xmit(struct sk_buff *skb); int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev); int dev_direct_xmit(struct sk_buff *skb, u16 queue_id); diff --git a/net/core/dev.c b/net/core/dev.c index b5e538032d5e..a051ce27198b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3618,14 +3618,16 @@ get_cpus_map: } u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, select_queue_fallback_t fallback) + struct net_device *sb_dev, + select_queue_fallback_t fallback) { return 0; } EXPORT_SYMBOL(dev_pick_tx_zero); u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, select_queue_fallback_t fallback) + struct net_device *sb_dev, + select_queue_fallback_t fallback) { return (u16)raw_smp_processor_id() % dev->real_num_tx_queues; } diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 555e389b7dfa..5e6cf2cee965 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -1130,7 +1130,7 @@ static void ieee80211_uninit(struct net_device *dev) static u16 ieee80211_netdev_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, + struct net_device *sb_dev, select_queue_fallback_t fallback) { return ieee80211_select_queue(IEEE80211_DEV_TO_SUB_IF(dev), skb); @@ -1176,7 +1176,7 @@ static const struct net_device_ops ieee80211_dataif_ops = { static u16 ieee80211_monitor_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, + struct net_device *sb_dev, select_queue_fallback_t fallback) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); -- cgit v1.2.3-59-g8ed1b From 8ec56fc3c5ee6f9700adac190e9ce5b8859a58b6 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 9 Jul 2018 12:20:04 -0400 Subject: net: allow fallback function to pass netdev For most of these calls we can just pass NULL through to the fallback function as the sb_dev. The only cases where we cannot are the cases where we might be dealing with either an upper device or a driver that would have configured things to support an sb_dev itself. The only driver that has any significant change in this patch set should be ixgbe as we can drop the redundant functionality that existed in both the ndo_select_queue function and the fallback function that was passed through to us. Signed-off-by: Alexander Duyck Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/amazon/ena/ena_netdev.c | 2 +- drivers/net/ethernet/broadcom/bcmsysport.c | 4 ++-- drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c | 3 ++- drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 2 +- drivers/net/ethernet/hisilicon/hns/hns_enet.c | 2 +- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 4 ++-- drivers/net/ethernet/mellanox/mlx4/en_tx.c | 4 ++-- drivers/net/ethernet/mellanox/mlx5/core/en_tx.c | 2 +- drivers/net/hyperv/netvsc_drv.c | 2 +- drivers/net/net_failover.c | 2 +- drivers/net/xen-netback/interface.c | 2 +- include/linux/netdevice.h | 3 ++- net/core/dev.c | 12 +++--------- net/packet/af_packet.c | 7 ++++--- 14 files changed, 24 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c index e3befb1f9204..c673ac2df65b 100644 --- a/drivers/net/ethernet/amazon/ena/ena_netdev.c +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c @@ -2224,7 +2224,7 @@ static u16 ena_select_queue(struct net_device *dev, struct sk_buff *skb, if (skb_rx_queue_recorded(skb)) qid = skb_get_rx_queue(skb); else - qid = fallback(dev, skb); + qid = fallback(dev, skb, NULL); return qid; } diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c index 32f548e6431d..eb890c4b3b2d 100644 --- a/drivers/net/ethernet/broadcom/bcmsysport.c +++ b/drivers/net/ethernet/broadcom/bcmsysport.c @@ -2116,7 +2116,7 @@ static u16 bcm_sysport_select_queue(struct net_device *dev, struct sk_buff *skb, unsigned int q, port; if (!netdev_uses_dsa(dev)) - return fallback(dev, skb); + return fallback(dev, skb, NULL); /* DSA tagging layer will have configured the correct queue */ q = BRCM_TAG_GET_QUEUE(queue); @@ -2124,7 +2124,7 @@ static u16 bcm_sysport_select_queue(struct net_device *dev, struct sk_buff *skb, tx_ring = priv->ring_map[q + port * priv->per_port_num_tx_queues]; if (unlikely(!tx_ring)) - return fallback(dev, skb); + return fallback(dev, skb, NULL); return tx_ring->index; } diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c index e4e1cf907ac6..5a727d4729da 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c @@ -1933,7 +1933,8 @@ u16 bnx2x_select_queue(struct net_device *dev, struct sk_buff *skb, } /* select a non-FCoE queue */ - return fallback(dev, skb) % (BNX2X_NUM_ETH_QUEUES(bp) * bp->max_cos); + return fallback(dev, skb, NULL) % + (BNX2X_NUM_ETH_QUEUES(bp) * bp->max_cos); } void bnx2x_set_num_queues(struct bnx2x *bp) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c index 5dc5e5604f05..40cf8dc9f163 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c @@ -973,7 +973,7 @@ static u16 cxgb_select_queue(struct net_device *dev, struct sk_buff *skb, return txq; } - return fallback(dev, skb) % dev->real_num_tx_queues; + return fallback(dev, skb, NULL) % dev->real_num_tx_queues; } static int closest_timer(const struct sge *s, int time) diff --git a/drivers/net/ethernet/hisilicon/hns/hns_enet.c b/drivers/net/ethernet/hisilicon/hns/hns_enet.c index ff7a74ec8f11..948b3e0d18f4 100644 --- a/drivers/net/ethernet/hisilicon/hns/hns_enet.c +++ b/drivers/net/ethernet/hisilicon/hns/hns_enet.c @@ -2033,7 +2033,7 @@ hns_nic_select_queue(struct net_device *ndev, struct sk_buff *skb, is_multicast_ether_addr(eth_hdr->h_dest)) return 0; else - return fallback(ndev, skb); + return fallback(ndev, skb, NULL); } static const struct net_device_ops hns_nic_netdev_ops = { diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 8c7a68c57afa..bd6d9ea27b4b 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -8237,11 +8237,11 @@ static u16 ixgbe_select_queue(struct net_device *dev, struct sk_buff *skb, case htons(ETH_P_FIP): adapter = netdev_priv(dev); - if (adapter->flags & IXGBE_FLAG_FCOE_ENABLED) + if (!sb_dev && (adapter->flags & IXGBE_FLAG_FCOE_ENABLED)) break; /* fall through */ default: - return fallback(dev, skb); + return fallback(dev, skb, sb_dev); } f = &adapter->ring_feature[RING_F_FCOE]; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c index df2996618cd1..1857ee0f0871 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c @@ -695,9 +695,9 @@ u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb, u16 rings_p_up = priv->num_tx_rings_p_up; if (netdev_get_num_tc(dev)) - return fallback(dev, skb); + return fallback(dev, skb, NULL); - return fallback(dev, skb) % rings_p_up; + return fallback(dev, skb, NULL) % rings_p_up; } static void mlx4_bf_copy(void __iomem *dst, const void *src, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c index dfcc3710b65f..9106ea45e3cb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c @@ -115,7 +115,7 @@ u16 mlx5e_select_queue(struct net_device *dev, struct sk_buff *skb, select_queue_fallback_t fallback) { struct mlx5e_priv *priv = netdev_priv(dev); - int channel_ix = fallback(dev, skb); + int channel_ix = fallback(dev, skb, NULL); u16 num_channels; int up = 0; diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index 98c0107d6ca1..cf4f40a04194 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -345,7 +345,7 @@ static u16 netvsc_select_queue(struct net_device *ndev, struct sk_buff *skb, txq = vf_ops->ndo_select_queue(vf_netdev, skb, sb_dev, fallback); else - txq = fallback(vf_netdev, skb); + txq = fallback(vf_netdev, skb, NULL); /* Record the queue selected by VF so that it can be * used for common case where VF has more queues than diff --git a/drivers/net/net_failover.c b/drivers/net/net_failover.c index 78b549698b7b..d00d42c845b7 100644 --- a/drivers/net/net_failover.c +++ b/drivers/net/net_failover.c @@ -131,7 +131,7 @@ static u16 net_failover_select_queue(struct net_device *dev, txq = ops->ndo_select_queue(primary_dev, skb, sb_dev, fallback); else - txq = fallback(primary_dev, skb); + txq = fallback(primary_dev, skb, NULL); qdisc_skb_cb(skb)->slave_dev_queue_mapping = skb->queue_mapping; diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c index 19c4c585f472..92274c237200 100644 --- a/drivers/net/xen-netback/interface.c +++ b/drivers/net/xen-netback/interface.c @@ -155,7 +155,7 @@ static u16 xenvif_select_queue(struct net_device *dev, struct sk_buff *skb, unsigned int size = vif->hash.size; if (vif->hash.alg == XEN_NETIF_CTRL_HASH_ALGORITHM_NONE) - return fallback(dev, skb) % dev->real_num_tx_queues; + return fallback(dev, skb, NULL) % dev->real_num_tx_queues; xenvif_set_skb_hash(vif, skb); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index bbf062c1ca8a..2daf2fa6554f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -793,7 +793,8 @@ static inline bool netdev_phys_item_id_same(struct netdev_phys_item_id *a, } typedef u16 (*select_queue_fallback_t)(struct net_device *dev, - struct sk_buff *skb); + struct sk_buff *skb, + struct net_device *sb_dev); enum tc_setup_type { TC_SETUP_QDISC_MQPRIO, diff --git a/net/core/dev.c b/net/core/dev.c index a051ce27198b..e18d81837a6c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3633,8 +3633,8 @@ u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb, } EXPORT_SYMBOL(dev_pick_tx_cpu_id); -static u16 ___netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, - struct net_device *sb_dev) +static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, + struct net_device *sb_dev) { struct sock *sk = skb->sk; int queue_index = sk_tx_queue_get(sk); @@ -3659,12 +3659,6 @@ static u16 ___netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, return queue_index; } -static u16 __netdev_pick_tx(struct net_device *dev, - struct sk_buff *skb) -{ - return ___netdev_pick_tx(dev, skb, NULL); -} - struct netdev_queue *netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev) @@ -3685,7 +3679,7 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev, queue_index = ops->ndo_select_queue(dev, skb, sb_dev, __netdev_pick_tx); else - queue_index = ___netdev_pick_tx(dev, skb, sb_dev); + queue_index = __netdev_pick_tx(dev, skb, sb_dev); queue_index = netdev_cap_txqueue(dev, queue_index); } diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index f37d087ae652..00189a3b07f2 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -275,9 +275,10 @@ static bool packet_use_direct_xmit(const struct packet_sock *po) return po->xmit == packet_direct_xmit; } -static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb) +static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb, + struct net_device *sb_dev) { - return dev_pick_tx_cpu_id(dev, skb, NULL, NULL); + return dev_pick_tx_cpu_id(dev, skb, sb_dev, NULL); } static u16 packet_pick_tx_queue(struct sk_buff *skb) @@ -291,7 +292,7 @@ static u16 packet_pick_tx_queue(struct sk_buff *skb) __packet_pick_tx_queue); queue_index = netdev_cap_txqueue(dev, queue_index); } else { - queue_index = __packet_pick_tx_queue(dev, skb); + queue_index = __packet_pick_tx_queue(dev, skb, NULL); } return queue_index; -- cgit v1.2.3-59-g8ed1b From 9f17dbf04ddf55ae48f5bbafea4c4920ea943215 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Mon, 9 Jul 2018 18:10:02 +0100 Subject: netfilter: fix use-after-free in NF_HOOK_LIST nf_hook() can free the skb, so we need to remove it from the list before calling, and add passed skbs to a sublist afterwards. Fixes: 17266ee93984 ("net: ipv4: listified version of ip_rcv") Reported-by: Dan Carpenter Signed-off-by: Edward Cree Signed-off-by: David S. Miller --- include/linux/netfilter.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 5a5e0a2ab2a3..23b48de8c2e2 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -294,12 +294,16 @@ NF_HOOK_LIST(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, int (*okfn)(struct net *, struct sock *, struct sk_buff *)) { struct sk_buff *skb, *next; + struct list_head sublist; + INIT_LIST_HEAD(&sublist); list_for_each_entry_safe(skb, next, head, list) { - int ret = nf_hook(pf, hook, net, sk, skb, in, out, okfn); - if (ret != 1) - list_del(&skb->list); + list_del(&skb->list); + if (nf_hook(pf, hook, net, sk, skb, in, out, okfn) == 1) + list_add_tail(&skb->list, &sublist); } + /* Put passed packets back on main list */ + list_splice(&sublist, head); } /* Call setsockopt() */ -- cgit v1.2.3-59-g8ed1b From b60a60405fb95a688eb2ef4ef20f5fcaa7b64f68 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Fri, 6 Jul 2018 17:37:19 +0200 Subject: netfilter: Add nf_ct_get_tuple_skb global lookup function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds a global netfilter function to extract a conntrack tuple from an skb. The function uses a new function added to nf_ct_hook, which will try to get the tuple from skb->_nfct, and do a full lookup if that fails. This makes it possible to use the lookup function before the skb has passed through the conntrack init hooks (e.g., in an ingress qdisc). The tuple is copied to the caller to avoid issues with reference counting. The function returns false if conntrack is not loaded, allowing it to be used without incurring a module dependency on conntrack. This is used by the NAT mode in sch_cake. Cc: netfilter-devel@vger.kernel.org Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: David S. Miller --- include/linux/netfilter.h | 11 +++++++++++ net/netfilter/core.c | 15 +++++++++++++++ net/netfilter/nf_conntrack_core.c | 36 ++++++++++++++++++++++++++++++++++++ 3 files changed, 62 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 23b48de8c2e2..07efffd0c759 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -414,8 +414,17 @@ nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, u_int8_t family) extern void (*ip_ct_attach)(struct sk_buff *, const struct sk_buff *) __rcu; void nf_ct_attach(struct sk_buff *, const struct sk_buff *); +struct nf_conntrack_tuple; +bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple, + const struct sk_buff *skb); #else static inline void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) {} +struct nf_conntrack_tuple; +static inline bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple, + const struct sk_buff *skb) +{ + return false; +} #endif struct nf_conn; @@ -424,6 +433,8 @@ enum ip_conntrack_info; struct nf_ct_hook { int (*update)(struct net *net, struct sk_buff *skb); void (*destroy)(struct nf_conntrack *); + bool (*get_tuple_skb)(struct nf_conntrack_tuple *, + const struct sk_buff *); }; extern struct nf_ct_hook __rcu *nf_ct_hook; diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 168af54db975..dc240cb47ddf 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -603,6 +603,21 @@ void nf_conntrack_destroy(struct nf_conntrack *nfct) } EXPORT_SYMBOL(nf_conntrack_destroy); +bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple, + const struct sk_buff *skb) +{ + struct nf_ct_hook *ct_hook; + bool ret = false; + + rcu_read_lock(); + ct_hook = rcu_dereference(nf_ct_hook); + if (ct_hook) + ret = ct_hook->get_tuple_skb(dst_tuple, skb); + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL(nf_ct_get_tuple_skb); + /* Built-in default zone used e.g. by modules. */ const struct nf_conntrack_zone nf_ct_zone_dflt = { .id = NF_CT_DEFAULT_ZONE_ID, diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 3465da2a98bd..85ab2fd6a665 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1683,6 +1683,41 @@ static int nf_conntrack_update(struct net *net, struct sk_buff *skb) return 0; } +static bool nf_conntrack_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple, + const struct sk_buff *skb) +{ + const struct nf_conntrack_tuple *src_tuple; + const struct nf_conntrack_tuple_hash *hash; + struct nf_conntrack_tuple srctuple; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + + ct = nf_ct_get(skb, &ctinfo); + if (ct) { + src_tuple = nf_ct_tuple(ct, CTINFO2DIR(ctinfo)); + memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple)); + return true; + } + + if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), + NFPROTO_IPV4, dev_net(skb->dev), + &srctuple)) + return false; + + hash = nf_conntrack_find_get(dev_net(skb->dev), + &nf_ct_zone_dflt, + &srctuple); + if (!hash) + return false; + + ct = nf_ct_tuplehash_to_ctrack(hash); + src_tuple = nf_ct_tuple(ct, !hash->tuple.dst.dir); + memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple)); + nf_ct_put(ct); + + return true; +} + /* Bring out ya dead! */ static struct nf_conn * get_next_corpse(int (*iter)(struct nf_conn *i, void *data), @@ -2204,6 +2239,7 @@ err_cachep: static struct nf_ct_hook nf_conntrack_hook = { .update = nf_conntrack_update, .destroy = destroy_conntrack, + .get_tuple_skb = nf_conntrack_get_tuple_skb, }; void nf_conntrack_init_end(void) -- cgit v1.2.3-59-g8ed1b From 3443b00e07eed5798605ba6524de37e1d3f1a4bf Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Tue, 10 Jul 2018 10:02:57 +0300 Subject: team: Publish team_port_get_rcu() A follow-up patch adds a new entry point, team_port_dev_txable(). Making it an ordinary exported function would mean that any module that may need the service in one of the supported configurations also unconditionally needs to pull in the team module, whether or not the user actually intends to create team interfaces. To prevent that, team_port_dev_txable() is defined in if_team.h, and therefore all dependencies of that function also need to be publicly-visible. Therefore move team_port_get_rcu() from team.c to if_team.h. Signed-off-by: Petr Machata Reviewed-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/team/team.c | 5 ----- include/linux/if_team.h | 5 +++++ 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index 3a95eaae0c98..6a047d30e8c6 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -41,11 +41,6 @@ #define team_port_exists(dev) (dev->priv_flags & IFF_TEAM_PORT) -static struct team_port *team_port_get_rcu(const struct net_device *dev) -{ - return rcu_dereference(dev->rx_handler_data); -} - static struct team_port *team_port_get_rtnl(const struct net_device *dev) { struct team_port *port = rtnl_dereference(dev->rx_handler_data); diff --git a/include/linux/if_team.h b/include/linux/if_team.h index d95cae09dea0..0d07c6655cce 100644 --- a/include/linux/if_team.h +++ b/include/linux/if_team.h @@ -74,6 +74,11 @@ struct team_port { long mode_priv[0]; }; +static inline struct team_port *team_port_get_rcu(const struct net_device *dev) +{ + return rcu_dereference(dev->rx_handler_data); +} + static inline bool team_port_enabled(struct team_port *port) { return port->index != -1; -- cgit v1.2.3-59-g8ed1b From eeed992b776c54af6108187c87ac60d028e69d37 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Tue, 10 Jul 2018 10:02:58 +0300 Subject: net: Add lag.h, net_lag_port_dev_txable() LAG devices (team or bond) recognize for each one of their slave devices whether LAG traffic is going to be sent through that device. Bond calls such devices "active", team calls them "txable". When this state changes, a NETDEV_CHANGELOWERSTATE notification is distributed, together with a netdev_notifier_changelowerstate_info structure that for LAG devices includes a tx_enabled flag that refers to the new state. The notification thus makes it possible to react to the changes in txability in drivers. However there's no way to query txability from the outside on demand. That is problematic namely for mlxsw, which when resolving ERSPAN packet path, may encounter a LAG device, and needs to determine which of the slaves it should choose. To that end, introduce a new function, net_lag_port_dev_txable(), which determines whether a given slave device is "active" or "txable" (depending on the flavor of the LAG device). That function then dispatches to per-LAG-flavor helpers, bond_is_active_slave_dev() resp. team_port_dev_txable(). Because there currently is no good place where net_lag_port_dev_txable() should be added, introduce a new header file, lag.h, which should from now on hold any logic common to both team and bond. (But keep netif_is_lag_master() together with the rest of netif_is_*_master() functions). Signed-off-by: Petr Machata Reviewed-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- include/linux/if_team.h | 13 +++++++++++++ include/net/bonding.h | 13 +++++++++++++ include/net/lag.h | 17 +++++++++++++++++ 3 files changed, 43 insertions(+) create mode 100644 include/net/lag.h (limited to 'include/linux') diff --git a/include/linux/if_team.h b/include/linux/if_team.h index 0d07c6655cce..ac42da56f7a2 100644 --- a/include/linux/if_team.h +++ b/include/linux/if_team.h @@ -89,6 +89,19 @@ static inline bool team_port_txable(struct team_port *port) return port->linkup && team_port_enabled(port); } +static inline bool team_port_dev_txable(const struct net_device *port_dev) +{ + struct team_port *port; + bool txable; + + rcu_read_lock(); + port = team_port_get_rcu(port_dev); + txable = port ? team_port_txable(port) : false; + rcu_read_unlock(); + + return txable; +} + #ifdef CONFIG_NET_POLL_CONTROLLER static inline void team_netpoll_send_skb(struct team_port *port, struct sk_buff *skb) diff --git a/include/net/bonding.h b/include/net/bonding.h index 808f1d167349..a2d058170ea3 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -411,6 +411,19 @@ static inline bool bond_slave_can_tx(struct slave *slave) bond_is_active_slave(slave); } +static inline bool bond_is_active_slave_dev(const struct net_device *slave_dev) +{ + struct slave *slave; + bool active; + + rcu_read_lock(); + slave = bond_slave_get_rcu(slave_dev); + active = bond_is_active_slave(slave); + rcu_read_unlock(); + + return active; +} + static inline void bond_hw_addr_copy(u8 *dst, const u8 *src, unsigned int len) { if (len == ETH_ALEN) { diff --git a/include/net/lag.h b/include/net/lag.h new file mode 100644 index 000000000000..95b880e6fdde --- /dev/null +++ b/include/net/lag.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_IF_LAG_H +#define _LINUX_IF_LAG_H + +#include +#include +#include + +static inline bool net_lag_port_dev_txable(const struct net_device *port_dev) +{ + if (netif_is_team_port(port_dev)) + return team_port_dev_txable(port_dev); + else + return bond_is_active_slave_dev(port_dev); +} + +#endif /* _LINUX_IF_LAG_H */ -- cgit v1.2.3-59-g8ed1b From cca9bab1b72cd2296097c75f59ef11ef80461279 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 11 Jul 2018 12:16:12 +0200 Subject: tcp: use monotonic timestamps for PAWS Using get_seconds() for timestamps is deprecated since it can lead to overflows on 32-bit systems. While the interface generally doesn't overflow until year 2106, the specific implementation of the TCP PAWS algorithm breaks in 2038 when the intermediate signed 32-bit timestamps overflow. A related problem is that the local timestamps in CLOCK_REALTIME form lead to unexpected behavior when settimeofday is called to set the system clock backwards or forwards by more than 24 days. While the first problem could be solved by using an overflow-safe method of comparing the timestamps, a nicer solution is to use a monotonic clocksource with ktime_get_seconds() that simply doesn't overflow (at least not until 136 years after boot) and that doesn't change during settimeofday(). To make 32-bit and 64-bit architectures behave the same way here, and also save a few bytes in the tcp_options_received structure, I'm changing the type to a 32-bit integer, which is now safe on all architectures. Finally, the ts_recent_stamp field also (confusingly) gets used to store a jiffies value in tcp_synq_overflow()/tcp_synq_no_recent_overflow(). This is currently safe, but changing the type to 32-bit requires some small changes there to keep it working. Signed-off-by: Arnd Bergmann Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/crypto/chelsio/chtls/chtls_cm.c | 2 +- include/linux/tcp.h | 4 ++-- include/net/tcp.h | 17 ++++++++++------- net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_ipv4.c | 3 ++- net/ipv4/tcp_minisocks.c | 8 ++++---- 6 files changed, 20 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/crypto/chelsio/chtls/chtls_cm.c b/drivers/crypto/chelsio/chtls/chtls_cm.c index 2bb6f0380758..0997e166ea57 100644 --- a/drivers/crypto/chelsio/chtls/chtls_cm.c +++ b/drivers/crypto/chelsio/chtls/chtls_cm.c @@ -1673,7 +1673,7 @@ static void chtls_timewait(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); tp->rcv_nxt++; - tp->rx_opt.ts_recent_stamp = get_seconds(); + tp->rx_opt.ts_recent_stamp = ktime_get_seconds(); tp->srtt_us = 0; tcp_time_wait(sk, TCP_TIME_WAIT, 0); } diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 3dbea6610304..58a8d7d71354 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -89,7 +89,7 @@ struct tcp_sack_block { struct tcp_options_received { /* PAWS/RTTM data */ - long ts_recent_stamp;/* Time we stored ts_recent (for aging) */ + int ts_recent_stamp;/* Time we stored ts_recent (for aging) */ u32 ts_recent; /* Time stamp to echo next */ u32 rcv_tsval; /* Time stamp value */ u32 rcv_tsecr; /* Time stamp echo reply */ @@ -426,7 +426,7 @@ struct tcp_timewait_sock { /* The time we sent the last out-of-window ACK: */ u32 tw_last_oow_ack_time; - long tw_ts_recent_stamp; + int tw_ts_recent_stamp; #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *tw_md5_key; #endif diff --git a/include/net/tcp.h b/include/net/tcp.h index f6cb20e6e524..582304955087 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -472,19 +472,20 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb); */ static inline void tcp_synq_overflow(const struct sock *sk) { - unsigned long last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp; - unsigned long now = jiffies; + unsigned int last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp; + unsigned int now = jiffies; - if (time_after(now, last_overflow + HZ)) + if (time_after32(now, last_overflow + HZ)) tcp_sk(sk)->rx_opt.ts_recent_stamp = now; } /* syncookies: no recent synqueue overflow on this listening socket? */ static inline bool tcp_synq_no_recent_overflow(const struct sock *sk) { - unsigned long last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp; + unsigned int last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp; + unsigned int now = jiffies; - return time_after(jiffies, last_overflow + TCP_SYNCOOKIE_VALID); + return time_after32(now, last_overflow + TCP_SYNCOOKIE_VALID); } static inline u32 tcp_cookie_time(void) @@ -1375,7 +1376,8 @@ static inline bool tcp_paws_check(const struct tcp_options_received *rx_opt, { if ((s32)(rx_opt->ts_recent - rx_opt->rcv_tsval) <= paws_win) return true; - if (unlikely(get_seconds() >= rx_opt->ts_recent_stamp + TCP_PAWS_24DAYS)) + if (unlikely(!time_before32(ktime_get_seconds(), + rx_opt->ts_recent_stamp + TCP_PAWS_24DAYS))) return true; /* * Some OSes send SYN and SYNACK messages with tsval=0 tsecr=0, @@ -1405,7 +1407,8 @@ static inline bool tcp_paws_reject(const struct tcp_options_received *rx_opt, However, we can relax time bounds for RST segments to MSL. */ - if (rst && get_seconds() >= rx_opt->ts_recent_stamp + TCP_PAWS_MSL) + if (rst && !time_before32(ktime_get_seconds(), + rx_opt->ts_recent_stamp + TCP_PAWS_MSL)) return false; return true; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 814ea43dd12f..d3b6390ecf23 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3462,7 +3462,7 @@ static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb) static void tcp_store_ts_recent(struct tcp_sock *tp) { tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval; - tp->rx_opt.ts_recent_stamp = get_seconds(); + tp->rx_opt.ts_recent_stamp = ktime_get_seconds(); } static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index bea17f1e8302..dc415c66a33a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -155,7 +155,8 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) and use initial timestamp retrieved from peer table. */ if (tcptw->tw_ts_recent_stamp && - (!twp || (reuse && get_seconds() - tcptw->tw_ts_recent_stamp > 1))) { + (!twp || (reuse && time_after32(ktime_get_seconds(), + tcptw->tw_ts_recent_stamp)))) { tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2; if (tp->write_seq == 0) tp->write_seq = 1; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index dac5893a52b4..75ef332a7caf 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -144,7 +144,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, tw->tw_substate = TCP_TIME_WAIT; tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq; if (tmp_opt.saw_tstamp) { - tcptw->tw_ts_recent_stamp = get_seconds(); + tcptw->tw_ts_recent_stamp = ktime_get_seconds(); tcptw->tw_ts_recent = tmp_opt.rcv_tsval; } @@ -189,7 +189,7 @@ kill: if (tmp_opt.saw_tstamp) { tcptw->tw_ts_recent = tmp_opt.rcv_tsval; - tcptw->tw_ts_recent_stamp = get_seconds(); + tcptw->tw_ts_recent_stamp = ktime_get_seconds(); } inet_twsk_put(tw); @@ -537,7 +537,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, if (newtp->rx_opt.tstamp_ok) { newtp->rx_opt.ts_recent = req->ts_recent; - newtp->rx_opt.ts_recent_stamp = get_seconds(); + newtp->rx_opt.ts_recent_stamp = ktime_get_seconds(); newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; } else { newtp->rx_opt.ts_recent_stamp = 0; @@ -603,7 +603,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, * it can be estimated (approximately) * from another data. */ - tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<num_timeout); + tmp_opt.ts_recent_stamp = ktime_get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<num_timeout); paws_reject = tcp_paws_reject(&tmp_opt, th->rst); } } -- cgit v1.2.3-59-g8ed1b From 523f9eb1ef25aab4aaf9aeb5356160e8039411ef Mon Sep 17 00:00:00 2001 From: Alex Vesker Date: Thu, 12 Jul 2018 15:13:15 +0300 Subject: net/mlx4_core: Add health buffer address capability Health buffer address is a 32 bit PCI address offset provided by the FW. This offset is used for reading FW health debug data located on the shared CR space. Cr space is accessible in both driver and FW and allows for different queries and configurations. Health buffer size is always 64B of readable data followed by a lock which is used to block volatile CR space access. Signed-off-by: Alex Vesker Signed-off-by: Tariq Toukan Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/fw.c | 5 ++++- drivers/net/ethernet/mellanox/mlx4/fw.h | 1 + drivers/net/ethernet/mellanox/mlx4/main.c | 1 + include/linux/mlx4/device.h | 1 + 4 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c index 46dcbfbe4c5e..babcfd9c0571 100644 --- a/drivers/net/ethernet/mellanox/mlx4/fw.c +++ b/drivers/net/ethernet/mellanox/mlx4/fw.c @@ -825,7 +825,7 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) #define QUERY_DEV_CAP_QP_RATE_LIMIT_NUM_OFFSET 0xcc #define QUERY_DEV_CAP_QP_RATE_LIMIT_MAX_OFFSET 0xd0 #define QUERY_DEV_CAP_QP_RATE_LIMIT_MIN_OFFSET 0xd2 - +#define QUERY_DEV_CAP_HEALTH_BUFFER_ADDRESS_OFFSET 0xe4 dev_cap->flags2 = 0; mailbox = mlx4_alloc_cmd_mailbox(dev); @@ -1082,6 +1082,9 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) dev_cap->rl_caps.min_unit = size >> 14; } + MLX4_GET(dev_cap->health_buffer_addrs, outbox, + QUERY_DEV_CAP_HEALTH_BUFFER_ADDRESS_OFFSET); + MLX4_GET(field32, outbox, QUERY_DEV_CAP_EXT_2_FLAGS_OFFSET); if (field32 & (1 << 16)) dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_UPDATE_QP; diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.h b/drivers/net/ethernet/mellanox/mlx4/fw.h index cd6399c76bfd..650ae08c71de 100644 --- a/drivers/net/ethernet/mellanox/mlx4/fw.h +++ b/drivers/net/ethernet/mellanox/mlx4/fw.h @@ -128,6 +128,7 @@ struct mlx4_dev_cap { u32 dmfs_high_rate_qpn_base; u32 dmfs_high_rate_qpn_range; struct mlx4_rate_limit_caps rl_caps; + u32 health_buffer_addrs; struct mlx4_port_cap port_cap[MLX4_MAX_PORTS + 1]; bool wol_port[MLX4_MAX_PORTS + 1]; }; diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index c42eddfcd2b5..806d441b3701 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -523,6 +523,7 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) dev->caps.max_rss_tbl_sz = dev_cap->max_rss_tbl_sz; dev->caps.wol_port[1] = dev_cap->wol_port[1]; dev->caps.wol_port[2] = dev_cap->wol_port[2]; + dev->caps.health_buffer_addrs = dev_cap->health_buffer_addrs; /* Save uar page shift */ if (!mlx4_is_slave(dev)) { diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 122e7e9d3091..e3bfe76aea98 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -630,6 +630,7 @@ struct mlx4_caps { u32 vf_caps; bool wol_port[MLX4_MAX_PORTS + 1]; struct mlx4_rate_limit_caps rl_caps; + u32 health_buffer_addrs; }; struct mlx4_buf_list { -- cgit v1.2.3-59-g8ed1b From bedc989b0c98285b277ff8a08ff9514e580913f4 Mon Sep 17 00:00:00 2001 From: Alex Vesker Date: Thu, 12 Jul 2018 15:13:16 +0300 Subject: net/mlx4_core: Add Crdump FW snapshot support Crdump allows the driver to create a snapshot of the FW PCI crspace and health buffer during a critical FW issue. In case of a FW command timeout, FW getting stuck or a non zero value on the catastrophic buffer, a snapshot will be taken. The snapshot is exposed using devlink, cr-space, fw-health address regions are registered on init and snapshots are attached once a new snapshot is collected by the driver. Signed-off-by: Alex Vesker Signed-off-by: Tariq Toukan Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/Makefile | 2 +- drivers/net/ethernet/mellanox/mlx4/catas.c | 6 +- drivers/net/ethernet/mellanox/mlx4/crdump.c | 231 ++++++++++++++++++++++++++++ drivers/net/ethernet/mellanox/mlx4/main.c | 10 +- drivers/net/ethernet/mellanox/mlx4/mlx4.h | 4 + include/linux/mlx4/device.h | 6 + 6 files changed, 255 insertions(+), 4 deletions(-) create mode 100644 drivers/net/ethernet/mellanox/mlx4/crdump.c (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx4/Makefile b/drivers/net/ethernet/mellanox/mlx4/Makefile index 16b10d01fcf4..3f400770fcd8 100644 --- a/drivers/net/ethernet/mellanox/mlx4/Makefile +++ b/drivers/net/ethernet/mellanox/mlx4/Makefile @@ -3,7 +3,7 @@ obj-$(CONFIG_MLX4_CORE) += mlx4_core.o mlx4_core-y := alloc.o catas.o cmd.o cq.o eq.o fw.o fw_qos.o icm.o intf.o \ main.o mcg.o mr.o pd.o port.o profile.o qp.o reset.o sense.o \ - srq.o resource_tracker.o + srq.o resource_tracker.o crdump.o obj-$(CONFIG_MLX4_EN) += mlx4_en.o diff --git a/drivers/net/ethernet/mellanox/mlx4/catas.c b/drivers/net/ethernet/mellanox/mlx4/catas.c index 8afe4b5fb09b..c81d15bf259c 100644 --- a/drivers/net/ethernet/mellanox/mlx4/catas.c +++ b/drivers/net/ethernet/mellanox/mlx4/catas.c @@ -178,10 +178,12 @@ void mlx4_enter_error_state(struct mlx4_dev_persistent *persist) dev = persist->dev; mlx4_err(dev, "device is going to be reset\n"); - if (mlx4_is_slave(dev)) + if (mlx4_is_slave(dev)) { err = mlx4_reset_slave(dev); - else + } else { + mlx4_crdump_collect(dev); err = mlx4_reset_master(dev); + } if (!err) { mlx4_err(dev, "device was reset successfully\n"); diff --git a/drivers/net/ethernet/mellanox/mlx4/crdump.c b/drivers/net/ethernet/mellanox/mlx4/crdump.c new file mode 100644 index 000000000000..4d5524dffec4 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4/crdump.c @@ -0,0 +1,231 @@ +/* + * Copyright (c) 2018, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "mlx4.h" + +#define BAD_ACCESS 0xBADACCE5 +#define HEALTH_BUFFER_SIZE 0x40 +#define CR_ENABLE_BIT swab32(BIT(6)) +#define CR_ENABLE_BIT_OFFSET 0xF3F04 +#define MAX_NUM_OF_DUMPS_TO_STORE (8) + +static const char *region_cr_space_str = "cr-space"; +static const char *region_fw_health_str = "fw-health"; + +/* Set to true in case cr enable bit was set to true before crdump */ +static bool crdump_enbale_bit_set; + +static void crdump_enable_crspace_access(struct mlx4_dev *dev, + u8 __iomem *cr_space) +{ + /* Get current enable bit value */ + crdump_enbale_bit_set = + readl(cr_space + CR_ENABLE_BIT_OFFSET) & CR_ENABLE_BIT; + + /* Enable FW CR filter (set bit6 to 0) */ + if (crdump_enbale_bit_set) + writel(readl(cr_space + CR_ENABLE_BIT_OFFSET) & ~CR_ENABLE_BIT, + cr_space + CR_ENABLE_BIT_OFFSET); + + /* Enable block volatile crspace accesses */ + writel(swab32(1), cr_space + dev->caps.health_buffer_addrs + + HEALTH_BUFFER_SIZE); +} + +static void crdump_disable_crspace_access(struct mlx4_dev *dev, + u8 __iomem *cr_space) +{ + /* Disable block volatile crspace accesses */ + writel(0, cr_space + dev->caps.health_buffer_addrs + + HEALTH_BUFFER_SIZE); + + /* Restore FW CR filter value (set bit6 to original value) */ + if (crdump_enbale_bit_set) + writel(readl(cr_space + CR_ENABLE_BIT_OFFSET) | CR_ENABLE_BIT, + cr_space + CR_ENABLE_BIT_OFFSET); +} + +static void mlx4_crdump_collect_crspace(struct mlx4_dev *dev, + u8 __iomem *cr_space, + u32 id) +{ + struct mlx4_fw_crdump *crdump = &dev->persist->crdump; + struct pci_dev *pdev = dev->persist->pdev; + unsigned long cr_res_size; + u8 *crspace_data; + int offset; + int err; + + if (!crdump->region_crspace) { + mlx4_err(dev, "crdump: cr-space region is NULL\n"); + return; + } + + /* Try to collect CR space */ + cr_res_size = pci_resource_len(pdev, 0); + crspace_data = kvmalloc(cr_res_size, GFP_KERNEL); + if (crspace_data) { + for (offset = 0; offset < cr_res_size; offset += 4) + *(u32 *)(crspace_data + offset) = + readl(cr_space + offset); + + err = devlink_region_snapshot_create(crdump->region_crspace, + cr_res_size, crspace_data, + id, &kvfree); + if (err) { + kvfree(crspace_data); + mlx4_warn(dev, "crdump: devlink create %s snapshot id %d err %d\n", + region_cr_space_str, id, err); + } else { + mlx4_info(dev, "crdump: added snapshot %d to devlink region %s\n", + id, region_cr_space_str); + } + } else { + mlx4_err(dev, "crdump: Failed to allocate crspace buffer\n"); + } +} + +static void mlx4_crdump_collect_fw_health(struct mlx4_dev *dev, + u8 __iomem *cr_space, + u32 id) +{ + struct mlx4_fw_crdump *crdump = &dev->persist->crdump; + u8 *health_data; + int offset; + int err; + + if (!crdump->region_fw_health) { + mlx4_err(dev, "crdump: fw-health region is NULL\n"); + return; + } + + /* Try to collect health buffer */ + health_data = kvmalloc(HEALTH_BUFFER_SIZE, GFP_KERNEL); + if (health_data) { + u8 __iomem *health_buf_start = + cr_space + dev->caps.health_buffer_addrs; + + for (offset = 0; offset < HEALTH_BUFFER_SIZE; offset += 4) + *(u32 *)(health_data + offset) = + readl(health_buf_start + offset); + + err = devlink_region_snapshot_create(crdump->region_fw_health, + HEALTH_BUFFER_SIZE, + health_data, + id, &kvfree); + if (err) { + kvfree(health_data); + mlx4_warn(dev, "crdump: devlink create %s snapshot id %d err %d\n", + region_fw_health_str, id, err); + } else { + mlx4_info(dev, "crdump: added snapshot %d to devlink region %s\n", + id, region_fw_health_str); + } + } else { + mlx4_err(dev, "crdump: Failed to allocate health buffer\n"); + } +} + +int mlx4_crdump_collect(struct mlx4_dev *dev) +{ + struct devlink *devlink = priv_to_devlink(mlx4_priv(dev)); + struct pci_dev *pdev = dev->persist->pdev; + unsigned long cr_res_size; + u8 __iomem *cr_space; + u32 id; + + if (!dev->caps.health_buffer_addrs) { + mlx4_info(dev, "crdump: FW doesn't support health buffer access, skipping\n"); + return 0; + } + + cr_res_size = pci_resource_len(pdev, 0); + + cr_space = ioremap(pci_resource_start(pdev, 0), cr_res_size); + if (!cr_space) { + mlx4_err(dev, "crdump: Failed to map pci cr region\n"); + return -ENODEV; + } + + crdump_enable_crspace_access(dev, cr_space); + + /* Get the available snapshot ID for the dumps */ + id = devlink_region_shapshot_id_get(devlink); + + /* Try to capture dumps */ + mlx4_crdump_collect_crspace(dev, cr_space, id); + mlx4_crdump_collect_fw_health(dev, cr_space, id); + + crdump_disable_crspace_access(dev, cr_space); + + iounmap(cr_space); + return 0; +} + +int mlx4_crdump_init(struct mlx4_dev *dev) +{ + struct devlink *devlink = priv_to_devlink(mlx4_priv(dev)); + struct mlx4_fw_crdump *crdump = &dev->persist->crdump; + struct pci_dev *pdev = dev->persist->pdev; + + /* Create cr-space region */ + crdump->region_crspace = + devlink_region_create(devlink, + region_cr_space_str, + MAX_NUM_OF_DUMPS_TO_STORE, + pci_resource_len(pdev, 0)); + if (IS_ERR(crdump->region_crspace)) + mlx4_warn(dev, "crdump: create devlink region %s err %ld\n", + region_cr_space_str, + PTR_ERR(crdump->region_crspace)); + + /* Create fw-health region */ + crdump->region_fw_health = + devlink_region_create(devlink, + region_fw_health_str, + MAX_NUM_OF_DUMPS_TO_STORE, + HEALTH_BUFFER_SIZE); + if (IS_ERR(crdump->region_fw_health)) + mlx4_warn(dev, "crdump: create devlink region %s err %ld\n", + region_fw_health_str, + PTR_ERR(crdump->region_fw_health)); + + return 0; +} + +void mlx4_crdump_end(struct mlx4_dev *dev) +{ + struct mlx4_fw_crdump *crdump = &dev->persist->crdump; + + devlink_region_destroy(crdump->region_fw_health); + devlink_region_destroy(crdump->region_crspace); +} diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index 806d441b3701..46b021409b8b 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -3807,10 +3807,14 @@ static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data, } } - err = mlx4_catas_init(&priv->dev); + err = mlx4_crdump_init(&priv->dev); if (err) goto err_release_regions; + err = mlx4_catas_init(&priv->dev); + if (err) + goto err_crdump; + err = mlx4_load_one(pdev, pci_dev_data, total_vfs, nvfs, priv, 0); if (err) goto err_catas; @@ -3820,6 +3824,9 @@ static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data, err_catas: mlx4_catas_end(&priv->dev); +err_crdump: + mlx4_crdump_end(&priv->dev); + err_release_regions: pci_release_regions(pdev); @@ -4081,6 +4088,7 @@ static void mlx4_remove_one(struct pci_dev *pdev) else mlx4_info(dev, "%s: interface is down\n", __func__); mlx4_catas_end(dev); + mlx4_crdump_end(dev); if (dev->flags & MLX4_FLAG_SRIOV && !active_vfs) { mlx4_warn(dev, "Disabling SR-IOV\n"); pci_disable_sriov(pdev); diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h index 2ebaa3bee42f..6e016092a6f1 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h @@ -1042,6 +1042,8 @@ void mlx4_start_catas_poll(struct mlx4_dev *dev); void mlx4_stop_catas_poll(struct mlx4_dev *dev); int mlx4_catas_init(struct mlx4_dev *dev); void mlx4_catas_end(struct mlx4_dev *dev); +int mlx4_crdump_init(struct mlx4_dev *dev); +void mlx4_crdump_end(struct mlx4_dev *dev); int mlx4_restart_one(struct pci_dev *pdev, bool reload, struct devlink *devlink); int mlx4_register_device(struct mlx4_dev *dev); @@ -1228,6 +1230,8 @@ void mlx4_srq_event(struct mlx4_dev *dev, u32 srqn, int event_type); void mlx4_enter_error_state(struct mlx4_dev_persistent *persist); int mlx4_comm_internal_err(u32 slave_read); +int mlx4_crdump_collect(struct mlx4_dev *dev); + int mlx4_SENSE_PORT(struct mlx4_dev *dev, int port, enum mlx4_port_type *type); void mlx4_do_sense_ports(struct mlx4_dev *dev, diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index e3bfe76aea98..300b944e6e1e 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -852,6 +852,11 @@ struct mlx4_vf_dev { u8 n_ports; }; +struct mlx4_fw_crdump { + struct devlink_region *region_crspace; + struct devlink_region *region_fw_health; +}; + enum mlx4_pci_status { MLX4_PCI_STATUS_DISABLED, MLX4_PCI_STATUS_ENABLED, @@ -872,6 +877,7 @@ struct mlx4_dev_persistent { u8 interface_state; struct mutex pci_status_mutex; /* sync pci state */ enum mlx4_pci_status pci_status; + struct mlx4_fw_crdump crdump; }; struct mlx4_dev { -- cgit v1.2.3-59-g8ed1b From 3c641ba4a852cf4e90e3d7f29c5df08e24213c5d Mon Sep 17 00:00:00 2001 From: Alex Vesker Date: Thu, 12 Jul 2018 15:13:18 +0300 Subject: net/mlx4_core: Use devlink region_snapshot parameter This parameter enables capturing region snapshot of the crspace during critical errors. The default value of this parameter is disabled, it can be enabled using devlink param commands. It is possible to configure during runtime and also driver init. Signed-off-by: Alex Vesker Signed-off-by: Jiri Pirko Reviewed-by: Moshe Shemesh Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/crdump.c | 8 ++++++ drivers/net/ethernet/mellanox/mlx4/main.c | 41 +++++++++++++++++++++++++++++ include/linux/mlx4/device.h | 1 + 3 files changed, 50 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx4/crdump.c b/drivers/net/ethernet/mellanox/mlx4/crdump.c index 4d5524dffec4..88316c743820 100644 --- a/drivers/net/ethernet/mellanox/mlx4/crdump.c +++ b/drivers/net/ethernet/mellanox/mlx4/crdump.c @@ -158,6 +158,7 @@ static void mlx4_crdump_collect_fw_health(struct mlx4_dev *dev, int mlx4_crdump_collect(struct mlx4_dev *dev) { struct devlink *devlink = priv_to_devlink(mlx4_priv(dev)); + struct mlx4_fw_crdump *crdump = &dev->persist->crdump; struct pci_dev *pdev = dev->persist->pdev; unsigned long cr_res_size; u8 __iomem *cr_space; @@ -168,6 +169,11 @@ int mlx4_crdump_collect(struct mlx4_dev *dev) return 0; } + if (!crdump->snapshot_enable) { + mlx4_info(dev, "crdump: devlink snapshot disabled, skipping\n"); + return 0; + } + cr_res_size = pci_resource_len(pdev, 0); cr_space = ioremap(pci_resource_start(pdev, 0), cr_res_size); @@ -197,6 +203,8 @@ int mlx4_crdump_init(struct mlx4_dev *dev) struct mlx4_fw_crdump *crdump = &dev->persist->crdump; struct pci_dev *pdev = dev->persist->pdev; + crdump->snapshot_enable = false; + /* Create cr-space region */ crdump->region_crspace = devlink_region_create(devlink, diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index 46b021409b8b..2d979a652b7b 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -191,6 +191,26 @@ static int mlx4_devlink_ierr_reset_set(struct devlink *devlink, u32 id, return 0; } +static int mlx4_devlink_crdump_snapshot_get(struct devlink *devlink, u32 id, + struct devlink_param_gset_ctx *ctx) +{ + struct mlx4_priv *priv = devlink_priv(devlink); + struct mlx4_dev *dev = &priv->dev; + + ctx->val.vbool = dev->persist->crdump.snapshot_enable; + return 0; +} + +static int mlx4_devlink_crdump_snapshot_set(struct devlink *devlink, u32 id, + struct devlink_param_gset_ctx *ctx) +{ + struct mlx4_priv *priv = devlink_priv(devlink); + struct mlx4_dev *dev = &priv->dev; + + dev->persist->crdump.snapshot_enable = ctx->val.vbool; + return 0; +} + static int mlx4_devlink_max_macs_validate(struct devlink *devlink, u32 id, union devlink_param_value val, @@ -224,6 +244,11 @@ static const struct devlink_param mlx4_devlink_params[] = { DEVLINK_PARAM_GENERIC(MAX_MACS, BIT(DEVLINK_PARAM_CMODE_DRIVERINIT), NULL, NULL, mlx4_devlink_max_macs_validate), + DEVLINK_PARAM_GENERIC(REGION_SNAPSHOT, + BIT(DEVLINK_PARAM_CMODE_RUNTIME) | + BIT(DEVLINK_PARAM_CMODE_DRIVERINIT), + mlx4_devlink_crdump_snapshot_get, + mlx4_devlink_crdump_snapshot_set, NULL), DEVLINK_PARAM_DRIVER(MLX4_DEVLINK_PARAM_ID_ENABLE_64B_CQE_EQE, "enable_64b_cqe_eqe", DEVLINK_PARAM_TYPE_BOOL, BIT(DEVLINK_PARAM_CMODE_DRIVERINIT), @@ -270,6 +295,11 @@ static void mlx4_devlink_set_params_init_values(struct devlink *devlink) mlx4_devlink_set_init_value(devlink, MLX4_DEVLINK_PARAM_ID_ENABLE_4K_UAR, value); + + value.vbool = false; + mlx4_devlink_set_init_value(devlink, + DEVLINK_PARAM_GENERIC_ID_REGION_SNAPSHOT, + value); } static inline void mlx4_set_num_reserved_uars(struct mlx4_dev *dev, @@ -3862,6 +3892,9 @@ static int mlx4_devlink_port_type_set(struct devlink_port *devlink_port, static void mlx4_devlink_param_load_driverinit_values(struct devlink *devlink) { + struct mlx4_priv *priv = devlink_priv(devlink); + struct mlx4_dev *dev = &priv->dev; + struct mlx4_fw_crdump *crdump = &dev->persist->crdump; union devlink_param_value saved_value; int err; @@ -3889,6 +3922,14 @@ static void mlx4_devlink_param_load_driverinit_values(struct devlink *devlink) &saved_value); if (!err) enable_4k_uar = saved_value.vbool; + err = devlink_param_driverinit_value_get(devlink, + DEVLINK_PARAM_GENERIC_ID_REGION_SNAPSHOT, + &saved_value); + if (!err && crdump->snapshot_enable != saved_value.vbool) { + crdump->snapshot_enable = saved_value.vbool; + devlink_param_value_changed(devlink, + DEVLINK_PARAM_GENERIC_ID_REGION_SNAPSHOT); + } } static int mlx4_devlink_reload(struct devlink *devlink, diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 300b944e6e1e..dca6ab4eaa99 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -853,6 +853,7 @@ struct mlx4_vf_dev { }; struct mlx4_fw_crdump { + bool snapshot_enable; struct devlink_region *region_crspace; struct devlink_region *region_fw_health; }; -- cgit v1.2.3-59-g8ed1b From 6b8675897338f874c41612655a85d8e10cdb23d8 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 11 Jul 2018 20:36:39 -0700 Subject: xdp: don't make drivers report attachment mode prog_attached of struct netdev_bpf should have been superseded by simply setting prog_id long time ago, but we kept it around to allow offloading drivers to communicate attachment mode (drv vs hw). Subsequently drivers were also allowed to report back attachment flags (prog_flags), and since nowadays only programs attached will XDP_FLAGS_HW_MODE can get offloaded, we can tell the attachment mode from the flags driver reports. Remove prog_attached member. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c | 1 - drivers/net/ethernet/cavium/thunder/nicvf_main.c | 1 - drivers/net/ethernet/intel/i40e/i40e_main.c | 1 - drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 1 - drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 1 - drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 1 - drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 1 - drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 3 --- drivers/net/ethernet/qlogic/qede/qede_filter.c | 1 - drivers/net/netdevsim/bpf.c | 1 - drivers/net/tun.c | 1 - drivers/net/virtio_net.c | 1 - include/linux/netdevice.h | 5 ----- net/core/dev.c | 7 +++---- net/core/rtnetlink.c | 8 ++++++-- 15 files changed, 9 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c index 1f0e872d0667..0584d07c8c33 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c @@ -219,7 +219,6 @@ int bnxt_xdp(struct net_device *dev, struct netdev_bpf *xdp) rc = bnxt_xdp_set(bp, xdp->prog); break; case XDP_QUERY_PROG: - xdp->prog_attached = !!bp->xdp_prog; xdp->prog_id = bp->xdp_prog ? bp->xdp_prog->aux->id : 0; rc = 0; break; diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c index 135766c4296b..768f584f8392 100644 --- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c +++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c @@ -1848,7 +1848,6 @@ static int nicvf_xdp(struct net_device *netdev, struct netdev_bpf *xdp) case XDP_SETUP_PROG: return nicvf_xdp_setup(nic, xdp->prog); case XDP_QUERY_PROG: - xdp->prog_attached = !!nic->xdp_prog; xdp->prog_id = nic->xdp_prog ? nic->xdp_prog->aux->id : 0; return 0; default: diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 426b0ccb1fc6..51762428b40e 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -11841,7 +11841,6 @@ static int i40e_xdp(struct net_device *dev, case XDP_SETUP_PROG: return i40e_xdp_setup(vsi, xdp->prog); case XDP_QUERY_PROG: - xdp->prog_attached = i40e_enabled_xdp_vsi(vsi); xdp->prog_id = vsi->xdp_prog ? vsi->xdp_prog->aux->id : 0; return 0; default: diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index a8e21becb619..3862fea1c923 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -9966,7 +9966,6 @@ static int ixgbe_xdp(struct net_device *dev, struct netdev_bpf *xdp) case XDP_SETUP_PROG: return ixgbe_xdp_setup(dev, xdp->prog); case XDP_QUERY_PROG: - xdp->prog_attached = !!(adapter->xdp_prog); xdp->prog_id = adapter->xdp_prog ? adapter->xdp_prog->aux->id : 0; return 0; diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index 59416eddd840..d86446d202d5 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -4462,7 +4462,6 @@ static int ixgbevf_xdp(struct net_device *dev, struct netdev_bpf *xdp) case XDP_SETUP_PROG: return ixgbevf_xdp_setup(dev, xdp->prog); case XDP_QUERY_PROG: - xdp->prog_attached = !!(adapter->xdp_prog); xdp->prog_id = adapter->xdp_prog ? adapter->xdp_prog->aux->id : 0; return 0; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index 65eb06e017e4..6785661d1a72 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -2926,7 +2926,6 @@ static int mlx4_xdp(struct net_device *dev, struct netdev_bpf *xdp) return mlx4_xdp_set(dev, xdp->prog); case XDP_QUERY_PROG: xdp->prog_id = mlx4_xdp_query(dev); - xdp->prog_attached = !!xdp->prog_id; return 0; default: return -EINVAL; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index bbd2fd0b2e06..e4a9a0768a81 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -4192,7 +4192,6 @@ static int mlx5e_xdp(struct net_device *dev, struct netdev_bpf *xdp) return mlx5e_xdp_set(dev, xdp->prog); case XDP_QUERY_PROG: xdp->prog_id = mlx5e_xdp_query(dev); - xdp->prog_attached = !!xdp->prog_id; return 0; default: return -EINVAL; diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 7df5ca37bfb8..d20714598613 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -3459,9 +3459,6 @@ static int nfp_net_xdp(struct net_device *netdev, struct netdev_bpf *xdp) return nfp_net_xdp_setup(nn, xdp->prog, xdp->flags, xdp->extack); case XDP_QUERY_PROG: - xdp->prog_attached = !!nn->xdp_prog; - if (nn->dp.bpf_offload_xdp) - xdp->prog_attached = XDP_ATTACHED_HW; xdp->prog_id = nn->xdp_prog ? nn->xdp_prog->aux->id : 0; xdp->prog_flags = nn->xdp_prog ? nn->xdp_flags : 0; return 0; diff --git a/drivers/net/ethernet/qlogic/qede/qede_filter.c b/drivers/net/ethernet/qlogic/qede/qede_filter.c index b823bfe2ea4d..f9a327c821eb 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_filter.c +++ b/drivers/net/ethernet/qlogic/qede/qede_filter.c @@ -1116,7 +1116,6 @@ int qede_xdp(struct net_device *dev, struct netdev_bpf *xdp) case XDP_SETUP_PROG: return qede_xdp_set(edev, xdp->prog); case XDP_QUERY_PROG: - xdp->prog_attached = !!edev->xdp_prog; xdp->prog_id = edev->xdp_prog ? edev->xdp_prog->aux->id : 0; return 0; default: diff --git a/drivers/net/netdevsim/bpf.c b/drivers/net/netdevsim/bpf.c index 75c25306d234..712e6f918065 100644 --- a/drivers/net/netdevsim/bpf.c +++ b/drivers/net/netdevsim/bpf.c @@ -567,7 +567,6 @@ int nsim_bpf(struct net_device *dev, struct netdev_bpf *bpf) nsim_bpf_destroy_prog(bpf->offload.prog); return 0; case XDP_QUERY_PROG: - bpf->prog_attached = ns->xdp_prog_mode; bpf->prog_id = ns->xdp_prog ? ns->xdp_prog->aux->id : 0; bpf->prog_flags = ns->xdp_prog ? ns->xdp_flags : 0; return 0; diff --git a/drivers/net/tun.c b/drivers/net/tun.c index a192a017cc68..49a50219d0da 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1268,7 +1268,6 @@ static int tun_xdp(struct net_device *dev, struct netdev_bpf *xdp) return tun_xdp_set(dev, xdp->prog, xdp->extack); case XDP_QUERY_PROG: xdp->prog_id = tun_xdp_query(dev); - xdp->prog_attached = !!xdp->prog_id; return 0; default: return -EINVAL; diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 53085c63277b..2ff08bc103a9 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -2343,7 +2343,6 @@ static int virtnet_xdp(struct net_device *dev, struct netdev_bpf *xdp) return virtnet_xdp_set(dev, xdp->prog, xdp->extack); case XDP_QUERY_PROG: xdp->prog_id = virtnet_xdp_query(dev); - xdp->prog_attached = !!xdp->prog_id; return 0; default: return -EINVAL; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b683971e500d..69a664789b33 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -819,10 +819,6 @@ enum bpf_netdev_command { */ XDP_SETUP_PROG, XDP_SETUP_PROG_HW, - /* Check if a bpf program is set on the device. The callee should - * set @prog_attached to one of XDP_ATTACHED_* values, note that "true" - * is equivalent to XDP_ATTACHED_DRV. - */ XDP_QUERY_PROG, /* BPF program for offload callbacks, invoked at program load time. */ BPF_OFFLOAD_VERIFIER_PREP, @@ -849,7 +845,6 @@ struct netdev_bpf { }; /* XDP_QUERY_PROG */ struct { - u8 prog_attached; u32 prog_id; /* flags with which program was installed */ u32 prog_flags; diff --git a/net/core/dev.c b/net/core/dev.c index 89825c1eccdc..9fa3b3705a8e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4926,7 +4926,6 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) break; case XDP_QUERY_PROG: - xdp->prog_attached = !!old; xdp->prog_id = old ? old->aux->id : 0; break; @@ -7593,13 +7592,13 @@ void __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op, WARN_ON(bpf_op(dev, xdp) < 0); } -static u8 __dev_xdp_attached(struct net_device *dev, bpf_op_t bpf_op) +static bool __dev_xdp_attached(struct net_device *dev, bpf_op_t bpf_op) { struct netdev_bpf xdp; __dev_xdp_query(dev, bpf_op, &xdp); - return xdp.prog_attached; + return xdp.prog_id; } static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op, @@ -7634,7 +7633,7 @@ static void dev_xdp_uninstall(struct net_device *dev) return; __dev_xdp_query(dev, ndo_bpf, &xdp); - if (xdp.prog_attached == XDP_ATTACHED_NONE) + if (!xdp.prog_id) return; /* Program removal should always succeed */ diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index b40242459907..02ebc056a688 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1372,9 +1372,13 @@ static u8 rtnl_xdp_attached_mode(struct net_device *dev, u32 *prog_id) return XDP_ATTACHED_NONE; __dev_xdp_query(dev, ops->ndo_bpf, &xdp); - *prog_id = xdp.prog_id; + if (!xdp.prog_id) + return XDP_ATTACHED_NONE; - return xdp.prog_attached; + *prog_id = xdp.prog_id; + if (xdp.prog_flags & XDP_FLAGS_HW_MODE) + return XDP_ATTACHED_HW; + return XDP_ATTACHED_DRV; } static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev) -- cgit v1.2.3-59-g8ed1b From a25717d2b604347d9af8da81deea7b08e8c94220 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 11 Jul 2018 20:36:41 -0700 Subject: xdp: support simultaneous driver and hw XDP attachment Split the query of HW-attached program from the software one. Introduce new .ndo_bpf command to query HW-attached program. This will allow drivers to install different programs in HW and SW at the same time. Netlink can now also carry multiple programs on dump (in which case mode will be set to XDP_ATTACHED_MULTI and user has to check per-attachment point attributes, IFLA_XDP_PROG_ID will not be present). We reuse IFLA_XDP_PROG_ID skb space for second mode, so rtnl_xdp_size() doesn't need to be updated. Note that the installation side is still not there, since all drivers currently reject installing more than one program at the time. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- .../net/ethernet/netronome/nfp/nfp_net_common.c | 6 ++ drivers/net/netdevsim/bpf.c | 6 ++ include/linux/netdevice.h | 7 +- include/uapi/linux/if_link.h | 1 + net/core/dev.c | 45 ++++++----- net/core/rtnetlink.c | 93 ++++++++++++---------- 6 files changed, 96 insertions(+), 62 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 4bb589dbffbc..bb1e72e8dbc2 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -3453,6 +3453,12 @@ static int nfp_net_xdp(struct net_device *netdev, struct netdev_bpf *xdp) case XDP_SETUP_PROG_HW: return nfp_net_xdp_setup(nn, xdp); case XDP_QUERY_PROG: + if (nn->dp.bpf_offload_xdp) + return 0; + return xdp_attachment_query(&nn->xdp, xdp); + case XDP_QUERY_PROG_HW: + if (!nn->dp.bpf_offload_xdp) + return 0; return xdp_attachment_query(&nn->xdp, xdp); default: return nfp_app_bpf(nn->app, nn, xdp); diff --git a/drivers/net/netdevsim/bpf.c b/drivers/net/netdevsim/bpf.c index c485d97b5df4..5544c9b51173 100644 --- a/drivers/net/netdevsim/bpf.c +++ b/drivers/net/netdevsim/bpf.c @@ -561,6 +561,12 @@ int nsim_bpf(struct net_device *dev, struct netdev_bpf *bpf) nsim_bpf_destroy_prog(bpf->offload.prog); return 0; case XDP_QUERY_PROG: + if (ns->xdp_prog_mode != XDP_ATTACHED_DRV) + return 0; + return xdp_attachment_query(&ns->xdp, bpf); + case XDP_QUERY_PROG_HW: + if (ns->xdp_prog_mode != XDP_ATTACHED_HW) + return 0; return xdp_attachment_query(&ns->xdp, bpf); case XDP_SETUP_PROG: err = nsim_setup_prog_checks(ns, bpf); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 69a664789b33..2422c0e88f5c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -820,6 +820,7 @@ enum bpf_netdev_command { XDP_SETUP_PROG, XDP_SETUP_PROG_HW, XDP_QUERY_PROG, + XDP_QUERY_PROG_HW, /* BPF program for offload callbacks, invoked at program load time. */ BPF_OFFLOAD_VERIFIER_PREP, BPF_OFFLOAD_TRANSLATE, @@ -843,7 +844,7 @@ struct netdev_bpf { struct bpf_prog *prog; struct netlink_ext_ack *extack; }; - /* XDP_QUERY_PROG */ + /* XDP_QUERY_PROG, XDP_QUERY_PROG_HW */ struct { u32 prog_id; /* flags with which program was installed */ @@ -3533,8 +3534,8 @@ struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf); int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, int fd, u32 flags); -void __dev_xdp_query(struct net_device *dev, bpf_op_t xdp_op, - struct netdev_bpf *xdp); +u32 __dev_xdp_query(struct net_device *dev, bpf_op_t xdp_op, + enum bpf_netdev_command cmd); int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb); int dev_forward_skb(struct net_device *dev, struct sk_buff *skb); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index bc86c2b105ec..8759cfb8aa2e 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -920,6 +920,7 @@ enum { XDP_ATTACHED_DRV, XDP_ATTACHED_SKB, XDP_ATTACHED_HW, + XDP_ATTACHED_MULTI, }; enum { diff --git a/net/core/dev.c b/net/core/dev.c index 9fa3b3705a8e..993cdc3cd086 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7582,21 +7582,19 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down) } EXPORT_SYMBOL(dev_change_proto_down); -void __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op, - struct netdev_bpf *xdp) +u32 __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op, + enum bpf_netdev_command cmd) { - memset(xdp, 0, sizeof(*xdp)); - xdp->command = XDP_QUERY_PROG; + struct netdev_bpf xdp; - /* Query must always succeed. */ - WARN_ON(bpf_op(dev, xdp) < 0); -} + if (!bpf_op) + return 0; -static bool __dev_xdp_attached(struct net_device *dev, bpf_op_t bpf_op) -{ - struct netdev_bpf xdp; + memset(&xdp, 0, sizeof(xdp)); + xdp.command = cmd; - __dev_xdp_query(dev, bpf_op, &xdp); + /* Query must always succeed. */ + WARN_ON(bpf_op(dev, &xdp) < 0 && cmd == XDP_QUERY_PROG); return xdp.prog_id; } @@ -7632,12 +7630,19 @@ static void dev_xdp_uninstall(struct net_device *dev) if (!ndo_bpf) return; - __dev_xdp_query(dev, ndo_bpf, &xdp); - if (!xdp.prog_id) - return; + memset(&xdp, 0, sizeof(xdp)); + xdp.command = XDP_QUERY_PROG; + WARN_ON(ndo_bpf(dev, &xdp)); + if (xdp.prog_id) + WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags, + NULL)); - /* Program removal should always succeed */ - WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags, NULL)); + /* Remove HW offload */ + memset(&xdp, 0, sizeof(xdp)); + xdp.command = XDP_QUERY_PROG_HW; + if (!ndo_bpf(dev, &xdp) && xdp.prog_id) + WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags, + NULL)); } /** @@ -7653,12 +7658,15 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, int fd, u32 flags) { const struct net_device_ops *ops = dev->netdev_ops; + enum bpf_netdev_command query; struct bpf_prog *prog = NULL; bpf_op_t bpf_op, bpf_chk; int err; ASSERT_RTNL(); + query = flags & XDP_FLAGS_HW_MODE ? XDP_QUERY_PROG_HW : XDP_QUERY_PROG; + bpf_op = bpf_chk = ops->ndo_bpf; if (!bpf_op && (flags & (XDP_FLAGS_DRV_MODE | XDP_FLAGS_HW_MODE))) return -EOPNOTSUPP; @@ -7668,10 +7676,11 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, bpf_chk = generic_xdp_install; if (fd >= 0) { - if (bpf_chk && __dev_xdp_attached(dev, bpf_chk)) + if (__dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG) || + __dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG_HW)) return -EEXIST; if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && - __dev_xdp_attached(dev, bpf_op)) + __dev_xdp_query(dev, bpf_op, query)) return -EBUSY; prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP, diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 02ebc056a688..c9929ef17539 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -964,7 +964,7 @@ static size_t rtnl_xdp_size(void) { size_t xdp_size = nla_total_size(0) + /* nest IFLA_XDP */ nla_total_size(1) + /* XDP_ATTACHED */ - nla_total_size(4) + /* XDP_PROG_ID */ + nla_total_size(4) + /* XDP_PROG_ID (or 1st mode) */ nla_total_size(4); /* XDP__PROG_ID */ return xdp_size; @@ -1354,37 +1354,57 @@ static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev) return 0; } -static u8 rtnl_xdp_attached_mode(struct net_device *dev, u32 *prog_id) +static u32 rtnl_xdp_prog_skb(struct net_device *dev) { - const struct net_device_ops *ops = dev->netdev_ops; const struct bpf_prog *generic_xdp_prog; - struct netdev_bpf xdp; ASSERT_RTNL(); - *prog_id = 0; generic_xdp_prog = rtnl_dereference(dev->xdp_prog); - if (generic_xdp_prog) { - *prog_id = generic_xdp_prog->aux->id; - return XDP_ATTACHED_SKB; - } - if (!ops->ndo_bpf) - return XDP_ATTACHED_NONE; + if (!generic_xdp_prog) + return 0; + return generic_xdp_prog->aux->id; +} + +static u32 rtnl_xdp_prog_drv(struct net_device *dev) +{ + return __dev_xdp_query(dev, dev->netdev_ops->ndo_bpf, XDP_QUERY_PROG); +} + +static u32 rtnl_xdp_prog_hw(struct net_device *dev) +{ + return __dev_xdp_query(dev, dev->netdev_ops->ndo_bpf, + XDP_QUERY_PROG_HW); +} + +static int rtnl_xdp_report_one(struct sk_buff *skb, struct net_device *dev, + u32 *prog_id, u8 *mode, u8 tgt_mode, u32 attr, + u32 (*get_prog_id)(struct net_device *dev)) +{ + u32 curr_id; + int err; + + curr_id = get_prog_id(dev); + if (!curr_id) + return 0; + + *prog_id = curr_id; + err = nla_put_u32(skb, attr, curr_id); + if (err) + return err; - __dev_xdp_query(dev, ops->ndo_bpf, &xdp); - if (!xdp.prog_id) - return XDP_ATTACHED_NONE; + if (*mode != XDP_ATTACHED_NONE) + *mode = XDP_ATTACHED_MULTI; + else + *mode = tgt_mode; - *prog_id = xdp.prog_id; - if (xdp.prog_flags & XDP_FLAGS_HW_MODE) - return XDP_ATTACHED_HW; - return XDP_ATTACHED_DRV; + return 0; } static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev) { - u32 prog_attr, prog_id; struct nlattr *xdp; + u32 prog_id; int err; u8 mode; @@ -1392,35 +1412,26 @@ static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev) if (!xdp) return -EMSGSIZE; - mode = rtnl_xdp_attached_mode(dev, &prog_id); + prog_id = 0; + mode = XDP_ATTACHED_NONE; + if (rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_SKB, + IFLA_XDP_SKB_PROG_ID, rtnl_xdp_prog_skb)) + goto err_cancel; + if (rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_DRV, + IFLA_XDP_DRV_PROG_ID, rtnl_xdp_prog_drv)) + goto err_cancel; + if (rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_HW, + IFLA_XDP_HW_PROG_ID, rtnl_xdp_prog_hw)) + goto err_cancel; + err = nla_put_u8(skb, IFLA_XDP_ATTACHED, mode); if (err) goto err_cancel; - if (prog_id) { + if (prog_id && mode != XDP_ATTACHED_MULTI) { err = nla_put_u32(skb, IFLA_XDP_PROG_ID, prog_id); if (err) goto err_cancel; - - switch (mode) { - case XDP_ATTACHED_DRV: - prog_attr = IFLA_XDP_DRV_PROG_ID; - break; - case XDP_ATTACHED_SKB: - prog_attr = IFLA_XDP_SKB_PROG_ID; - break; - case XDP_ATTACHED_HW: - prog_attr = IFLA_XDP_HW_PROG_ID; - break; - case XDP_ATTACHED_NONE: - default: - err = -EINVAL; - goto err_cancel; - } - - err = nla_put_u32(skb, prog_attr, prog_id); - if (err) - goto err_cancel; } nla_nest_end(skb, xdp); -- cgit v1.2.3-59-g8ed1b From c921c2077b32081617789a645120148bc8b60c98 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 13 Jul 2018 12:16:43 +0300 Subject: net: ipmr: add support for passing full packet on wrong vif This patch adds support for IGMPMSG_WRVIFWHOLE which is used to pass full packet and real vif id when the incoming interface is wrong. While the RP and FHR are setting up state we need to be sending the registers encapsulated with all the data inside otherwise we lose it. The RP then decapsulates it and forwards it to the interested parties. Currently with WRONGVIF we can only be sending empty register packets and will lose that data. This behaviour can be enabled by using MRT_PIM with val == IGMPMSG_WRVIFWHOLE. This doesn't prevent IGMPMSG_WRONGVIF from happening, it happens in addition to it, also it is controlled by the same throttling parameters as WRONGVIF (i.e. 1 packet per 3 seconds currently). Both messages are generated to keep backwards compatibily and avoid breaking someone who was enabling MRT_PIM with val == 4, since any positive val is accepted and treated the same. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/mroute_base.h | 1 + include/uapi/linux/mroute.h | 2 ++ net/ipv4/ipmr.c | 21 ++++++++++++++++----- 3 files changed, 19 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h index fd436cdd4725..6675b9f81979 100644 --- a/include/linux/mroute_base.h +++ b/include/linux/mroute_base.h @@ -254,6 +254,7 @@ struct mr_table { atomic_t cache_resolve_queue_len; bool mroute_do_assert; bool mroute_do_pim; + bool mroute_do_wrvifwhole; int mroute_reg_vif_num; }; diff --git a/include/uapi/linux/mroute.h b/include/uapi/linux/mroute.h index 10f9ff9426a2..5d37a9ccce63 100644 --- a/include/uapi/linux/mroute.h +++ b/include/uapi/linux/mroute.h @@ -120,6 +120,7 @@ enum { IPMRA_TABLE_MROUTE_DO_ASSERT, IPMRA_TABLE_MROUTE_DO_PIM, IPMRA_TABLE_VIFS, + IPMRA_TABLE_MROUTE_DO_WRVIFWHOLE, __IPMRA_TABLE_MAX }; #define IPMRA_TABLE_MAX (__IPMRA_TABLE_MAX - 1) @@ -173,5 +174,6 @@ enum { #define IGMPMSG_NOCACHE 1 /* Kern cache fill request to mrouted */ #define IGMPMSG_WRONGVIF 2 /* For PIM assert processing (unused) */ #define IGMPMSG_WHOLEPKT 3 /* For PIM Register processing */ +#define IGMPMSG_WRVIFWHOLE 4 /* For PIM Register and assert processing */ #endif /* _UAPI__LINUX_MROUTE_H */ diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 82f914122f1b..5660adcf7a04 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1052,7 +1052,7 @@ static int ipmr_cache_report(struct mr_table *mrt, struct sk_buff *skb; int ret; - if (assert == IGMPMSG_WHOLEPKT) + if (assert == IGMPMSG_WHOLEPKT || assert == IGMPMSG_WRVIFWHOLE) skb = skb_realloc_headroom(pkt, sizeof(struct iphdr)); else skb = alloc_skb(128, GFP_ATOMIC); @@ -1060,7 +1060,7 @@ static int ipmr_cache_report(struct mr_table *mrt, if (!skb) return -ENOBUFS; - if (assert == IGMPMSG_WHOLEPKT) { + if (assert == IGMPMSG_WHOLEPKT || assert == IGMPMSG_WRVIFWHOLE) { /* Ugly, but we have no choice with this interface. * Duplicate old header, fix ihl, length etc. * And all this only to mangle msg->im_msgtype and @@ -1071,9 +1071,12 @@ static int ipmr_cache_report(struct mr_table *mrt, skb_reset_transport_header(skb); msg = (struct igmpmsg *)skb_network_header(skb); memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr)); - msg->im_msgtype = IGMPMSG_WHOLEPKT; + msg->im_msgtype = assert; msg->im_mbz = 0; - msg->im_vif = mrt->mroute_reg_vif_num; + if (assert == IGMPMSG_WRVIFWHOLE) + msg->im_vif = vifi; + else + msg->im_vif = mrt->mroute_reg_vif_num; ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2; ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) + sizeof(struct iphdr)); @@ -1372,6 +1375,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, struct mr_table *mrt; struct vifctl vif; struct mfcctl mfc; + bool do_wrvifwhole; u32 uval; /* There's one exception to the lock - MRT_DONE which needs to unlock */ @@ -1502,10 +1506,12 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, break; } + do_wrvifwhole = (val == IGMPMSG_WRVIFWHOLE); val = !!val; if (val != mrt->mroute_do_pim) { mrt->mroute_do_pim = val; mrt->mroute_do_assert = val; + mrt->mroute_do_wrvifwhole = do_wrvifwhole; } break; case MRT_TABLE: @@ -1983,6 +1989,9 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt, MFC_ASSERT_THRESH)) { c->_c.mfc_un.res.last_assert = jiffies; ipmr_cache_report(mrt, skb, true_vifi, IGMPMSG_WRONGVIF); + if (mrt->mroute_do_wrvifwhole) + ipmr_cache_report(mrt, skb, true_vifi, + IGMPMSG_WRVIFWHOLE); } goto dont_forward; } @@ -2659,7 +2668,9 @@ static bool ipmr_fill_table(struct mr_table *mrt, struct sk_buff *skb) mrt->mroute_reg_vif_num) || nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_ASSERT, mrt->mroute_do_assert) || - nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_PIM, mrt->mroute_do_pim)) + nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_PIM, mrt->mroute_do_pim) || + nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_WRVIFWHOLE, + mrt->mroute_do_wrvifwhole)) return false; return true; -- cgit v1.2.3-59-g8ed1b From 784abe24c903b093af04cf1a043140faa556cad7 Mon Sep 17 00:00:00 2001 From: Boris Pismenny Date: Fri, 13 Jul 2018 14:33:35 +0300 Subject: net: Add decrypted field to skb The decrypted bit is propogated to cloned/copied skbs. This will be used later by the inline crypto receive side offload of tls. Signed-off-by: Boris Pismenny Signed-off-by: Ilya Lesokhin Signed-off-by: David S. Miller --- include/linux/skbuff.h | 7 ++++++- net/core/skbuff.c | 6 ++++++ 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 7601838c2513..3ceb8dcc54da 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -630,6 +630,7 @@ typedef unsigned char *sk_buff_data_t; * @hash: the packet hash * @queue_mapping: Queue mapping for multiqueue devices * @xmit_more: More SKBs are pending for this queue + * @decrypted: Decrypted SKB * @ndisc_nodetype: router type (from link layer) * @ooo_okay: allow the mapping of a socket to a queue to be changed * @l4_hash: indicate hash is a canonical 4-tuple hash over transport @@ -736,7 +737,11 @@ struct sk_buff { peeked:1, head_frag:1, xmit_more:1, - __unused:1; /* one bit hole */ +#ifdef CONFIG_TLS_DEVICE + decrypted:1; +#else + __unused:1; +#endif /* fields enclosed in headers_start/headers_end are copied * using a single memcpy() in __copy_skb_header() diff --git a/net/core/skbuff.c b/net/core/skbuff.c index c4e24ac27464..cfd6c6f35f9c 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -805,6 +805,9 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) * It is not yet because we do not want to have a 16 bit hole */ new->queue_mapping = old->queue_mapping; +#ifdef CONFIG_TLS_DEVICE + new->decrypted = old->decrypted; +#endif memcpy(&new->headers_start, &old->headers_start, offsetof(struct sk_buff, headers_end) - @@ -865,6 +868,9 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb) C(head_frag); C(data); C(truesize); +#ifdef CONFIG_TLS_DEVICE + C(decrypted); +#endif refcount_set(&n->users, 1); atomic_inc(&(skb_shinfo(skb)->dataref)); -- cgit v1.2.3-59-g8ed1b From 14136564c8ee94566945e85014019cbdb1716dca Mon Sep 17 00:00:00 2001 From: Ilya Lesokhin Date: Fri, 13 Jul 2018 14:33:36 +0300 Subject: net: Add TLS RX offload feature This patch adds a netdev feature to configure TLS RX inline crypto offload. Signed-off-by: Ilya Lesokhin Signed-off-by: Boris Pismenny Signed-off-by: David S. Miller --- include/linux/netdev_features.h | 2 ++ net/core/ethtool.c | 1 + 2 files changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index 623bb8ced060..2b2a6dce1630 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -79,6 +79,7 @@ enum { NETIF_F_HW_ESP_TX_CSUM_BIT, /* ESP with TX checksum offload */ NETIF_F_RX_UDP_TUNNEL_PORT_BIT, /* Offload of RX port for UDP tunnels */ NETIF_F_HW_TLS_TX_BIT, /* Hardware TLS TX offload */ + NETIF_F_HW_TLS_RX_BIT, /* Hardware TLS RX offload */ NETIF_F_GRO_HW_BIT, /* Hardware Generic receive offload */ NETIF_F_HW_TLS_RECORD_BIT, /* Offload TLS record */ @@ -151,6 +152,7 @@ enum { #define NETIF_F_HW_TLS_RECORD __NETIF_F(HW_TLS_RECORD) #define NETIF_F_GSO_UDP_L4 __NETIF_F(GSO_UDP_L4) #define NETIF_F_HW_TLS_TX __NETIF_F(HW_TLS_TX) +#define NETIF_F_HW_TLS_RX __NETIF_F(HW_TLS_RX) #define for_each_netdev_feature(mask_addr, bit) \ for_each_set_bit(bit, (unsigned long *)mask_addr, NETDEV_FEATURE_COUNT) diff --git a/net/core/ethtool.c b/net/core/ethtool.c index e677a20180cf..c9993c6c2fd4 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -111,6 +111,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_RX_UDP_TUNNEL_PORT_BIT] = "rx-udp_tunnel-port-offload", [NETIF_F_HW_TLS_RECORD_BIT] = "tls-hw-record", [NETIF_F_HW_TLS_TX_BIT] = "tls-hw-tx-offload", + [NETIF_F_HW_TLS_RX_BIT] = "tls-hw-rx-offload", }; static const char -- cgit v1.2.3-59-g8ed1b From 16e4edc297ffc9b643b8dd3da6b0d579753ea2b3 Mon Sep 17 00:00:00 2001 From: Boris Pismenny Date: Fri, 13 Jul 2018 14:33:37 +0300 Subject: net: Add TLS rx resync NDO Add new netdev tls op for resynchronizing HW tls context Signed-off-by: Boris Pismenny Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4fa7f7a3f8b3..3514d67112b3 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -903,6 +903,8 @@ struct tlsdev_ops { void (*tls_dev_del)(struct net_device *netdev, struct tls_context *ctx, enum tls_offload_ctx_dir direction); + void (*tls_dev_resync_rx)(struct net_device *netdev, + struct sock *sk, u32 seq, u64 rcd_sn); }; #endif -- cgit v1.2.3-59-g8ed1b From ab412e1dd7db132c2abeb9385b4bf0dc8e6c5a65 Mon Sep 17 00:00:00 2001 From: Boris Pismenny Date: Fri, 13 Jul 2018 14:33:46 +0300 Subject: net/mlx5: Accel, add TLS rx offload routines In Innova TLS, TLS contexts are added or deleted via a command message over the SBU connection. The HW then sends a response message over the same connection. Complete the implementation for Innova TLS (FPGA-based) hardware by adding support for rx inline crypto offload. Signed-off-by: Boris Pismenny Signed-off-by: Ilya Lesokhin Signed-off-by: David S. Miller --- .../net/ethernet/mellanox/mlx5/core/accel/tls.c | 23 +++-- .../net/ethernet/mellanox/mlx5/core/accel/tls.h | 26 +++-- drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.c | 113 ++++++++++++++++----- drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.h | 18 ++-- include/linux/mlx5/mlx5_ifc_fpga.h | 1 + 5 files changed, 135 insertions(+), 46 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.c b/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.c index 77ac19f38cbe..da7bd26368f9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.c @@ -37,17 +37,26 @@ #include "mlx5_core.h" #include "fpga/tls.h" -int mlx5_accel_tls_add_tx_flow(struct mlx5_core_dev *mdev, void *flow, - struct tls_crypto_info *crypto_info, - u32 start_offload_tcp_sn, u32 *p_swid) +int mlx5_accel_tls_add_flow(struct mlx5_core_dev *mdev, void *flow, + struct tls_crypto_info *crypto_info, + u32 start_offload_tcp_sn, u32 *p_swid, + bool direction_sx) { - return mlx5_fpga_tls_add_tx_flow(mdev, flow, crypto_info, - start_offload_tcp_sn, p_swid); + return mlx5_fpga_tls_add_flow(mdev, flow, crypto_info, + start_offload_tcp_sn, p_swid, + direction_sx); } -void mlx5_accel_tls_del_tx_flow(struct mlx5_core_dev *mdev, u32 swid) +void mlx5_accel_tls_del_flow(struct mlx5_core_dev *mdev, u32 swid, + bool direction_sx) { - mlx5_fpga_tls_del_tx_flow(mdev, swid, GFP_KERNEL); + mlx5_fpga_tls_del_flow(mdev, swid, GFP_KERNEL, direction_sx); +} + +int mlx5_accel_tls_resync_rx(struct mlx5_core_dev *mdev, u32 handle, u32 seq, + u64 rcd_sn) +{ + return mlx5_fpga_tls_resync_rx(mdev, handle, seq, rcd_sn); } bool mlx5_accel_is_tls_device(struct mlx5_core_dev *mdev) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.h b/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.h index 6f9c9f446ecc..2228c1083528 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.h @@ -60,10 +60,14 @@ struct mlx5_ifc_tls_flow_bits { u8 reserved_at_2[0x1e]; }; -int mlx5_accel_tls_add_tx_flow(struct mlx5_core_dev *mdev, void *flow, - struct tls_crypto_info *crypto_info, - u32 start_offload_tcp_sn, u32 *p_swid); -void mlx5_accel_tls_del_tx_flow(struct mlx5_core_dev *mdev, u32 swid); +int mlx5_accel_tls_add_flow(struct mlx5_core_dev *mdev, void *flow, + struct tls_crypto_info *crypto_info, + u32 start_offload_tcp_sn, u32 *p_swid, + bool direction_sx); +void mlx5_accel_tls_del_flow(struct mlx5_core_dev *mdev, u32 swid, + bool direction_sx); +int mlx5_accel_tls_resync_rx(struct mlx5_core_dev *mdev, u32 handle, u32 seq, + u64 rcd_sn); bool mlx5_accel_is_tls_device(struct mlx5_core_dev *mdev); u32 mlx5_accel_tls_device_caps(struct mlx5_core_dev *mdev); int mlx5_accel_tls_init(struct mlx5_core_dev *mdev); @@ -71,11 +75,15 @@ void mlx5_accel_tls_cleanup(struct mlx5_core_dev *mdev); #else -static inline int -mlx5_accel_tls_add_tx_flow(struct mlx5_core_dev *mdev, void *flow, - struct tls_crypto_info *crypto_info, - u32 start_offload_tcp_sn, u32 *p_swid) { return 0; } -static inline void mlx5_accel_tls_del_tx_flow(struct mlx5_core_dev *mdev, u32 swid) { } +static int +mlx5_accel_tls_add_flow(struct mlx5_core_dev *mdev, void *flow, + struct tls_crypto_info *crypto_info, + u32 start_offload_tcp_sn, u32 *p_swid, + bool direction_sx) { return -ENOTSUPP; } +static inline void mlx5_accel_tls_del_flow(struct mlx5_core_dev *mdev, u32 swid, + bool direction_sx) { } +static inline int mlx5_accel_tls_resync_rx(struct mlx5_core_dev *mdev, u32 handle, + u32 seq, u64 rcd_sn) { return 0; } static inline bool mlx5_accel_is_tls_device(struct mlx5_core_dev *mdev) { return false; } static inline u32 mlx5_accel_tls_device_caps(struct mlx5_core_dev *mdev) { return 0; } static inline int mlx5_accel_tls_init(struct mlx5_core_dev *mdev) { return 0; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.c index c9736238604a..5cf5f2a9d51f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.c @@ -129,6 +129,7 @@ static void mlx5_fpga_tls_cmd_send(struct mlx5_fpga_device *fdev, static int mlx5_fpga_tls_alloc_swid(struct idr *idr, spinlock_t *idr_spinlock, void *ptr) { + unsigned long flags; int ret; /* TLS metadata format is 1 byte for syndrome followed @@ -139,9 +140,9 @@ static int mlx5_fpga_tls_alloc_swid(struct idr *idr, spinlock_t *idr_spinlock, BUILD_BUG_ON((SWID_END - 1) & 0xFF000000); idr_preload(GFP_KERNEL); - spin_lock_irq(idr_spinlock); + spin_lock_irqsave(idr_spinlock, flags); ret = idr_alloc(idr, ptr, SWID_START, SWID_END, GFP_ATOMIC); - spin_unlock_irq(idr_spinlock); + spin_unlock_irqrestore(idr_spinlock, flags); idr_preload_end(); return ret; @@ -157,6 +158,13 @@ static void mlx5_fpga_tls_release_swid(struct idr *idr, spin_unlock_irqrestore(idr_spinlock, flags); } +static void mlx_tls_kfree_complete(struct mlx5_fpga_conn *conn, + struct mlx5_fpga_device *fdev, + struct mlx5_fpga_dma_buf *buf, u8 status) +{ + kfree(buf); +} + struct mlx5_teardown_stream_context { struct mlx5_fpga_tls_command_context cmd; u32 swid; @@ -178,9 +186,13 @@ mlx5_fpga_tls_teardown_completion(struct mlx5_fpga_conn *conn, mlx5_fpga_err(fdev, "Teardown stream failed with syndrome = %d", syndrome); - else + else if (MLX5_GET(tls_cmd, cmd->buf.sg[0].data, direction_sx)) mlx5_fpga_tls_release_swid(&fdev->tls->tx_idr, - &fdev->tls->idr_spinlock, + &fdev->tls->tx_idr_spinlock, + ctx->swid); + else + mlx5_fpga_tls_release_swid(&fdev->tls->rx_idr, + &fdev->tls->rx_idr_spinlock, ctx->swid); } mlx5_fpga_tls_put_command_ctx(cmd); @@ -196,6 +208,40 @@ static void mlx5_fpga_tls_flow_to_cmd(void *flow, void *cmd) MLX5_GET(tls_flow, flow, direction_sx)); } +int mlx5_fpga_tls_resync_rx(struct mlx5_core_dev *mdev, u32 handle, u32 seq, + u64 rcd_sn) +{ + struct mlx5_fpga_dma_buf *buf; + int size = sizeof(*buf) + MLX5_TLS_COMMAND_SIZE; + void *flow; + void *cmd; + int ret; + + buf = kzalloc(size, GFP_ATOMIC); + if (!buf) + return -ENOMEM; + + cmd = (buf + 1); + + rcu_read_lock(); + flow = idr_find(&mdev->fpga->tls->rx_idr, ntohl(handle)); + rcu_read_unlock(); + mlx5_fpga_tls_flow_to_cmd(flow, cmd); + + MLX5_SET(tls_cmd, cmd, swid, ntohl(handle)); + MLX5_SET64(tls_cmd, cmd, tls_rcd_sn, be64_to_cpu(rcd_sn)); + MLX5_SET(tls_cmd, cmd, tcp_sn, seq); + MLX5_SET(tls_cmd, cmd, command_type, CMD_RESYNC_RX); + + buf->sg[0].data = cmd; + buf->sg[0].size = MLX5_TLS_COMMAND_SIZE; + buf->complete = mlx_tls_kfree_complete; + + ret = mlx5_fpga_sbu_conn_sendmsg(mdev->fpga->tls->conn, buf); + + return ret; +} + static void mlx5_fpga_tls_send_teardown_cmd(struct mlx5_core_dev *mdev, void *flow, u32 swid, gfp_t flags) { @@ -223,14 +269,18 @@ static void mlx5_fpga_tls_send_teardown_cmd(struct mlx5_core_dev *mdev, mlx5_fpga_tls_teardown_completion); } -void mlx5_fpga_tls_del_tx_flow(struct mlx5_core_dev *mdev, u32 swid, - gfp_t flags) +void mlx5_fpga_tls_del_flow(struct mlx5_core_dev *mdev, u32 swid, + gfp_t flags, bool direction_sx) { struct mlx5_fpga_tls *tls = mdev->fpga->tls; void *flow; rcu_read_lock(); - flow = idr_find(&tls->tx_idr, swid); + if (direction_sx) + flow = idr_find(&tls->tx_idr, swid); + else + flow = idr_find(&tls->rx_idr, swid); + rcu_read_unlock(); if (!flow) { @@ -289,9 +339,11 @@ mlx5_fpga_tls_setup_completion(struct mlx5_fpga_conn *conn, * the command context because we might not have received * the tx completion yet. */ - mlx5_fpga_tls_del_tx_flow(fdev->mdev, - MLX5_GET(tls_cmd, tls_cmd, swid), - GFP_ATOMIC); + mlx5_fpga_tls_del_flow(fdev->mdev, + MLX5_GET(tls_cmd, tls_cmd, swid), + GFP_ATOMIC, + MLX5_GET(tls_cmd, tls_cmd, + direction_sx)); } mlx5_fpga_tls_put_command_ctx(cmd); @@ -415,8 +467,7 @@ int mlx5_fpga_tls_init(struct mlx5_core_dev *mdev) if (err) goto error; - if (!(tls->caps & (MLX5_ACCEL_TLS_TX | MLX5_ACCEL_TLS_V12 | - MLX5_ACCEL_TLS_AES_GCM128))) { + if (!(tls->caps & (MLX5_ACCEL_TLS_V12 | MLX5_ACCEL_TLS_AES_GCM128))) { err = -ENOTSUPP; goto error; } @@ -438,7 +489,9 @@ int mlx5_fpga_tls_init(struct mlx5_core_dev *mdev) INIT_LIST_HEAD(&tls->pending_cmds); idr_init(&tls->tx_idr); - spin_lock_init(&tls->idr_spinlock); + idr_init(&tls->rx_idr); + spin_lock_init(&tls->tx_idr_spinlock); + spin_lock_init(&tls->rx_idr_spinlock); fdev->tls = tls; return 0; @@ -500,9 +553,9 @@ static int mlx5_fpga_tls_set_key_material(void *cmd, u32 caps, return 0; } -static int mlx5_fpga_tls_add_flow(struct mlx5_core_dev *mdev, void *flow, - struct tls_crypto_info *crypto_info, u32 swid, - u32 tcp_sn) +static int _mlx5_fpga_tls_add_flow(struct mlx5_core_dev *mdev, void *flow, + struct tls_crypto_info *crypto_info, + u32 swid, u32 tcp_sn) { u32 caps = mlx5_fpga_tls_device_caps(mdev); struct mlx5_setup_stream_context *ctx; @@ -533,30 +586,42 @@ out: return ret; } -int mlx5_fpga_tls_add_tx_flow(struct mlx5_core_dev *mdev, void *flow, - struct tls_crypto_info *crypto_info, - u32 start_offload_tcp_sn, u32 *p_swid) +int mlx5_fpga_tls_add_flow(struct mlx5_core_dev *mdev, void *flow, + struct tls_crypto_info *crypto_info, + u32 start_offload_tcp_sn, u32 *p_swid, + bool direction_sx) { struct mlx5_fpga_tls *tls = mdev->fpga->tls; int ret = -ENOMEM; u32 swid; - ret = mlx5_fpga_tls_alloc_swid(&tls->tx_idr, &tls->idr_spinlock, flow); + if (direction_sx) + ret = mlx5_fpga_tls_alloc_swid(&tls->tx_idr, + &tls->tx_idr_spinlock, flow); + else + ret = mlx5_fpga_tls_alloc_swid(&tls->rx_idr, + &tls->rx_idr_spinlock, flow); + if (ret < 0) return ret; swid = ret; - MLX5_SET(tls_flow, flow, direction_sx, 1); + MLX5_SET(tls_flow, flow, direction_sx, direction_sx ? 1 : 0); - ret = mlx5_fpga_tls_add_flow(mdev, flow, crypto_info, swid, - start_offload_tcp_sn); + ret = _mlx5_fpga_tls_add_flow(mdev, flow, crypto_info, swid, + start_offload_tcp_sn); if (ret && ret != -EINTR) goto free_swid; *p_swid = swid; return 0; free_swid: - mlx5_fpga_tls_release_swid(&tls->tx_idr, &tls->idr_spinlock, swid); + if (direction_sx) + mlx5_fpga_tls_release_swid(&tls->tx_idr, + &tls->tx_idr_spinlock, swid); + else + mlx5_fpga_tls_release_swid(&tls->rx_idr, + &tls->rx_idr_spinlock, swid); return ret; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.h b/drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.h index 800a214e4e49..3b2e37bf76fe 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.h @@ -46,15 +46,18 @@ struct mlx5_fpga_tls { struct mlx5_fpga_conn *conn; struct idr tx_idr; - spinlock_t idr_spinlock; /* protects the IDR */ + struct idr rx_idr; + spinlock_t tx_idr_spinlock; /* protects the IDR */ + spinlock_t rx_idr_spinlock; /* protects the IDR */ }; -int mlx5_fpga_tls_add_tx_flow(struct mlx5_core_dev *mdev, void *flow, - struct tls_crypto_info *crypto_info, - u32 start_offload_tcp_sn, u32 *p_swid); +int mlx5_fpga_tls_add_flow(struct mlx5_core_dev *mdev, void *flow, + struct tls_crypto_info *crypto_info, + u32 start_offload_tcp_sn, u32 *p_swid, + bool direction_sx); -void mlx5_fpga_tls_del_tx_flow(struct mlx5_core_dev *mdev, u32 swid, - gfp_t flags); +void mlx5_fpga_tls_del_flow(struct mlx5_core_dev *mdev, u32 swid, + gfp_t flags, bool direction_sx); bool mlx5_fpga_is_tls_device(struct mlx5_core_dev *mdev); int mlx5_fpga_tls_init(struct mlx5_core_dev *mdev); @@ -65,4 +68,7 @@ static inline u32 mlx5_fpga_tls_device_caps(struct mlx5_core_dev *mdev) return mdev->fpga->tls->caps; } +int mlx5_fpga_tls_resync_rx(struct mlx5_core_dev *mdev, u32 handle, u32 seq, + u64 rcd_sn); + #endif /* __MLX5_FPGA_TLS_H__ */ diff --git a/include/linux/mlx5/mlx5_ifc_fpga.h b/include/linux/mlx5/mlx5_ifc_fpga.h index 64d0f40d4cc3..37e065a80a43 100644 --- a/include/linux/mlx5/mlx5_ifc_fpga.h +++ b/include/linux/mlx5/mlx5_ifc_fpga.h @@ -576,6 +576,7 @@ struct mlx5_ifc_fpga_ipsec_sa { enum fpga_tls_cmds { CMD_SETUP_STREAM = 0x1001, CMD_TEARDOWN_STREAM = 0x1002, + CMD_RESYNC_RX = 0x1003, }; #define MLX5_TLS_1_2 (0) -- cgit v1.2.3-59-g8ed1b From d7e5a9a50245b91f016c814b0f076f7e55cbb980 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 25 Jun 2018 17:49:43 +0200 Subject: netfilter: utils: move nf_ip_checksum* from ipv4 to utils allows to make nf_ip_checksum_partial static, it no longer has an external caller. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_ipv4.h | 11 --------- net/ipv4/netfilter.c | 53 ---------------------------------------- net/netfilter/utils.c | 55 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 55 insertions(+), 64 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter_ipv4.h b/include/linux/netfilter_ipv4.h index b31dabfdb453..95ab5cc64422 100644 --- a/include/linux/netfilter_ipv4.h +++ b/include/linux/netfilter_ipv4.h @@ -23,9 +23,6 @@ struct nf_queue_entry; #ifdef CONFIG_INET __sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook, unsigned int dataoff, u_int8_t protocol); -__sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook, - unsigned int dataoff, unsigned int len, - u_int8_t protocol); int nf_ip_route(struct net *net, struct dst_entry **dst, struct flowi *fl, bool strict); int nf_ip_reroute(struct sk_buff *skb, const struct nf_queue_entry *entry); @@ -35,14 +32,6 @@ static inline __sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook, { return 0; } -static inline __sum16 nf_ip_checksum_partial(struct sk_buff *skb, - unsigned int hook, - unsigned int dataoff, - unsigned int len, - u_int8_t protocol) -{ - return 0; -} static inline int nf_ip_route(struct net *net, struct dst_entry **dst, struct flowi *fl, bool strict) { diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index e6774ccb7731..8d2e5dc9a827 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -98,59 +98,6 @@ int nf_ip_reroute(struct sk_buff *skb, const struct nf_queue_entry *entry) } EXPORT_SYMBOL_GPL(nf_ip_reroute); -__sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook, - unsigned int dataoff, u_int8_t protocol) -{ - const struct iphdr *iph = ip_hdr(skb); - __sum16 csum = 0; - - switch (skb->ip_summed) { - case CHECKSUM_COMPLETE: - if (hook != NF_INET_PRE_ROUTING && hook != NF_INET_LOCAL_IN) - break; - if ((protocol == 0 && !csum_fold(skb->csum)) || - !csum_tcpudp_magic(iph->saddr, iph->daddr, - skb->len - dataoff, protocol, - skb->csum)) { - skb->ip_summed = CHECKSUM_UNNECESSARY; - break; - } - /* fall through */ - case CHECKSUM_NONE: - if (protocol == 0) - skb->csum = 0; - else - skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, - skb->len - dataoff, - protocol, 0); - csum = __skb_checksum_complete(skb); - } - return csum; -} -EXPORT_SYMBOL(nf_ip_checksum); - -__sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook, - unsigned int dataoff, unsigned int len, - u_int8_t protocol) -{ - const struct iphdr *iph = ip_hdr(skb); - __sum16 csum = 0; - - switch (skb->ip_summed) { - case CHECKSUM_COMPLETE: - if (len == skb->len - dataoff) - return nf_ip_checksum(skb, hook, dataoff, protocol); - /* fall through */ - case CHECKSUM_NONE: - skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, protocol, - skb->len - dataoff, 0); - skb->ip_summed = CHECKSUM_NONE; - return __skb_checksum_complete_head(skb, dataoff + len); - } - return csum; -} -EXPORT_SYMBOL_GPL(nf_ip_checksum_partial); - int nf_ip_route(struct net *net, struct dst_entry **dst, struct flowi *fl, bool strict __always_unused) { diff --git a/net/netfilter/utils.c b/net/netfilter/utils.c index 0b660c568156..8980c8a0fe5c 100644 --- a/net/netfilter/utils.c +++ b/net/netfilter/utils.c @@ -1,9 +1,64 @@ +// SPDX-License-Identifier: GPL-2.0 #include #include #include #include #include +#ifdef CONFIG_INET +__sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook, + unsigned int dataoff, u8 protocol) +{ + const struct iphdr *iph = ip_hdr(skb); + __sum16 csum = 0; + + switch (skb->ip_summed) { + case CHECKSUM_COMPLETE: + if (hook != NF_INET_PRE_ROUTING && hook != NF_INET_LOCAL_IN) + break; + if ((protocol == 0 && !csum_fold(skb->csum)) || + !csum_tcpudp_magic(iph->saddr, iph->daddr, + skb->len - dataoff, protocol, + skb->csum)) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + break; + } + /* fall through */ + case CHECKSUM_NONE: + if (protocol == 0) + skb->csum = 0; + else + skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, + skb->len - dataoff, + protocol, 0); + csum = __skb_checksum_complete(skb); + } + return csum; +} +EXPORT_SYMBOL(nf_ip_checksum); +#endif + +static __sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook, + unsigned int dataoff, unsigned int len, + u8 protocol) +{ + const struct iphdr *iph = ip_hdr(skb); + __sum16 csum = 0; + + switch (skb->ip_summed) { + case CHECKSUM_COMPLETE: + if (len == skb->len - dataoff) + return nf_ip_checksum(skb, hook, dataoff, protocol); + /* fall through */ + case CHECKSUM_NONE: + skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, protocol, + skb->len - dataoff, 0); + skb->ip_summed = CHECKSUM_NONE; + return __skb_checksum_complete_head(skb, dataoff + len); + } + return csum; +} + __sum16 nf_checksum(struct sk_buff *skb, unsigned int hook, unsigned int dataoff, u_int8_t protocol, unsigned short family) -- cgit v1.2.3-59-g8ed1b From ebee5a50d0b7cdc576aa8081f05b86971880054d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 25 Jun 2018 17:49:59 +0200 Subject: netfilter: utils: move nf_ip6_checksum* from ipv6 to utils similar to previous change, this also allows to remove it from nf_ipv6_ops and avoid the indirection. It also removes the bogus dependency of nf_conntrack_ipv6 on ipv6 module: ipv6 checksum functions are built into kernel even if CONFIG_IPV6=m, but ipv6/netfilter.o isn't. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_ipv6.h | 5 --- net/ipv6/netfilter.c | 62 ---------------------------------- net/netfilter/utils.c | 76 ++++++++++++++++++++++++++++++++++++------ 3 files changed, 65 insertions(+), 78 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h index 288c597e75b3..c0dc4dd78887 100644 --- a/include/linux/netfilter_ipv6.h +++ b/include/linux/netfilter_ipv6.h @@ -30,11 +30,6 @@ struct nf_ipv6_ops { void (*route_input)(struct sk_buff *skb); int (*fragment)(struct net *net, struct sock *sk, struct sk_buff *skb, int (*output)(struct net *, struct sock *, struct sk_buff *)); - __sum16 (*checksum)(struct sk_buff *skb, unsigned int hook, - unsigned int dataoff, u_int8_t protocol); - __sum16 (*checksum_partial)(struct sk_buff *skb, unsigned int hook, - unsigned int dataoff, unsigned int len, - u_int8_t protocol); int (*route)(struct net *net, struct dst_entry **dst, struct flowi *fl, bool strict); int (*reroute)(struct sk_buff *skb, const struct nf_queue_entry *entry); diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index 531d6957af36..5ae8e1c51079 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -15,7 +15,6 @@ #include #include #include -#include #include int ip6_route_me_harder(struct net *net, struct sk_buff *skb) @@ -106,71 +105,10 @@ static int nf_ip6_route(struct net *net, struct dst_entry **dst, return err; } -__sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook, - unsigned int dataoff, u_int8_t protocol) -{ - const struct ipv6hdr *ip6h = ipv6_hdr(skb); - __sum16 csum = 0; - - switch (skb->ip_summed) { - case CHECKSUM_COMPLETE: - if (hook != NF_INET_PRE_ROUTING && hook != NF_INET_LOCAL_IN) - break; - if (!csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, - skb->len - dataoff, protocol, - csum_sub(skb->csum, - skb_checksum(skb, 0, - dataoff, 0)))) { - skb->ip_summed = CHECKSUM_UNNECESSARY; - break; - } - /* fall through */ - case CHECKSUM_NONE: - skb->csum = ~csum_unfold( - csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, - skb->len - dataoff, - protocol, - csum_sub(0, - skb_checksum(skb, 0, - dataoff, 0)))); - csum = __skb_checksum_complete(skb); - } - return csum; -} -EXPORT_SYMBOL(nf_ip6_checksum); - -static __sum16 nf_ip6_checksum_partial(struct sk_buff *skb, unsigned int hook, - unsigned int dataoff, unsigned int len, - u_int8_t protocol) -{ - const struct ipv6hdr *ip6h = ipv6_hdr(skb); - __wsum hsum; - __sum16 csum = 0; - - switch (skb->ip_summed) { - case CHECKSUM_COMPLETE: - if (len == skb->len - dataoff) - return nf_ip6_checksum(skb, hook, dataoff, protocol); - /* fall through */ - case CHECKSUM_NONE: - hsum = skb_checksum(skb, 0, dataoff, 0); - skb->csum = ~csum_unfold(csum_ipv6_magic(&ip6h->saddr, - &ip6h->daddr, - skb->len - dataoff, - protocol, - csum_sub(0, hsum))); - skb->ip_summed = CHECKSUM_NONE; - return __skb_checksum_complete_head(skb, dataoff + len); - } - return csum; -}; - static const struct nf_ipv6_ops ipv6ops = { .chk_addr = ipv6_chk_addr, .route_input = ip6_route_input, .fragment = ip6_fragment, - .checksum = nf_ip6_checksum, - .checksum_partial = nf_ip6_checksum_partial, .route = nf_ip6_route, .reroute = nf_ip6_reroute, }; diff --git a/net/netfilter/utils.c b/net/netfilter/utils.c index 8980c8a0fe5c..e8da9a9bba73 100644 --- a/net/netfilter/utils.c +++ b/net/netfilter/utils.c @@ -4,6 +4,7 @@ #include #include #include +#include #ifdef CONFIG_INET __sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook, @@ -59,11 +60,69 @@ static __sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook, return csum; } +__sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook, + unsigned int dataoff, u8 protocol) +{ + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + __sum16 csum = 0; + + switch (skb->ip_summed) { + case CHECKSUM_COMPLETE: + if (hook != NF_INET_PRE_ROUTING && hook != NF_INET_LOCAL_IN) + break; + if (!csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, + skb->len - dataoff, protocol, + csum_sub(skb->csum, + skb_checksum(skb, 0, + dataoff, 0)))) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + break; + } + /* fall through */ + case CHECKSUM_NONE: + skb->csum = ~csum_unfold( + csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, + skb->len - dataoff, + protocol, + csum_sub(0, + skb_checksum(skb, 0, + dataoff, 0)))); + csum = __skb_checksum_complete(skb); + } + return csum; +} +EXPORT_SYMBOL(nf_ip6_checksum); + +static __sum16 nf_ip6_checksum_partial(struct sk_buff *skb, unsigned int hook, + unsigned int dataoff, unsigned int len, + u8 protocol) +{ + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + __wsum hsum; + __sum16 csum = 0; + + switch (skb->ip_summed) { + case CHECKSUM_COMPLETE: + if (len == skb->len - dataoff) + return nf_ip6_checksum(skb, hook, dataoff, protocol); + /* fall through */ + case CHECKSUM_NONE: + hsum = skb_checksum(skb, 0, dataoff, 0); + skb->csum = ~csum_unfold(csum_ipv6_magic(&ip6h->saddr, + &ip6h->daddr, + skb->len - dataoff, + protocol, + csum_sub(0, hsum))); + skb->ip_summed = CHECKSUM_NONE; + return __skb_checksum_complete_head(skb, dataoff + len); + } + return csum; +}; + __sum16 nf_checksum(struct sk_buff *skb, unsigned int hook, - unsigned int dataoff, u_int8_t protocol, + unsigned int dataoff, u8 protocol, unsigned short family) { - const struct nf_ipv6_ops *v6ops; __sum16 csum = 0; switch (family) { @@ -71,9 +130,7 @@ __sum16 nf_checksum(struct sk_buff *skb, unsigned int hook, csum = nf_ip_checksum(skb, hook, dataoff, protocol); break; case AF_INET6: - v6ops = rcu_dereference(nf_ipv6_ops); - if (v6ops) - csum = v6ops->checksum(skb, hook, dataoff, protocol); + csum = nf_ip6_checksum(skb, hook, dataoff, protocol); break; } @@ -83,9 +140,8 @@ EXPORT_SYMBOL_GPL(nf_checksum); __sum16 nf_checksum_partial(struct sk_buff *skb, unsigned int hook, unsigned int dataoff, unsigned int len, - u_int8_t protocol, unsigned short family) + u8 protocol, unsigned short family) { - const struct nf_ipv6_ops *v6ops; __sum16 csum = 0; switch (family) { @@ -94,10 +150,8 @@ __sum16 nf_checksum_partial(struct sk_buff *skb, unsigned int hook, protocol); break; case AF_INET6: - v6ops = rcu_dereference(nf_ipv6_ops); - if (v6ops) - csum = v6ops->checksum_partial(skb, hook, dataoff, len, - protocol); + csum = nf_ip6_checksum_partial(skb, hook, dataoff, len, + protocol); break; } -- cgit v1.2.3-59-g8ed1b From 2b9672ddb6f347467d7b33b86c5dfc4d5c0501a8 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 12 Jul 2018 21:32:53 +0200 Subject: net: phy: add phy_speed_down and phy_speed_up Some network drivers include functionality to speed down the PHY when suspending and just waiting for a WoL packet because this saves energy. This functionality is quite generic, therefore let's factor it out to phylib. Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/phy/phy.c | 78 +++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/phy.h | 2 ++ 2 files changed, 80 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index c4aa360dedff..d2baedc4ea91 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -551,6 +551,84 @@ int phy_start_aneg(struct phy_device *phydev) } EXPORT_SYMBOL(phy_start_aneg); +static int phy_poll_aneg_done(struct phy_device *phydev) +{ + unsigned int retries = 100; + int ret; + + do { + msleep(100); + ret = phy_aneg_done(phydev); + } while (!ret && --retries); + + if (!ret) + return -ETIMEDOUT; + + return ret < 0 ? ret : 0; +} + +/** + * phy_speed_down - set speed to lowest speed supported by both link partners + * @phydev: the phy_device struct + * @sync: perform action synchronously + * + * Description: Typically used to save energy when waiting for a WoL packet + * + * WARNING: Setting sync to false may cause the system being unable to suspend + * in case the PHY generates an interrupt when finishing the autonegotiation. + * This interrupt may wake up the system immediately after suspend. + * Therefore use sync = false only if you're sure it's safe with the respective + * network chip. + */ +int phy_speed_down(struct phy_device *phydev, bool sync) +{ + u32 adv = phydev->lp_advertising & phydev->supported; + u32 adv_old = phydev->advertising; + int ret; + + if (phydev->autoneg != AUTONEG_ENABLE) + return 0; + + if (adv & PHY_10BT_FEATURES) + phydev->advertising &= ~(PHY_100BT_FEATURES | + PHY_1000BT_FEATURES); + else if (adv & PHY_100BT_FEATURES) + phydev->advertising &= ~PHY_1000BT_FEATURES; + + if (phydev->advertising == adv_old) + return 0; + + ret = phy_config_aneg(phydev); + if (ret) + return ret; + + return sync ? phy_poll_aneg_done(phydev) : 0; +} +EXPORT_SYMBOL_GPL(phy_speed_down); + +/** + * phy_speed_up - (re)set advertised speeds to all supported speeds + * @phydev: the phy_device struct + * + * Description: Used to revert the effect of phy_speed_down + */ +int phy_speed_up(struct phy_device *phydev) +{ + u32 mask = PHY_10BT_FEATURES | PHY_100BT_FEATURES | PHY_1000BT_FEATURES; + u32 adv_old = phydev->advertising; + + if (phydev->autoneg != AUTONEG_ENABLE) + return 0; + + phydev->advertising = (adv_old & ~mask) | (phydev->supported & mask); + + if (phydev->advertising == adv_old) + return 0; + + return phy_config_aneg(phydev); +} +EXPORT_SYMBOL_GPL(phy_speed_up); + /** * phy_start_machine - start PHY state machine tracking * @phydev: the phy_device struct diff --git a/include/linux/phy.h b/include/linux/phy.h index 6cd09098427c..075c2f770d3e 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -942,6 +942,8 @@ void phy_start(struct phy_device *phydev); void phy_stop(struct phy_device *phydev); int phy_start_aneg(struct phy_device *phydev); int phy_aneg_done(struct phy_device *phydev); +int phy_speed_down(struct phy_device *phydev, bool sync); +int phy_speed_up(struct phy_device *phydev); int phy_stop_interrupts(struct phy_device *phydev); int phy_restart_aneg(struct phy_device *phydev); -- cgit v1.2.3-59-g8ed1b From d9f37d01e294e5338aa3e9d3b2eda61b59b619df Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Fri, 13 Jul 2018 14:41:36 +0800 Subject: net: convert gro_count to bitmask gro_hash size is 192 bytes, and uses 3 cache lines, if there is few flows, gro_hash may be not fully used, so it is unnecessary to iterate all gro_hash in napi_gro_flush(), to occupy unnecessary cacheline. convert gro_count to a bitmask, and rename it as gro_bitmask, each bit represents a element of gro_hash, only flush a gro_hash element if the related bit is set, to speed up napi_gro_flush(). and update gro_bitmask only if it will be changed, to reduce cache update Suggested-by: Eric Dumazet Signed-off-by: Li RongQing Cc: Stefano Brivio Signed-off-by: David S. Miller --- include/linux/netdevice.h | 9 +++++++-- net/core/dev.c | 36 ++++++++++++++++++++++++------------ 2 files changed, 31 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3514d67112b3..c1295c7a452e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -308,9 +308,14 @@ struct gro_list { }; /* - * Structure for NAPI scheduling similar to tasklet but with weighting + * size of gro hash buckets, must less than bit number of + * napi_struct::gro_bitmask */ #define GRO_HASH_BUCKETS 8 + +/* + * Structure for NAPI scheduling similar to tasklet but with weighting + */ struct napi_struct { /* The poll_list must only be managed by the entity which * changes the state of the NAPI_STATE_SCHED bit. This means @@ -322,7 +327,7 @@ struct napi_struct { unsigned long state; int weight; - unsigned int gro_count; + unsigned long gro_bitmask; int (*poll)(struct napi_struct *, int); #ifdef CONFIG_NETPOLL int poll_owner; diff --git a/net/core/dev.c b/net/core/dev.c index 0df1771a12f9..c883b17ee0fe 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5282,9 +5282,11 @@ static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index, list_del(&skb->list); skb->next = NULL; napi_gro_complete(skb); - napi->gro_count--; napi->gro_hash[index].count--; } + + if (!napi->gro_hash[index].count) + __clear_bit(index, &napi->gro_bitmask); } /* napi->gro_hash[].list contains packets ordered by age. @@ -5295,8 +5297,10 @@ void napi_gro_flush(struct napi_struct *napi, bool flush_old) { u32 i; - for (i = 0; i < GRO_HASH_BUCKETS; i++) - __napi_gro_flush_chain(napi, i, flush_old); + for (i = 0; i < GRO_HASH_BUCKETS; i++) { + if (test_bit(i, &napi->gro_bitmask)) + __napi_gro_flush_chain(napi, i, flush_old); + } } EXPORT_SYMBOL(napi_gro_flush); @@ -5388,8 +5392,8 @@ static void gro_flush_oldest(struct list_head *head) if (WARN_ON_ONCE(!oldest)) return; - /* Do not adjust napi->gro_count, caller is adding a new SKB to - * the chain. + /* Do not adjust napi->gro_hash[].count, caller is adding a new + * SKB to the chain. */ list_del(&oldest->list); napi_gro_complete(oldest); @@ -5464,7 +5468,6 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff list_del(&pp->list); pp->next = NULL; napi_gro_complete(pp); - napi->gro_count--; napi->gro_hash[hash].count--; } @@ -5477,7 +5480,6 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff if (unlikely(napi->gro_hash[hash].count >= MAX_GRO_SKBS)) { gro_flush_oldest(gro_head); } else { - napi->gro_count++; napi->gro_hash[hash].count++; } NAPI_GRO_CB(skb)->count = 1; @@ -5492,6 +5494,13 @@ pull: if (grow > 0) gro_pull_from_frag0(skb, grow); ok: + if (napi->gro_hash[hash].count) { + if (!test_bit(hash, &napi->gro_bitmask)) + __set_bit(hash, &napi->gro_bitmask); + } else if (test_bit(hash, &napi->gro_bitmask)) { + __clear_bit(hash, &napi->gro_bitmask); + } + return ret; normal: @@ -5890,7 +5899,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done) NAPIF_STATE_IN_BUSY_POLL))) return false; - if (n->gro_count) { + if (n->gro_bitmask) { unsigned long timeout = 0; if (work_done) @@ -6099,7 +6108,7 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) /* Note : we use a relaxed variant of napi_schedule_prep() not setting * NAPI_STATE_MISSED, since we do not react to a device IRQ. */ - if (napi->gro_count && !napi_disable_pending(napi) && + if (napi->gro_bitmask && !napi_disable_pending(napi) && !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) __napi_schedule_irqoff(napi); @@ -6114,7 +6123,7 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi, INIT_LIST_HEAD(&napi->poll_list); hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); napi->timer.function = napi_watchdog; - napi->gro_count = 0; + napi->gro_bitmask = 0; for (i = 0; i < GRO_HASH_BUCKETS; i++) { INIT_LIST_HEAD(&napi->gro_hash[i].list); napi->gro_hash[i].count = 0; @@ -6174,7 +6183,7 @@ void netif_napi_del(struct napi_struct *napi) napi_free_frags(napi); flush_gro_hash(napi); - napi->gro_count = 0; + napi->gro_bitmask = 0; } EXPORT_SYMBOL(netif_napi_del); @@ -6216,7 +6225,7 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) goto out_unlock; } - if (n->gro_count) { + if (n->gro_bitmask) { /* flush too old packets * If HZ < 1000, flush all packets. */ @@ -9272,6 +9281,9 @@ static struct hlist_head * __net_init netdev_create_hash(void) /* Initialize per network namespace state */ static int __net_init netdev_init(struct net *net) { + BUILD_BUG_ON(GRO_HASH_BUCKETS > + FIELD_SIZEOF(struct napi_struct, gro_bitmask)); + if (net != &init_net) INIT_LIST_HEAD(&net->dev_base_head); -- cgit v1.2.3-59-g8ed1b From 2fe31e43124040a67495fa81f7ac67bc4c09ebc8 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Tue, 17 Jul 2018 21:48:10 +0200 Subject: hwmon: Add missing HWMON_T_LCRIT_ALARM define The enum hwmon_temp_lcrit_alarm exists, but the BIT definition is missing. Signed-off-by: Andrew Lunn Acked-by: Guenter Roeck Signed-off-by: David S. Miller --- include/linux/hwmon.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/hwmon.h b/include/linux/hwmon.h index e5fd2707b6df..1b74ad11a5a4 100644 --- a/include/linux/hwmon.h +++ b/include/linux/hwmon.h @@ -93,6 +93,7 @@ enum hwmon_temp_attributes { #define HWMON_T_MIN_ALARM BIT(hwmon_temp_min_alarm) #define HWMON_T_MAX_ALARM BIT(hwmon_temp_max_alarm) #define HWMON_T_CRIT_ALARM BIT(hwmon_temp_crit_alarm) +#define HWMON_T_LCRIT_ALARM BIT(hwmon_temp_lcrit_alarm) #define HWMON_T_EMERGENCY_ALARM BIT(hwmon_temp_emergency_alarm) #define HWMON_T_FAULT BIT(hwmon_temp_fault) #define HWMON_T_OFFSET BIT(hwmon_temp_offset) -- cgit v1.2.3-59-g8ed1b From aa7f29b07c8702127124d0522e3cd46850cdbc41 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Tue, 17 Jul 2018 21:48:11 +0200 Subject: hwmon: Add support for power min, lcrit, min_alarm and lcrit_alarm Some sensors support reporting minimal and lower critical power, as well as alarms when these thresholds are reached. Add support for these attributes to the hwmon core. Signed-off-by: Andrew Lunn Acked-by: Guenter Roeck Signed-off-by: David S. Miller --- drivers/hwmon/hwmon.c | 4 ++++ include/linux/hwmon.h | 8 ++++++++ 2 files changed, 12 insertions(+) (limited to 'include/linux') diff --git a/drivers/hwmon/hwmon.c b/drivers/hwmon/hwmon.c index e88c01961948..33d51281272b 100644 --- a/drivers/hwmon/hwmon.c +++ b/drivers/hwmon/hwmon.c @@ -394,12 +394,16 @@ static const char * const hwmon_power_attr_templates[] = { [hwmon_power_cap_hyst] = "power%d_cap_hyst", [hwmon_power_cap_max] = "power%d_cap_max", [hwmon_power_cap_min] = "power%d_cap_min", + [hwmon_power_min] = "power%d_min", [hwmon_power_max] = "power%d_max", + [hwmon_power_lcrit] = "power%d_lcrit", [hwmon_power_crit] = "power%d_crit", [hwmon_power_label] = "power%d_label", [hwmon_power_alarm] = "power%d_alarm", [hwmon_power_cap_alarm] = "power%d_cap_alarm", + [hwmon_power_min_alarm] = "power%d_min_alarm", [hwmon_power_max_alarm] = "power%d_max_alarm", + [hwmon_power_lcrit_alarm] = "power%d_lcrit_alarm", [hwmon_power_crit_alarm] = "power%d_crit_alarm", }; diff --git a/include/linux/hwmon.h b/include/linux/hwmon.h index 1b74ad11a5a4..b217101ca76e 100644 --- a/include/linux/hwmon.h +++ b/include/linux/hwmon.h @@ -188,12 +188,16 @@ enum hwmon_power_attributes { hwmon_power_cap_hyst, hwmon_power_cap_max, hwmon_power_cap_min, + hwmon_power_min, hwmon_power_max, hwmon_power_crit, + hwmon_power_lcrit, hwmon_power_label, hwmon_power_alarm, hwmon_power_cap_alarm, + hwmon_power_min_alarm, hwmon_power_max_alarm, + hwmon_power_lcrit_alarm, hwmon_power_crit_alarm, }; @@ -214,12 +218,16 @@ enum hwmon_power_attributes { #define HWMON_P_CAP_HYST BIT(hwmon_power_cap_hyst) #define HWMON_P_CAP_MAX BIT(hwmon_power_cap_max) #define HWMON_P_CAP_MIN BIT(hwmon_power_cap_min) +#define HWMON_P_MIN BIT(hwmon_power_min) #define HWMON_P_MAX BIT(hwmon_power_max) +#define HWMON_P_LCRIT BIT(hwmon_power_lcrit) #define HWMON_P_CRIT BIT(hwmon_power_crit) #define HWMON_P_LABEL BIT(hwmon_power_label) #define HWMON_P_ALARM BIT(hwmon_power_alarm) #define HWMON_P_CAP_ALARM BIT(hwmon_power_cap_alarm) +#define HWMON_P_MIN_ALARM BIT(hwmon_power_max_alarm) #define HWMON_P_MAX_ALARM BIT(hwmon_power_max_alarm) +#define HWMON_P_LCRIT_ALARM BIT(hwmon_power_lcrit_alarm) #define HWMON_P_CRIT_ALARM BIT(hwmon_power_crit_alarm) enum hwmon_energy_attributes { -- cgit v1.2.3-59-g8ed1b From dcb5d0fcaa1ddd0af9c18ef51e6b05e38a6cec71 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Tue, 17 Jul 2018 21:48:12 +0200 Subject: hwmon: Add helper to tell if a char is invalid in a name HWMON device names are not allowed to contain "-* \t\n". Add a helper which will return true if passed an invalid character. It can be used to massage a string into a hwmon compatible name by replacing invalid characters with '_'. Signed-off-by: Andrew Lunn Acked-by: Guenter Roeck Signed-off-by: David S. Miller --- include/linux/hwmon.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hwmon.h b/include/linux/hwmon.h index b217101ca76e..9493d4a388db 100644 --- a/include/linux/hwmon.h +++ b/include/linux/hwmon.h @@ -398,4 +398,27 @@ devm_hwmon_device_register_with_info(struct device *dev, void hwmon_device_unregister(struct device *dev); void devm_hwmon_device_unregister(struct device *dev); +/** + * hwmon_is_bad_char - Is the char invalid in a hwmon name + * @ch: the char to be considered + * + * hwmon_is_bad_char() can be used to determine if the given character + * may not be used in a hwmon name. + * + * Returns true if the char is invalid, false otherwise. + */ +static inline bool hwmon_is_bad_char(const char ch) +{ + switch (ch) { + case '-': + case '*': + case ' ': + case '\t': + case '\n': + return true; + default: + return false; + } +} + #endif -- cgit v1.2.3-59-g8ed1b From 1323061a018a7514287894a552c4ec2a5f0cb0cd Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Tue, 17 Jul 2018 21:48:13 +0200 Subject: net: phy: sfp: Add HWMON support for module sensors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SFP modules can contain a number of sensors. The EEPROM also contains recommended alarm and critical values for each sensor, and indications of if these have been exceeded. Export this information via HWMON. Currently temperature, VCC, bias current, transmit power, and possibly receiver power is supported. The sensors in the modules can either return calibrate or uncalibrated values. Uncalibrated values need to be manipulated, using coefficients provided in the SFP EEPROM. Uncalibrated receive power values require floating point maths in order to calibrate them. Performing this in the kernel is hard. So if the SFP module indicates it uses uncalibrated values, RX power is not made available. With this hwmon device, it is possible to view the sensor values using lm-sensors programs: in0: +3.29 V (crit min = +2.90 V, min = +3.00 V) (max = +3.60 V, crit max = +3.70 V) temp1: +33.0°C (low = -5.0°C, high = +80.0°C) (crit low = -10.0°C, crit = +85.0°C) power1: 1000.00 nW (max = 794.00 uW, min = 50.00 uW) ALARM (LCRIT) (lcrit = 40.00 uW, crit = 1000.00 uW) curr1: +0.00 A (crit min = +0.00 A, min = +0.00 A) ALARM (LCRIT, MIN) (max = +0.01 A, crit max = +0.01 A) The scaling sensors performs on the bias current is not particularly good. The raw values are more useful: curr1: curr1_input: 0.000 curr1_min: 0.002 curr1_max: 0.010 curr1_lcrit: 0.000 curr1_crit: 0.011 curr1_min_alarm: 1.000 curr1_max_alarm: 0.000 curr1_lcrit_alarm: 1.000 curr1_crit_alarm: 0.000 In order to keep the I2C overhead to a minimum, the constant values, such as limits and calibration coefficients are read once at module insertion time. Thus only reading *_input and *_alarm properties requires i2c read operations. Signed-off-by: Andrew Lunn Acked-by: Guenter Roeck Signed-off-by: David S. Miller --- drivers/net/phy/Kconfig | 1 + drivers/net/phy/sfp.c | 727 ++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/sfp.h | 72 ++++- 3 files changed, 799 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig index ceede09a2845..9beac427f9e8 100644 --- a/drivers/net/phy/Kconfig +++ b/drivers/net/phy/Kconfig @@ -215,6 +215,7 @@ config SFP tristate "SFP cage support" depends on I2C && PHYLINK select MDIO_I2C + imply HWMON config AMD_PHY tristate "AMD PHYs" diff --git a/drivers/net/phy/sfp.c b/drivers/net/phy/sfp.c index c4c92db86dfa..5661226cf75b 100644 --- a/drivers/net/phy/sfp.c +++ b/drivers/net/phy/sfp.c @@ -1,5 +1,7 @@ +#include #include #include +#include #include #include #include @@ -131,6 +133,12 @@ struct sfp { unsigned int sm_retries; struct sfp_eeprom_id id; +#if IS_ENABLED(CONFIG_HWMON) + struct sfp_diag diag; + struct device *hwmon_dev; + char *hwmon_name; +#endif + }; static bool sff_module_supported(const struct sfp_eeprom_id *id) @@ -316,6 +324,719 @@ static unsigned int sfp_check(void *buf, size_t len) return check; } +/* hwmon */ +#if IS_ENABLED(CONFIG_HWMON) +static umode_t sfp_hwmon_is_visible(const void *data, + enum hwmon_sensor_types type, + u32 attr, int channel) +{ + const struct sfp *sfp = data; + + switch (type) { + case hwmon_temp: + switch (attr) { + case hwmon_temp_input: + case hwmon_temp_min_alarm: + case hwmon_temp_max_alarm: + case hwmon_temp_lcrit_alarm: + case hwmon_temp_crit_alarm: + case hwmon_temp_min: + case hwmon_temp_max: + case hwmon_temp_lcrit: + case hwmon_temp_crit: + return 0444; + default: + return 0; + } + case hwmon_in: + switch (attr) { + case hwmon_in_input: + case hwmon_in_min_alarm: + case hwmon_in_max_alarm: + case hwmon_in_lcrit_alarm: + case hwmon_in_crit_alarm: + case hwmon_in_min: + case hwmon_in_max: + case hwmon_in_lcrit: + case hwmon_in_crit: + return 0444; + default: + return 0; + } + case hwmon_curr: + switch (attr) { + case hwmon_curr_input: + case hwmon_curr_min_alarm: + case hwmon_curr_max_alarm: + case hwmon_curr_lcrit_alarm: + case hwmon_curr_crit_alarm: + case hwmon_curr_min: + case hwmon_curr_max: + case hwmon_curr_lcrit: + case hwmon_curr_crit: + return 0444; + default: + return 0; + } + case hwmon_power: + /* External calibration of receive power requires + * floating point arithmetic. Doing that in the kernel + * is not easy, so just skip it. If the module does + * not require external calibration, we can however + * show receiver power, since FP is then not needed. + */ + if (sfp->id.ext.diagmon & SFP_DIAGMON_EXT_CAL && + channel == 1) + return 0; + switch (attr) { + case hwmon_power_input: + case hwmon_power_min_alarm: + case hwmon_power_max_alarm: + case hwmon_power_lcrit_alarm: + case hwmon_power_crit_alarm: + case hwmon_power_min: + case hwmon_power_max: + case hwmon_power_lcrit: + case hwmon_power_crit: + return 0444; + default: + return 0; + } + default: + return 0; + } +} + +static int sfp_hwmon_read_sensor(struct sfp *sfp, int reg, long *value) +{ + __be16 val; + int err; + + err = sfp_read(sfp, true, reg, &val, sizeof(val)); + if (err < 0) + return err; + + *value = be16_to_cpu(val); + + return 0; +} + +static void sfp_hwmon_to_rx_power(long *value) +{ + *value = DIV_ROUND_CLOSEST(*value, 100); +} + +static void sfp_hwmon_calibrate(struct sfp *sfp, unsigned int slope, int offset, + long *value) +{ + if (sfp->id.ext.diagmon & SFP_DIAGMON_EXT_CAL) + *value = DIV_ROUND_CLOSEST(*value * slope, 256) + offset; +} + +static void sfp_hwmon_calibrate_temp(struct sfp *sfp, long *value) +{ + sfp_hwmon_calibrate(sfp, be16_to_cpu(sfp->diag.cal_t_slope), + be16_to_cpu(sfp->diag.cal_t_offset), value); + + if (*value >= 0x8000) + *value -= 0x10000; + + *value = DIV_ROUND_CLOSEST(*value * 1000, 256); +} + +static void sfp_hwmon_calibrate_vcc(struct sfp *sfp, long *value) +{ + sfp_hwmon_calibrate(sfp, be16_to_cpu(sfp->diag.cal_v_slope), + be16_to_cpu(sfp->diag.cal_v_offset), value); + + *value = DIV_ROUND_CLOSEST(*value, 10); +} + +static void sfp_hwmon_calibrate_bias(struct sfp *sfp, long *value) +{ + sfp_hwmon_calibrate(sfp, be16_to_cpu(sfp->diag.cal_txi_slope), + be16_to_cpu(sfp->diag.cal_txi_offset), value); + + *value = DIV_ROUND_CLOSEST(*value, 500); +} + +static void sfp_hwmon_calibrate_tx_power(struct sfp *sfp, long *value) +{ + sfp_hwmon_calibrate(sfp, be16_to_cpu(sfp->diag.cal_txpwr_slope), + be16_to_cpu(sfp->diag.cal_txpwr_offset), value); + + *value = DIV_ROUND_CLOSEST(*value, 10); +} + +static int sfp_hwmon_read_temp(struct sfp *sfp, int reg, long *value) +{ + int err; + + err = sfp_hwmon_read_sensor(sfp, reg, value); + if (err < 0) + return err; + + sfp_hwmon_calibrate_temp(sfp, value); + + return 0; +} + +static int sfp_hwmon_read_vcc(struct sfp *sfp, int reg, long *value) +{ + int err; + + err = sfp_hwmon_read_sensor(sfp, reg, value); + if (err < 0) + return err; + + sfp_hwmon_calibrate_vcc(sfp, value); + + return 0; +} + +static int sfp_hwmon_read_bias(struct sfp *sfp, int reg, long *value) +{ + int err; + + err = sfp_hwmon_read_sensor(sfp, reg, value); + if (err < 0) + return err; + + sfp_hwmon_calibrate_bias(sfp, value); + + return 0; +} + +static int sfp_hwmon_read_tx_power(struct sfp *sfp, int reg, long *value) +{ + int err; + + err = sfp_hwmon_read_sensor(sfp, reg, value); + if (err < 0) + return err; + + sfp_hwmon_calibrate_tx_power(sfp, value); + + return 0; +} + +static int sfp_hwmon_read_rx_power(struct sfp *sfp, int reg, long *value) +{ + int err; + + err = sfp_hwmon_read_sensor(sfp, reg, value); + if (err < 0) + return err; + + sfp_hwmon_to_rx_power(value); + + return 0; +} + +static int sfp_hwmon_temp(struct sfp *sfp, u32 attr, long *value) +{ + u8 status; + int err; + + switch (attr) { + case hwmon_temp_input: + return sfp_hwmon_read_temp(sfp, SFP_TEMP, value); + + case hwmon_temp_lcrit: + *value = be16_to_cpu(sfp->diag.temp_low_alarm); + sfp_hwmon_calibrate_temp(sfp, value); + return 0; + + case hwmon_temp_min: + *value = be16_to_cpu(sfp->diag.temp_low_warn); + sfp_hwmon_calibrate_temp(sfp, value); + return 0; + case hwmon_temp_max: + *value = be16_to_cpu(sfp->diag.temp_high_warn); + sfp_hwmon_calibrate_temp(sfp, value); + return 0; + + case hwmon_temp_crit: + *value = be16_to_cpu(sfp->diag.temp_high_alarm); + sfp_hwmon_calibrate_temp(sfp, value); + return 0; + + case hwmon_temp_lcrit_alarm: + err = sfp_read(sfp, true, SFP_ALARM0, &status, sizeof(status)); + if (err < 0) + return err; + + *value = !!(status & SFP_ALARM0_TEMP_LOW); + return 0; + + case hwmon_temp_min_alarm: + err = sfp_read(sfp, true, SFP_WARN0, &status, sizeof(status)); + if (err < 0) + return err; + + *value = !!(status & SFP_WARN0_TEMP_LOW); + return 0; + + case hwmon_temp_max_alarm: + err = sfp_read(sfp, true, SFP_WARN0, &status, sizeof(status)); + if (err < 0) + return err; + + *value = !!(status & SFP_WARN0_TEMP_HIGH); + return 0; + + case hwmon_temp_crit_alarm: + err = sfp_read(sfp, true, SFP_ALARM0, &status, sizeof(status)); + if (err < 0) + return err; + + *value = !!(status & SFP_ALARM0_TEMP_HIGH); + return 0; + default: + return -EOPNOTSUPP; + } + + return -EOPNOTSUPP; +} + +static int sfp_hwmon_vcc(struct sfp *sfp, u32 attr, long *value) +{ + u8 status; + int err; + + switch (attr) { + case hwmon_in_input: + return sfp_hwmon_read_vcc(sfp, SFP_VCC, value); + + case hwmon_in_lcrit: + *value = be16_to_cpu(sfp->diag.volt_low_alarm); + sfp_hwmon_calibrate_vcc(sfp, value); + return 0; + + case hwmon_in_min: + *value = be16_to_cpu(sfp->diag.volt_low_warn); + sfp_hwmon_calibrate_vcc(sfp, value); + return 0; + + case hwmon_in_max: + *value = be16_to_cpu(sfp->diag.volt_high_warn); + sfp_hwmon_calibrate_vcc(sfp, value); + return 0; + + case hwmon_in_crit: + *value = be16_to_cpu(sfp->diag.volt_high_alarm); + sfp_hwmon_calibrate_vcc(sfp, value); + return 0; + + case hwmon_in_lcrit_alarm: + err = sfp_read(sfp, true, SFP_ALARM0, &status, sizeof(status)); + if (err < 0) + return err; + + *value = !!(status & SFP_ALARM0_VCC_LOW); + return 0; + + case hwmon_in_min_alarm: + err = sfp_read(sfp, true, SFP_WARN0, &status, sizeof(status)); + if (err < 0) + return err; + + *value = !!(status & SFP_WARN0_VCC_LOW); + return 0; + + case hwmon_in_max_alarm: + err = sfp_read(sfp, true, SFP_WARN0, &status, sizeof(status)); + if (err < 0) + return err; + + *value = !!(status & SFP_WARN0_VCC_HIGH); + return 0; + + case hwmon_in_crit_alarm: + err = sfp_read(sfp, true, SFP_ALARM0, &status, sizeof(status)); + if (err < 0) + return err; + + *value = !!(status & SFP_ALARM0_VCC_HIGH); + return 0; + default: + return -EOPNOTSUPP; + } + + return -EOPNOTSUPP; +} + +static int sfp_hwmon_bias(struct sfp *sfp, u32 attr, long *value) +{ + u8 status; + int err; + + switch (attr) { + case hwmon_curr_input: + return sfp_hwmon_read_bias(sfp, SFP_TX_BIAS, value); + + case hwmon_curr_lcrit: + *value = be16_to_cpu(sfp->diag.bias_low_alarm); + sfp_hwmon_calibrate_bias(sfp, value); + return 0; + + case hwmon_curr_min: + *value = be16_to_cpu(sfp->diag.bias_low_warn); + sfp_hwmon_calibrate_bias(sfp, value); + return 0; + + case hwmon_curr_max: + *value = be16_to_cpu(sfp->diag.bias_high_warn); + sfp_hwmon_calibrate_bias(sfp, value); + return 0; + + case hwmon_curr_crit: + *value = be16_to_cpu(sfp->diag.bias_high_alarm); + sfp_hwmon_calibrate_bias(sfp, value); + return 0; + + case hwmon_curr_lcrit_alarm: + err = sfp_read(sfp, true, SFP_ALARM0, &status, sizeof(status)); + if (err < 0) + return err; + + *value = !!(status & SFP_ALARM0_TX_BIAS_LOW); + return 0; + + case hwmon_curr_min_alarm: + err = sfp_read(sfp, true, SFP_WARN0, &status, sizeof(status)); + if (err < 0) + return err; + + *value = !!(status & SFP_WARN0_TX_BIAS_LOW); + return 0; + + case hwmon_curr_max_alarm: + err = sfp_read(sfp, true, SFP_WARN0, &status, sizeof(status)); + if (err < 0) + return err; + + *value = !!(status & SFP_WARN0_TX_BIAS_HIGH); + return 0; + + case hwmon_curr_crit_alarm: + err = sfp_read(sfp, true, SFP_ALARM0, &status, sizeof(status)); + if (err < 0) + return err; + + *value = !!(status & SFP_ALARM0_TX_BIAS_HIGH); + return 0; + default: + return -EOPNOTSUPP; + } + + return -EOPNOTSUPP; +} + +static int sfp_hwmon_tx_power(struct sfp *sfp, u32 attr, long *value) +{ + u8 status; + int err; + + switch (attr) { + case hwmon_power_input: + return sfp_hwmon_read_tx_power(sfp, SFP_TX_POWER, value); + + case hwmon_power_lcrit: + *value = be16_to_cpu(sfp->diag.txpwr_low_alarm); + sfp_hwmon_calibrate_tx_power(sfp, value); + return 0; + + case hwmon_power_min: + *value = be16_to_cpu(sfp->diag.txpwr_low_warn); + sfp_hwmon_calibrate_tx_power(sfp, value); + return 0; + + case hwmon_power_max: + *value = be16_to_cpu(sfp->diag.txpwr_high_warn); + sfp_hwmon_calibrate_tx_power(sfp, value); + return 0; + + case hwmon_power_crit: + *value = be16_to_cpu(sfp->diag.txpwr_high_alarm); + sfp_hwmon_calibrate_tx_power(sfp, value); + return 0; + + case hwmon_power_lcrit_alarm: + err = sfp_read(sfp, true, SFP_ALARM0, &status, sizeof(status)); + if (err < 0) + return err; + + *value = !!(status & SFP_ALARM0_TXPWR_LOW); + return 0; + + case hwmon_power_min_alarm: + err = sfp_read(sfp, true, SFP_WARN0, &status, sizeof(status)); + if (err < 0) + return err; + + *value = !!(status & SFP_WARN0_TXPWR_LOW); + return 0; + + case hwmon_power_max_alarm: + err = sfp_read(sfp, true, SFP_WARN0, &status, sizeof(status)); + if (err < 0) + return err; + + *value = !!(status & SFP_WARN0_TXPWR_HIGH); + return 0; + + case hwmon_power_crit_alarm: + err = sfp_read(sfp, true, SFP_ALARM0, &status, sizeof(status)); + if (err < 0) + return err; + + *value = !!(status & SFP_ALARM0_TXPWR_HIGH); + return 0; + default: + return -EOPNOTSUPP; + } + + return -EOPNOTSUPP; +} + +static int sfp_hwmon_rx_power(struct sfp *sfp, u32 attr, long *value) +{ + u8 status; + int err; + + switch (attr) { + case hwmon_power_input: + return sfp_hwmon_read_rx_power(sfp, SFP_RX_POWER, value); + + case hwmon_power_lcrit: + *value = be16_to_cpu(sfp->diag.rxpwr_low_alarm); + sfp_hwmon_to_rx_power(value); + return 0; + + case hwmon_power_min: + *value = be16_to_cpu(sfp->diag.rxpwr_low_warn); + sfp_hwmon_to_rx_power(value); + return 0; + + case hwmon_power_max: + *value = be16_to_cpu(sfp->diag.rxpwr_high_warn); + sfp_hwmon_to_rx_power(value); + return 0; + + case hwmon_power_crit: + *value = be16_to_cpu(sfp->diag.rxpwr_high_alarm); + sfp_hwmon_to_rx_power(value); + return 0; + + case hwmon_power_lcrit_alarm: + err = sfp_read(sfp, true, SFP_ALARM1, &status, sizeof(status)); + if (err < 0) + return err; + + *value = !!(status & SFP_ALARM1_RXPWR_LOW); + return 0; + + case hwmon_power_min_alarm: + err = sfp_read(sfp, true, SFP_WARN1, &status, sizeof(status)); + if (err < 0) + return err; + + *value = !!(status & SFP_WARN1_RXPWR_LOW); + return 0; + + case hwmon_power_max_alarm: + err = sfp_read(sfp, true, SFP_WARN1, &status, sizeof(status)); + if (err < 0) + return err; + + *value = !!(status & SFP_WARN1_RXPWR_HIGH); + return 0; + + case hwmon_power_crit_alarm: + err = sfp_read(sfp, true, SFP_ALARM1, &status, sizeof(status)); + if (err < 0) + return err; + + *value = !!(status & SFP_ALARM1_RXPWR_HIGH); + return 0; + default: + return -EOPNOTSUPP; + } + + return -EOPNOTSUPP; +} + +static int sfp_hwmon_read(struct device *dev, enum hwmon_sensor_types type, + u32 attr, int channel, long *value) +{ + struct sfp *sfp = dev_get_drvdata(dev); + + switch (type) { + case hwmon_temp: + return sfp_hwmon_temp(sfp, attr, value); + case hwmon_in: + return sfp_hwmon_vcc(sfp, attr, value); + case hwmon_curr: + return sfp_hwmon_bias(sfp, attr, value); + case hwmon_power: + switch (channel) { + case 0: + return sfp_hwmon_tx_power(sfp, attr, value); + case 1: + return sfp_hwmon_rx_power(sfp, attr, value); + default: + return -EOPNOTSUPP; + } + default: + return -EOPNOTSUPP; + } +} + +static const struct hwmon_ops sfp_hwmon_ops = { + .is_visible = sfp_hwmon_is_visible, + .read = sfp_hwmon_read, +}; + +static u32 sfp_hwmon_chip_config[] = { + HWMON_C_REGISTER_TZ, + 0, +}; + +static const struct hwmon_channel_info sfp_hwmon_chip = { + .type = hwmon_chip, + .config = sfp_hwmon_chip_config, +}; + +static u32 sfp_hwmon_temp_config[] = { + HWMON_T_INPUT | + HWMON_T_MAX | HWMON_T_MIN | + HWMON_T_MAX_ALARM | HWMON_T_MIN_ALARM | + HWMON_T_CRIT | HWMON_T_LCRIT | + HWMON_T_CRIT_ALARM | HWMON_T_LCRIT_ALARM, + 0, +}; + +static const struct hwmon_channel_info sfp_hwmon_temp_channel_info = { + .type = hwmon_temp, + .config = sfp_hwmon_temp_config, +}; + +static u32 sfp_hwmon_vcc_config[] = { + HWMON_I_INPUT | + HWMON_I_MAX | HWMON_I_MIN | + HWMON_I_MAX_ALARM | HWMON_I_MIN_ALARM | + HWMON_I_CRIT | HWMON_I_LCRIT | + HWMON_I_CRIT_ALARM | HWMON_I_LCRIT_ALARM, + 0, +}; + +static const struct hwmon_channel_info sfp_hwmon_vcc_channel_info = { + .type = hwmon_in, + .config = sfp_hwmon_vcc_config, +}; + +static u32 sfp_hwmon_bias_config[] = { + HWMON_C_INPUT | + HWMON_C_MAX | HWMON_C_MIN | + HWMON_C_MAX_ALARM | HWMON_C_MIN_ALARM | + HWMON_C_CRIT | HWMON_C_LCRIT | + HWMON_C_CRIT_ALARM | HWMON_C_LCRIT_ALARM, + 0, +}; + +static const struct hwmon_channel_info sfp_hwmon_bias_channel_info = { + .type = hwmon_curr, + .config = sfp_hwmon_bias_config, +}; + +static u32 sfp_hwmon_power_config[] = { + /* Transmit power */ + HWMON_P_INPUT | + HWMON_P_MAX | HWMON_P_MIN | + HWMON_P_MAX_ALARM | HWMON_P_MIN_ALARM | + HWMON_P_CRIT | HWMON_P_LCRIT | + HWMON_P_CRIT_ALARM | HWMON_P_LCRIT_ALARM, + /* Receive power */ + HWMON_P_INPUT | + HWMON_P_MAX | HWMON_P_MIN | + HWMON_P_MAX_ALARM | HWMON_P_MIN_ALARM | + HWMON_P_CRIT | HWMON_P_LCRIT | + HWMON_P_CRIT_ALARM | HWMON_P_LCRIT_ALARM, + 0, +}; + +static const struct hwmon_channel_info sfp_hwmon_power_channel_info = { + .type = hwmon_power, + .config = sfp_hwmon_power_config, +}; + +static const struct hwmon_channel_info *sfp_hwmon_info[] = { + &sfp_hwmon_chip, + &sfp_hwmon_vcc_channel_info, + &sfp_hwmon_temp_channel_info, + &sfp_hwmon_bias_channel_info, + &sfp_hwmon_power_channel_info, + NULL, +}; + +static const struct hwmon_chip_info sfp_hwmon_chip_info = { + .ops = &sfp_hwmon_ops, + .info = sfp_hwmon_info, +}; + +static int sfp_hwmon_insert(struct sfp *sfp) +{ + int err, i; + + if (sfp->id.ext.sff8472_compliance == SFP_SFF8472_COMPLIANCE_NONE) + return 0; + + if (!(sfp->id.ext.diagmon & SFP_DIAGMON_DDM)) + return 0; + + if (sfp->id.ext.diagmon & SFP_DIAGMON_ADDRMODE) + /* This driver in general does not support address + * change. + */ + return 0; + + err = sfp_read(sfp, true, 0, &sfp->diag, sizeof(sfp->diag)); + if (err < 0) + return err; + + sfp->hwmon_name = kstrdup(dev_name(sfp->dev), GFP_KERNEL); + if (!sfp->hwmon_name) + return -ENODEV; + + for (i = 0; sfp->hwmon_name[i]; i++) + if (hwmon_is_bad_char(sfp->hwmon_name[i])) + sfp->hwmon_name[i] = '_'; + + sfp->hwmon_dev = hwmon_device_register_with_info(sfp->dev, + sfp->hwmon_name, sfp, + &sfp_hwmon_chip_info, + NULL); + + return PTR_ERR_OR_ZERO(sfp->hwmon_dev); +} + +static void sfp_hwmon_remove(struct sfp *sfp) +{ + hwmon_device_unregister(sfp->hwmon_dev); + kfree(sfp->hwmon_name); +} +#else +static int sfp_hwmon_insert(struct sfp *sfp) +{ + return 0; +} + +static void sfp_hwmon_remove(struct sfp *sfp) +{ +} +#endif + /* Helpers */ static void sfp_module_tx_disable(struct sfp *sfp) { @@ -636,6 +1357,10 @@ static int sfp_sm_mod_probe(struct sfp *sfp) dev_warn(sfp->dev, "module address swap to access page 0xA2 is not supported.\n"); + ret = sfp_hwmon_insert(sfp); + if (ret < 0) + return ret; + ret = sfp_module_insert(sfp->sfp_bus, &sfp->id); if (ret < 0) return ret; @@ -647,6 +1372,8 @@ static void sfp_sm_mod_remove(struct sfp *sfp) { sfp_module_remove(sfp->sfp_bus); + sfp_hwmon_remove(sfp); + if (sfp->mod_phy) sfp_sm_phy_detach(sfp); diff --git a/include/linux/sfp.h b/include/linux/sfp.h index ebce9e24906a..d37518e89db2 100644 --- a/include/linux/sfp.h +++ b/include/linux/sfp.h @@ -231,6 +231,50 @@ struct sfp_eeprom_id { struct sfp_eeprom_ext ext; } __packed; +struct sfp_diag { + __be16 temp_high_alarm; + __be16 temp_low_alarm; + __be16 temp_high_warn; + __be16 temp_low_warn; + __be16 volt_high_alarm; + __be16 volt_low_alarm; + __be16 volt_high_warn; + __be16 volt_low_warn; + __be16 bias_high_alarm; + __be16 bias_low_alarm; + __be16 bias_high_warn; + __be16 bias_low_warn; + __be16 txpwr_high_alarm; + __be16 txpwr_low_alarm; + __be16 txpwr_high_warn; + __be16 txpwr_low_warn; + __be16 rxpwr_high_alarm; + __be16 rxpwr_low_alarm; + __be16 rxpwr_high_warn; + __be16 rxpwr_low_warn; + __be16 laser_temp_high_alarm; + __be16 laser_temp_low_alarm; + __be16 laser_temp_high_warn; + __be16 laser_temp_low_warn; + __be16 tec_cur_high_alarm; + __be16 tec_cur_low_alarm; + __be16 tec_cur_high_warn; + __be16 tec_cur_low_warn; + __be32 cal_rxpwr4; + __be32 cal_rxpwr3; + __be32 cal_rxpwr2; + __be32 cal_rxpwr1; + __be32 cal_rxpwr0; + __be16 cal_txi_slope; + __be16 cal_txi_offset; + __be16 cal_txpwr_slope; + __be16 cal_txpwr_offset; + __be16 cal_t_slope; + __be16 cal_t_offset; + __be16 cal_v_slope; + __be16 cal_v_offset; +} __packed; + /* SFP EEPROM registers */ enum { SFP_PHYS_ID = 0x00, @@ -384,7 +428,33 @@ enum { SFP_TEC_CUR = 0x6c, SFP_STATUS = 0x6e, - SFP_ALARM = 0x70, + SFP_ALARM0 = 0x70, + SFP_ALARM0_TEMP_HIGH = BIT(7), + SFP_ALARM0_TEMP_LOW = BIT(6), + SFP_ALARM0_VCC_HIGH = BIT(5), + SFP_ALARM0_VCC_LOW = BIT(4), + SFP_ALARM0_TX_BIAS_HIGH = BIT(3), + SFP_ALARM0_TX_BIAS_LOW = BIT(2), + SFP_ALARM0_TXPWR_HIGH = BIT(1), + SFP_ALARM0_TXPWR_LOW = BIT(0), + + SFP_ALARM1 = 0x71, + SFP_ALARM1_RXPWR_HIGH = BIT(7), + SFP_ALARM1_RXPWR_LOW = BIT(6), + + SFP_WARN0 = 0x74, + SFP_WARN0_TEMP_HIGH = BIT(7), + SFP_WARN0_TEMP_LOW = BIT(6), + SFP_WARN0_VCC_HIGH = BIT(5), + SFP_WARN0_VCC_LOW = BIT(4), + SFP_WARN0_TX_BIAS_HIGH = BIT(3), + SFP_WARN0_TX_BIAS_LOW = BIT(2), + SFP_WARN0_TXPWR_HIGH = BIT(1), + SFP_WARN0_TXPWR_LOW = BIT(0), + + SFP_WARN1 = 0x75, + SFP_WARN1_RXPWR_HIGH = BIT(7), + SFP_WARN1_RXPWR_LOW = BIT(6), SFP_EXT_STATUS = 0x76, SFP_VSL = 0x78, -- cgit v1.2.3-59-g8ed1b From be2ab5b4d5c0bf041a34ec2e1397d50afbfb095e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 11 Jul 2018 13:45:12 +0200 Subject: netfilter: nf_tables: take module reference when starting a batch Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nfnetlink.h | 1 + net/netfilter/nf_tables_api.c | 1 + net/netfilter/nfnetlink.c | 9 +++++++++ 3 files changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h index 3ecc3050be0e..4a520d3304a2 100644 --- a/include/linux/netfilter/nfnetlink.h +++ b/include/linux/netfilter/nfnetlink.h @@ -29,6 +29,7 @@ struct nfnetlink_subsystem { __u8 subsys_id; /* nfnetlink subsystem ID */ __u8 cb_count; /* number of callbacks */ const struct nfnl_callback *cb; /* callback for individual types */ + struct module *owner; int (*commit)(struct net *net, struct sk_buff *skb); int (*abort)(struct net *net, struct sk_buff *skb); void (*cleanup)(struct net *net); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 594b395442d6..c16c481fc52a 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6603,6 +6603,7 @@ static const struct nfnetlink_subsystem nf_tables_subsys = { .abort = nf_tables_abort, .cleanup = nf_tables_cleanup, .valid_genid = nf_tables_valid_genid, + .owner = THIS_MODULE, }; int nft_chain_validate_dependency(const struct nft_chain *chain, diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 94f9bcaa0799..dd1d7bc23b03 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -337,7 +337,14 @@ replay: return kfree_skb(skb); } + if (!try_module_get(ss->owner)) { + nfnl_unlock(subsys_id); + netlink_ack(oskb, nlh, -EOPNOTSUPP, NULL); + return kfree_skb(skb); + } + if (!ss->valid_genid(net, genid)) { + module_put(ss->owner); nfnl_unlock(subsys_id); netlink_ack(oskb, nlh, -ERESTART, NULL); return kfree_skb(skb); @@ -472,6 +479,7 @@ done: nfnl_err_reset(&err_list); nfnl_unlock(subsys_id); kfree_skb(skb); + module_put(ss->owner); goto replay; } else if (status == NFNL_BATCH_DONE) { err = ss->commit(net, oskb); @@ -491,6 +499,7 @@ done: nfnl_err_deliver(&err_list, oskb); nfnl_unlock(subsys_id); kfree_skb(skb); + module_put(ss->owner); } static const struct nla_policy nfnl_batch_policy[NFNL_BATCH_MAX + 1] = { -- cgit v1.2.3-59-g8ed1b From d29ab6e1fa21ebc2a8a771015dd9e0e5d4e28dc1 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 13 Jul 2018 12:41:10 -0700 Subject: bpf: bpf_prog_array_alloc() should return a generic non-rcu pointer Currently the return type of the bpf_prog_array_alloc() is struct bpf_prog_array __rcu *, which is not quite correct. Obviously, the returned pointer is a generic pointer, which is valid for an indefinite amount of time and it's not shared with anyone else, so there is no sense in marking it as __rcu. This change eliminate the following sparse warnings: kernel/bpf/core.c:1544:31: warning: incorrect type in return expression (different address spaces) kernel/bpf/core.c:1544:31: expected struct bpf_prog_array [noderef] * kernel/bpf/core.c:1544:31: got void * kernel/bpf/core.c:1548:17: warning: incorrect type in return expression (different address spaces) kernel/bpf/core.c:1548:17: expected struct bpf_prog_array [noderef] * kernel/bpf/core.c:1548:17: got struct bpf_prog_array * kernel/bpf/core.c:1681:15: warning: incorrect type in assignment (different address spaces) kernel/bpf/core.c:1681:15: expected struct bpf_prog_array *array kernel/bpf/core.c:1681:15: got struct bpf_prog_array [noderef] * Fixes: 324bda9e6c5a ("bpf: multi program support for cgroup+bpf") Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 2 +- kernel/bpf/core.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8827e797ff97..943fb08d8287 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -352,7 +352,7 @@ struct bpf_prog_array { struct bpf_prog *progs[0]; }; -struct bpf_prog_array __rcu *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags); +struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags); void bpf_prog_array_free(struct bpf_prog_array __rcu *progs); int bpf_prog_array_length(struct bpf_prog_array __rcu *progs); int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 1e5625d46414..253aa8e79c7b 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1538,7 +1538,7 @@ static struct { .null_prog = NULL, }; -struct bpf_prog_array __rcu *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags) +struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags) { if (prog_cnt) return kzalloc(sizeof(struct bpf_prog_array) + -- cgit v1.2.3-59-g8ed1b From 09728266b6f99ab57cd4f84f3eead65b7b65dbf7 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 17 Jul 2018 10:53:23 -0700 Subject: bpf: offload: rename bpf_offload_dev_match() to bpf_offload_prog_map_match() A set of new API functions exported for the drivers will soon use 'bpf_offload_dev_' as a prefix. Rename the bpf_offload_dev_match() which is internal to the core (used by the verifier) to avoid any confusion. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 2 +- kernel/bpf/offload.c | 2 +- kernel/bpf/verifier.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 943fb08d8287..9b010d9129f3 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -648,7 +648,7 @@ int bpf_map_offload_delete_elem(struct bpf_map *map, void *key); int bpf_map_offload_get_next_key(struct bpf_map *map, void *key, void *next_key); -bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map); +bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map); #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL) int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr); diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index ac747d5cf7c6..6184e48703f4 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -468,7 +468,7 @@ int bpf_map_offload_info_fill(struct bpf_map_info *info, struct bpf_map *map) return 0; } -bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map) +bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map) { struct bpf_offloaded_map *offmap; struct bpf_prog_offload *offload; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9e2bf834f13a..15d69b278277 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5054,7 +5054,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, } if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) && - !bpf_offload_dev_match(prog, map)) { + !bpf_offload_prog_map_match(prog, map)) { verbose(env, "offload device mismatch between prog and map\n"); return -EINVAL; } -- cgit v1.2.3-59-g8ed1b From 9fd7c5559165f4c679b40c5e6ad442955832dfad Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 17 Jul 2018 10:53:24 -0700 Subject: bpf: offload: aggregate offloads per-device Currently we have two lists of offloaded objects - programs and maps. Netdevice unregister notifier scans those lists to orphan objects associated with device being unregistered. This puts unnecessary (even if negligible) burden on all netdev unregister calls in BPF- -enabled kernel. The lists of objects may potentially get long making the linear scan even more problematic. There haven't been complaints about this mechanisms so far, but it is suboptimal. Instead of relying on notifiers, make the few BPF-capable drivers register explicitly for BPF offloads. The programs and maps will now be collected per-device not on a global list, and only scanned for removal when driver unregisters from BPF offloads. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- drivers/net/ethernet/netronome/nfp/bpf/main.c | 13 +++ drivers/net/netdevsim/bpf.c | 7 ++ include/linux/bpf.h | 3 + kernel/bpf/offload.c | 142 +++++++++++++++++--------- 4 files changed, 119 insertions(+), 46 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.c b/drivers/net/ethernet/netronome/nfp/bpf/main.c index b95b94d008cf..dee039ada75c 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.c @@ -404,6 +404,16 @@ err_release_free: return -EINVAL; } +static int nfp_bpf_ndo_init(struct nfp_app *app, struct net_device *netdev) +{ + return bpf_offload_dev_netdev_register(netdev); +} + +static void nfp_bpf_ndo_uninit(struct nfp_app *app, struct net_device *netdev) +{ + bpf_offload_dev_netdev_unregister(netdev); +} + static int nfp_bpf_init(struct nfp_app *app) { struct nfp_app_bpf *bpf; @@ -466,6 +476,9 @@ const struct nfp_app_type app_bpf = { .extra_cap = nfp_bpf_extra_cap, + .ndo_init = nfp_bpf_ndo_init, + .ndo_uninit = nfp_bpf_ndo_uninit, + .vnic_alloc = nfp_bpf_vnic_alloc, .vnic_free = nfp_bpf_vnic_free, diff --git a/drivers/net/netdevsim/bpf.c b/drivers/net/netdevsim/bpf.c index 357f9e62f306..c4a2829e0e1f 100644 --- a/drivers/net/netdevsim/bpf.c +++ b/drivers/net/netdevsim/bpf.c @@ -582,6 +582,8 @@ int nsim_bpf(struct net_device *dev, struct netdev_bpf *bpf) int nsim_bpf_init(struct netdevsim *ns) { + int err; + if (ns->sdev->refcnt == 1) { INIT_LIST_HEAD(&ns->sdev->bpf_bound_progs); INIT_LIST_HEAD(&ns->sdev->bpf_bound_maps); @@ -592,6 +594,10 @@ int nsim_bpf_init(struct netdevsim *ns) return -ENOMEM; } + err = bpf_offload_dev_netdev_register(ns->netdev); + if (err) + return err; + debugfs_create_u32("bpf_offloaded_id", 0400, ns->ddir, &ns->bpf_offloaded_id); @@ -625,6 +631,7 @@ void nsim_bpf_uninit(struct netdevsim *ns) WARN_ON(ns->xdp.prog); WARN_ON(ns->xdp_hw.prog); WARN_ON(ns->bpf_offloaded); + bpf_offload_dev_netdev_unregister(ns->netdev); if (ns->sdev->refcnt == 1) { WARN_ON(!list_empty(&ns->sdev->bpf_bound_progs)); diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9b010d9129f3..aa2e834a122b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -650,6 +650,9 @@ int bpf_map_offload_get_next_key(struct bpf_map *map, bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map); +int bpf_offload_dev_netdev_register(struct net_device *netdev); +void bpf_offload_dev_netdev_unregister(struct net_device *netdev); + #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL) int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr); diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 6184e48703f4..cd64a26807aa 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -18,19 +18,37 @@ #include #include #include +#include #include #include #include +#include #include #include -/* Protects bpf_prog_offload_devs, bpf_map_offload_devs and offload members +/* Protects offdevs, members of bpf_offload_netdev and offload members * of all progs. * RTNL lock cannot be taken when holding this lock. */ static DECLARE_RWSEM(bpf_devs_lock); -static LIST_HEAD(bpf_prog_offload_devs); -static LIST_HEAD(bpf_map_offload_devs); + +struct bpf_offload_netdev { + struct rhash_head l; + struct net_device *netdev; + struct list_head progs; + struct list_head maps; +}; + +static const struct rhashtable_params offdevs_params = { + .nelem_hint = 4, + .key_len = sizeof(struct net_device *), + .key_offset = offsetof(struct bpf_offload_netdev, netdev), + .head_offset = offsetof(struct bpf_offload_netdev, l), + .automatic_shrinking = true, +}; + +static struct rhashtable offdevs; +static bool offdevs_inited; static int bpf_dev_offload_check(struct net_device *netdev) { @@ -41,8 +59,19 @@ static int bpf_dev_offload_check(struct net_device *netdev) return 0; } +static struct bpf_offload_netdev * +bpf_offload_find_netdev(struct net_device *netdev) +{ + lockdep_assert_held(&bpf_devs_lock); + + if (!offdevs_inited) + return NULL; + return rhashtable_lookup_fast(&offdevs, &netdev, offdevs_params); +} + int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) { + struct bpf_offload_netdev *ondev; struct bpf_prog_offload *offload; int err; @@ -66,12 +95,13 @@ int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) goto err_maybe_put; down_write(&bpf_devs_lock); - if (offload->netdev->reg_state != NETREG_REGISTERED) { + ondev = bpf_offload_find_netdev(offload->netdev); + if (!ondev) { err = -EINVAL; goto err_unlock; } prog->aux->offload = offload; - list_add_tail(&offload->offloads, &bpf_prog_offload_devs); + list_add_tail(&offload->offloads, &ondev->progs); dev_put(offload->netdev); up_write(&bpf_devs_lock); @@ -294,6 +324,7 @@ static int bpf_map_offload_ndo(struct bpf_offloaded_map *offmap, struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr) { struct net *net = current->nsproxy->net_ns; + struct bpf_offload_netdev *ondev; struct bpf_offloaded_map *offmap; int err; @@ -316,11 +347,17 @@ struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr) if (err) goto err_unlock; + ondev = bpf_offload_find_netdev(offmap->netdev); + if (!ondev) { + err = -EINVAL; + goto err_unlock; + } + err = bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_ALLOC); if (err) goto err_unlock; - list_add_tail(&offmap->offloads, &bpf_map_offload_devs); + list_add_tail(&offmap->offloads, &ondev->maps); up_write(&bpf_devs_lock); rtnl_unlock(); @@ -489,56 +526,69 @@ bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map) return ret; } -static void bpf_offload_orphan_all_progs(struct net_device *netdev) +int bpf_offload_dev_netdev_register(struct net_device *netdev) { - struct bpf_prog_offload *offload, *tmp; + struct bpf_offload_netdev *ondev; + int err; - list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs, offloads) - if (offload->netdev == netdev) - __bpf_prog_offload_destroy(offload->prog); -} + down_write(&bpf_devs_lock); + if (!offdevs_inited) { + err = rhashtable_init(&offdevs, &offdevs_params); + if (err) + return err; + offdevs_inited = true; + } + up_write(&bpf_devs_lock); -static void bpf_offload_orphan_all_maps(struct net_device *netdev) -{ - struct bpf_offloaded_map *offmap, *tmp; + ondev = kzalloc(sizeof(*ondev), GFP_KERNEL); + if (!ondev) + return -ENOMEM; + + ondev->netdev = netdev; + INIT_LIST_HEAD(&ondev->progs); + INIT_LIST_HEAD(&ondev->maps); + + down_write(&bpf_devs_lock); + err = rhashtable_insert_fast(&offdevs, &ondev->l, offdevs_params); + if (err) { + netdev_warn(netdev, "failed to register for BPF offload\n"); + goto err_unlock_free; + } - list_for_each_entry_safe(offmap, tmp, &bpf_map_offload_devs, offloads) - if (offmap->netdev == netdev) - __bpf_map_offload_destroy(offmap); + up_write(&bpf_devs_lock); + return 0; + +err_unlock_free: + up_write(&bpf_devs_lock); + kfree(ondev); + return err; } +EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_register); -static int bpf_offload_notification(struct notifier_block *notifier, - ulong event, void *ptr) +void bpf_offload_dev_netdev_unregister(struct net_device *netdev) { - struct net_device *netdev = netdev_notifier_info_to_dev(ptr); + struct bpf_offloaded_map *offmap, *mtmp; + struct bpf_prog_offload *offload, *ptmp; + struct bpf_offload_netdev *ondev; ASSERT_RTNL(); - switch (event) { - case NETDEV_UNREGISTER: - /* ignore namespace changes */ - if (netdev->reg_state != NETREG_UNREGISTERING) - break; - - down_write(&bpf_devs_lock); - bpf_offload_orphan_all_progs(netdev); - bpf_offload_orphan_all_maps(netdev); - up_write(&bpf_devs_lock); - break; - default: - break; - } - return NOTIFY_OK; -} + down_write(&bpf_devs_lock); + ondev = rhashtable_lookup_fast(&offdevs, &netdev, offdevs_params); + if (WARN_ON(!ondev)) + goto unlock; -static struct notifier_block bpf_offload_notifier = { - .notifier_call = bpf_offload_notification, -}; + WARN_ON(rhashtable_remove_fast(&offdevs, &ondev->l, offdevs_params)); -static int __init bpf_offload_init(void) -{ - register_netdevice_notifier(&bpf_offload_notifier); - return 0; -} + list_for_each_entry_safe(offload, ptmp, &ondev->progs, offloads) + __bpf_prog_offload_destroy(offload->prog); + list_for_each_entry_safe(offmap, mtmp, &ondev->maps, offloads) + __bpf_map_offload_destroy(offmap); -subsys_initcall(bpf_offload_init); + WARN_ON(!list_empty(&ondev->progs)); + WARN_ON(!list_empty(&ondev->maps)); + kfree(ondev); +unlock: + up_write(&bpf_devs_lock); +} +EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_unregister); -- cgit v1.2.3-59-g8ed1b From 602144c224604f1cbff02ee2d1cf46825269ecbd Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 17 Jul 2018 10:53:25 -0700 Subject: bpf: offload: keep the offload state per-ASIC Create a higher-level entity to represent a device/ASIC to allow programs and maps to be shared between device ports. The extra work is required to make sure we don't destroy BPF objects as soon as the netdev for which they were loaded gets destroyed, as other ports may still be using them. When netdev goes away all of its BPF objects will be moved to other netdevs of the device, and only destroyed when last netdev is unregistered. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- drivers/net/ethernet/netronome/nfp/bpf/main.c | 14 ++++- drivers/net/ethernet/netronome/nfp/bpf/main.h | 4 ++ drivers/net/netdevsim/bpf.c | 17 +++++- drivers/net/netdevsim/netdevsim.h | 3 + include/linux/bpf.h | 9 ++- kernel/bpf/offload.c | 81 +++++++++++++++++++++------ 6 files changed, 104 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.c b/drivers/net/ethernet/netronome/nfp/bpf/main.c index dee039ada75c..458f49235d06 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.c @@ -406,12 +406,16 @@ err_release_free: static int nfp_bpf_ndo_init(struct nfp_app *app, struct net_device *netdev) { - return bpf_offload_dev_netdev_register(netdev); + struct nfp_app_bpf *bpf = app->priv; + + return bpf_offload_dev_netdev_register(bpf->bpf_dev, netdev); } static void nfp_bpf_ndo_uninit(struct nfp_app *app, struct net_device *netdev) { - bpf_offload_dev_netdev_unregister(netdev); + struct nfp_app_bpf *bpf = app->priv; + + bpf_offload_dev_netdev_unregister(bpf->bpf_dev, netdev); } static int nfp_bpf_init(struct nfp_app *app) @@ -437,6 +441,11 @@ static int nfp_bpf_init(struct nfp_app *app) if (err) goto err_free_neutral_maps; + bpf->bpf_dev = bpf_offload_dev_create(); + err = PTR_ERR_OR_ZERO(bpf->bpf_dev); + if (err) + goto err_free_neutral_maps; + return 0; err_free_neutral_maps: @@ -455,6 +464,7 @@ static void nfp_bpf_clean(struct nfp_app *app) { struct nfp_app_bpf *bpf = app->priv; + bpf_offload_dev_destroy(bpf->bpf_dev); WARN_ON(!skb_queue_empty(&bpf->cmsg_replies)); WARN_ON(!list_empty(&bpf->map_list)); WARN_ON(bpf->maps_in_use || bpf->map_elems_in_use); diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h index 9845c1a2d4c2..bec935468f90 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.h +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h @@ -110,6 +110,8 @@ enum pkt_vec { * struct nfp_app_bpf - bpf app priv structure * @app: backpointer to the app * + * @bpf_dev: BPF offload device handle + * * @tag_allocator: bitmap of control message tags in use * @tag_alloc_next: next tag bit to allocate * @tag_alloc_last: next tag bit to be freed @@ -150,6 +152,8 @@ enum pkt_vec { struct nfp_app_bpf { struct nfp_app *app; + struct bpf_offload_dev *bpf_dev; + DECLARE_BITMAP(tag_allocator, U16_MAX + 1); u16 tag_alloc_next; u16 tag_alloc_last; diff --git a/drivers/net/netdevsim/bpf.c b/drivers/net/netdevsim/bpf.c index c4a2829e0e1f..9eab29f67a0e 100644 --- a/drivers/net/netdevsim/bpf.c +++ b/drivers/net/netdevsim/bpf.c @@ -592,11 +592,16 @@ int nsim_bpf_init(struct netdevsim *ns) debugfs_create_dir("bpf_bound_progs", ns->sdev->ddir); if (IS_ERR_OR_NULL(ns->sdev->ddir_bpf_bound_progs)) return -ENOMEM; + + ns->sdev->bpf_dev = bpf_offload_dev_create(); + err = PTR_ERR_OR_ZERO(ns->sdev->bpf_dev); + if (err) + return err; } - err = bpf_offload_dev_netdev_register(ns->netdev); + err = bpf_offload_dev_netdev_register(ns->sdev->bpf_dev, ns->netdev); if (err) - return err; + goto err_destroy_bdev; debugfs_create_u32("bpf_offloaded_id", 0400, ns->ddir, &ns->bpf_offloaded_id); @@ -624,6 +629,11 @@ int nsim_bpf_init(struct netdevsim *ns) &ns->bpf_map_accept); return 0; + +err_destroy_bdev: + if (ns->sdev->refcnt == 1) + bpf_offload_dev_destroy(ns->sdev->bpf_dev); + return err; } void nsim_bpf_uninit(struct netdevsim *ns) @@ -631,10 +641,11 @@ void nsim_bpf_uninit(struct netdevsim *ns) WARN_ON(ns->xdp.prog); WARN_ON(ns->xdp_hw.prog); WARN_ON(ns->bpf_offloaded); - bpf_offload_dev_netdev_unregister(ns->netdev); + bpf_offload_dev_netdev_unregister(ns->sdev->bpf_dev, ns->netdev); if (ns->sdev->refcnt == 1) { WARN_ON(!list_empty(&ns->sdev->bpf_bound_progs)); WARN_ON(!list_empty(&ns->sdev->bpf_bound_maps)); + bpf_offload_dev_destroy(ns->sdev->bpf_dev); } } diff --git a/drivers/net/netdevsim/netdevsim.h b/drivers/net/netdevsim/netdevsim.h index 98f26fa1e671..02be199eb005 100644 --- a/drivers/net/netdevsim/netdevsim.h +++ b/drivers/net/netdevsim/netdevsim.h @@ -27,6 +27,7 @@ #define NSIM_EA(extack, msg) NL_SET_ERR_MSG_MOD((extack), msg) struct bpf_prog; +struct bpf_offload_dev; struct dentry; struct nsim_vf_config; @@ -36,6 +37,8 @@ struct netdevsim_shared_dev { struct dentry *ddir; + struct bpf_offload_dev *bpf_dev; + struct dentry *ddir_bpf_bound_progs; u32 prog_id_gen; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index aa2e834a122b..913465e45062 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -85,6 +85,7 @@ struct bpf_map { char name[BPF_OBJ_NAME_LEN]; }; +struct bpf_offload_dev; struct bpf_offloaded_map; struct bpf_map_dev_ops { @@ -650,8 +651,12 @@ int bpf_map_offload_get_next_key(struct bpf_map *map, bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map); -int bpf_offload_dev_netdev_register(struct net_device *netdev); -void bpf_offload_dev_netdev_unregister(struct net_device *netdev); +struct bpf_offload_dev *bpf_offload_dev_create(void); +void bpf_offload_dev_destroy(struct bpf_offload_dev *offdev); +int bpf_offload_dev_netdev_register(struct bpf_offload_dev *offdev, + struct net_device *netdev); +void bpf_offload_dev_netdev_unregister(struct bpf_offload_dev *offdev, + struct net_device *netdev); #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL) int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr); diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index cd64a26807aa..925575f64ff1 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -32,11 +32,17 @@ */ static DECLARE_RWSEM(bpf_devs_lock); +struct bpf_offload_dev { + struct list_head netdevs; +}; + struct bpf_offload_netdev { struct rhash_head l; struct net_device *netdev; + struct bpf_offload_dev *offdev; struct list_head progs; struct list_head maps; + struct list_head offdev_netdevs; }; static const struct rhashtable_params offdevs_params = { @@ -526,25 +532,18 @@ bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map) return ret; } -int bpf_offload_dev_netdev_register(struct net_device *netdev) +int bpf_offload_dev_netdev_register(struct bpf_offload_dev *offdev, + struct net_device *netdev) { struct bpf_offload_netdev *ondev; int err; - down_write(&bpf_devs_lock); - if (!offdevs_inited) { - err = rhashtable_init(&offdevs, &offdevs_params); - if (err) - return err; - offdevs_inited = true; - } - up_write(&bpf_devs_lock); - ondev = kzalloc(sizeof(*ondev), GFP_KERNEL); if (!ondev) return -ENOMEM; ondev->netdev = netdev; + ondev->offdev = offdev; INIT_LIST_HEAD(&ondev->progs); INIT_LIST_HEAD(&ondev->maps); @@ -555,6 +554,7 @@ int bpf_offload_dev_netdev_register(struct net_device *netdev) goto err_unlock_free; } + list_add(&ondev->offdev_netdevs, &offdev->netdevs); up_write(&bpf_devs_lock); return 0; @@ -565,11 +565,12 @@ err_unlock_free: } EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_register); -void bpf_offload_dev_netdev_unregister(struct net_device *netdev) +void bpf_offload_dev_netdev_unregister(struct bpf_offload_dev *offdev, + struct net_device *netdev) { + struct bpf_offload_netdev *ondev, *altdev; struct bpf_offloaded_map *offmap, *mtmp; struct bpf_prog_offload *offload, *ptmp; - struct bpf_offload_netdev *ondev; ASSERT_RTNL(); @@ -579,11 +580,26 @@ void bpf_offload_dev_netdev_unregister(struct net_device *netdev) goto unlock; WARN_ON(rhashtable_remove_fast(&offdevs, &ondev->l, offdevs_params)); - - list_for_each_entry_safe(offload, ptmp, &ondev->progs, offloads) - __bpf_prog_offload_destroy(offload->prog); - list_for_each_entry_safe(offmap, mtmp, &ondev->maps, offloads) - __bpf_map_offload_destroy(offmap); + list_del(&ondev->offdev_netdevs); + + /* Try to move the objects to another netdev of the device */ + altdev = list_first_entry_or_null(&offdev->netdevs, + struct bpf_offload_netdev, + offdev_netdevs); + if (altdev) { + list_for_each_entry(offload, &ondev->progs, offloads) + offload->netdev = altdev->netdev; + list_splice_init(&ondev->progs, &altdev->progs); + + list_for_each_entry(offmap, &ondev->maps, offloads) + offmap->netdev = altdev->netdev; + list_splice_init(&ondev->maps, &altdev->maps); + } else { + list_for_each_entry_safe(offload, ptmp, &ondev->progs, offloads) + __bpf_prog_offload_destroy(offload->prog); + list_for_each_entry_safe(offmap, mtmp, &ondev->maps, offloads) + __bpf_map_offload_destroy(offmap); + } WARN_ON(!list_empty(&ondev->progs)); WARN_ON(!list_empty(&ondev->maps)); @@ -592,3 +608,34 @@ unlock: up_write(&bpf_devs_lock); } EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_unregister); + +struct bpf_offload_dev *bpf_offload_dev_create(void) +{ + struct bpf_offload_dev *offdev; + int err; + + down_write(&bpf_devs_lock); + if (!offdevs_inited) { + err = rhashtable_init(&offdevs, &offdevs_params); + if (err) + return ERR_PTR(err); + offdevs_inited = true; + } + up_write(&bpf_devs_lock); + + offdev = kzalloc(sizeof(*offdev), GFP_KERNEL); + if (!offdev) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&offdev->netdevs); + + return offdev; +} +EXPORT_SYMBOL_GPL(bpf_offload_dev_create); + +void bpf_offload_dev_destroy(struct bpf_offload_dev *offdev) +{ + WARN_ON(!list_empty(&offdev->netdevs)); + kfree(offdev); +} +EXPORT_SYMBOL_GPL(bpf_offload_dev_destroy); -- cgit v1.2.3-59-g8ed1b From fd4f227dea0f24d89f52f7c4eb3207f84ddcbcbd Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 17 Jul 2018 10:53:26 -0700 Subject: bpf: offload: allow program and map sharing per-ASIC Allow programs and maps to be re-used across different netdevs, as long as they belong to the same struct bpf_offload_dev. Update the bpf_offload_prog_map_match() helper for the verifier and export a new helper for the drivers to use when checking programs at attachment time. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 1 + kernel/bpf/offload.c | 42 +++++++++++++++++++++++++++++++++++------- 2 files changed, 36 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 913465e45062..5b5ad95cf339 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -657,6 +657,7 @@ int bpf_offload_dev_netdev_register(struct bpf_offload_dev *offdev, struct net_device *netdev); void bpf_offload_dev_netdev_unregister(struct bpf_offload_dev *offdev, struct net_device *netdev); +bool bpf_offload_dev_match(struct bpf_prog *prog, struct net_device *netdev); #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL) int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr); diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 925575f64ff1..177a52436394 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -511,22 +511,50 @@ int bpf_map_offload_info_fill(struct bpf_map_info *info, struct bpf_map *map) return 0; } -bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map) +static bool __bpf_offload_dev_match(struct bpf_prog *prog, + struct net_device *netdev) { - struct bpf_offloaded_map *offmap; + struct bpf_offload_netdev *ondev1, *ondev2; struct bpf_prog_offload *offload; - bool ret; if (!bpf_prog_is_dev_bound(prog->aux)) return false; - if (!bpf_map_is_dev_bound(map)) - return bpf_map_offload_neutral(map); - down_read(&bpf_devs_lock); offload = prog->aux->offload; + if (!offload) + return false; + if (offload->netdev == netdev) + return true; + + ondev1 = bpf_offload_find_netdev(offload->netdev); + ondev2 = bpf_offload_find_netdev(netdev); + + return ondev1 && ondev2 && ondev1->offdev == ondev2->offdev; +} + +bool bpf_offload_dev_match(struct bpf_prog *prog, struct net_device *netdev) +{ + bool ret; + + down_read(&bpf_devs_lock); + ret = __bpf_offload_dev_match(prog, netdev); + up_read(&bpf_devs_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(bpf_offload_dev_match); + +bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map) +{ + struct bpf_offloaded_map *offmap; + bool ret; + + if (!bpf_map_is_dev_bound(map)) + return bpf_map_offload_neutral(map); offmap = map_to_offmap(map); - ret = offload && offload->netdev == offmap->netdev; + down_read(&bpf_devs_lock); + ret = __bpf_offload_dev_match(prog, offmap->netdev); up_read(&bpf_devs_lock); return ret; -- cgit v1.2.3-59-g8ed1b From a48d189ef53146a8df132a327a637c4182f50a16 Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Tue, 17 Jul 2018 11:52:57 +0200 Subject: net: Move skb decrypted field, avoid explicity copy Commit 784abe24c903 ("net: Add decrypted field to skb") introduced a 'decrypted' field that is explicitly copied on skb copy and clone. Move it between headers_start[0] and headers_end[0], so that we don't need to copy it explicitly as it's copied by the memcpy() in __copy_skb_header(). While at it, drop the assignment in __skb_clone(), it was already redundant. This doesn't change the size of sk_buff or cacheline boundaries. The 15-bits hole before tc_index becomes a 14-bits hole, and will be again a 15-bits hole when this change is merged with commit 8b7008620b84 ("net: Don't copy pfmemalloc flag in __copy_skb_header()"). v2: as reported by kbuild test robot (oops, I forgot to build with CONFIG_TLS_DEVICE it seems), we can't use CHECK_SKB_FIELD() on a bit-field member. Just drop the check for the moment being, perhaps we could think of some magic to also check bit-field members one day. Fixes: 784abe24c903 ("net: Add decrypted field to skb") Signed-off-by: Stefano Brivio Signed-off-by: David S. Miller --- include/linux/skbuff.h | 9 ++++----- net/core/skbuff.c | 6 ------ 2 files changed, 4 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 3ceb8dcc54da..14bc9ebe30f2 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -630,7 +630,6 @@ typedef unsigned char *sk_buff_data_t; * @hash: the packet hash * @queue_mapping: Queue mapping for multiqueue devices * @xmit_more: More SKBs are pending for this queue - * @decrypted: Decrypted SKB * @ndisc_nodetype: router type (from link layer) * @ooo_okay: allow the mapping of a socket to a queue to be changed * @l4_hash: indicate hash is a canonical 4-tuple hash over transport @@ -641,6 +640,7 @@ typedef unsigned char *sk_buff_data_t; * @no_fcs: Request NIC to treat last 4 bytes as Ethernet FCS * @csum_not_inet: use CRC32c to resolve CHECKSUM_PARTIAL * @dst_pending_confirm: need to confirm neighbour + * @decrypted: Decrypted SKB * @napi_id: id of the NAPI struct this skb came from * @secmark: security marking * @mark: Generic packet mark @@ -737,11 +737,7 @@ struct sk_buff { peeked:1, head_frag:1, xmit_more:1, -#ifdef CONFIG_TLS_DEVICE - decrypted:1; -#else __unused:1; -#endif /* fields enclosed in headers_start/headers_end are copied * using a single memcpy() in __copy_skb_header() @@ -797,6 +793,9 @@ struct sk_buff { __u8 tc_redirected:1; __u8 tc_from_ingress:1; #endif +#ifdef CONFIG_TLS_DEVICE + __u8 decrypted:1; +#endif #ifdef CONFIG_NET_SCHED __u16 tc_index; /* traffic control index */ diff --git a/net/core/skbuff.c b/net/core/skbuff.c index cfd6c6f35f9c..c4e24ac27464 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -805,9 +805,6 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) * It is not yet because we do not want to have a 16 bit hole */ new->queue_mapping = old->queue_mapping; -#ifdef CONFIG_TLS_DEVICE - new->decrypted = old->decrypted; -#endif memcpy(&new->headers_start, &old->headers_start, offsetof(struct sk_buff, headers_end) - @@ -868,9 +865,6 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb) C(head_frag); C(data); C(truesize); -#ifdef CONFIG_TLS_DEVICE - C(decrypted); -#endif refcount_set(&n->users, 1); atomic_inc(&(skb_shinfo(skb)->dataref)); -- cgit v1.2.3-59-g8ed1b From eff8ea8f24eac76bc21c25e4ca4ac4ee2dade846 Mon Sep 17 00:00:00 2001 From: Feras Daoud Date: Mon, 16 Jul 2018 18:35:30 -0700 Subject: net/mlx5: FW tracer, add hardware structures This change adds the infrastructure to mlx5 core fw tracer. It introduces the following 4 new registers: MLX5_REG_MTRC_CAP - Used to read tracer capabilities MLX5_REG_MTRC_CONF - Used to set tracer configurations MLX5_REG_MTRC_STDB - Used to query tracer strings database MLX5_REG_MTRC_CTRL - Used to control the tracer The capability of the tracing can be checked using mcam access register, therefore, the mcam access register interface will expose the tracer register. Signed-off-by: Feras Daoud Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 4 +++ include/linux/mlx5/mlx5_ifc.h | 61 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 64 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 1cb1c0317b77..4a4125b4279d 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -138,6 +138,10 @@ enum { MLX5_REG_HOST_ENDIANNESS = 0x7004, MLX5_REG_MCIA = 0x9014, MLX5_REG_MLCR = 0x902b, + MLX5_REG_MTRC_CAP = 0x9040, + MLX5_REG_MTRC_CONF = 0x9041, + MLX5_REG_MTRC_STDB = 0x9042, + MLX5_REG_MTRC_CTRL = 0x9043, MLX5_REG_MPCNT = 0x9051, MLX5_REG_MTPPS = 0x9053, MLX5_REG_MTPPSE = 0x9054, diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 1853e7fd6924..bd7b71f54d59 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -8112,7 +8112,9 @@ struct mlx5_ifc_mcam_access_reg_bits { u8 mcqi[0x1]; u8 reserved_at_1f[0x1]; - u8 regs_95_to_64[0x20]; + u8 regs_95_to_68[0x1c]; + u8 tracer_registers[0x4]; + u8 regs_63_to_32[0x20]; u8 regs_31_to_0[0x20]; }; @@ -9187,4 +9189,61 @@ struct mlx5_ifc_create_uctx_in_bits { struct mlx5_ifc_uctx_bits uctx; }; +struct mlx5_ifc_mtrc_string_db_param_bits { + u8 string_db_base_address[0x20]; + + u8 reserved_at_20[0x8]; + u8 string_db_size[0x18]; +}; + +struct mlx5_ifc_mtrc_cap_bits { + u8 trace_owner[0x1]; + u8 trace_to_memory[0x1]; + u8 reserved_at_2[0x4]; + u8 trc_ver[0x2]; + u8 reserved_at_8[0x14]; + u8 num_string_db[0x4]; + + u8 first_string_trace[0x8]; + u8 num_string_trace[0x8]; + u8 reserved_at_30[0x28]; + + u8 log_max_trace_buffer_size[0x8]; + + u8 reserved_at_60[0x20]; + + struct mlx5_ifc_mtrc_string_db_param_bits string_db_param[8]; + + u8 reserved_at_280[0x180]; +}; + +struct mlx5_ifc_mtrc_conf_bits { + u8 reserved_at_0[0x1c]; + u8 trace_mode[0x4]; + u8 reserved_at_20[0x18]; + u8 log_trace_buffer_size[0x8]; + u8 trace_mkey[0x20]; + u8 reserved_at_60[0x3a0]; +}; + +struct mlx5_ifc_mtrc_stdb_bits { + u8 string_db_index[0x4]; + u8 reserved_at_4[0x4]; + u8 read_size[0x18]; + u8 start_offset[0x20]; + u8 string_db_data[0]; +}; + +struct mlx5_ifc_mtrc_ctrl_bits { + u8 trace_status[0x2]; + u8 reserved_at_2[0x2]; + u8 arm_event[0x1]; + u8 reserved_at_5[0xb]; + u8 modify_field_select[0x10]; + u8 reserved_at_20[0x2b]; + u8 current_timestamp52_32[0x15]; + u8 current_timestamp31_0[0x20]; + u8 reserved_at_80[0x180]; +}; + #endif /* MLX5_IFC_H */ -- cgit v1.2.3-59-g8ed1b From 5e022dd353b74132bf216a77b169c43e39f5be9e Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Mon, 16 Jul 2018 18:35:31 -0700 Subject: net/mlx5: Expose MPEGC (Management PCIe General Configuration) structures This patch exposes PRM layout for handling MPEGC (Management PCIe General Configuration). This will be used in the downstream patch for configuring MPEGC via the driver. Signed-off-by: Eran Ben Elisha Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 1 + include/linux/mlx5/mlx5_ifc.h | 23 +++++++++++++++++++++-- 2 files changed, 22 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 4a4125b4279d..957199c20a0f 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -145,6 +145,7 @@ enum { MLX5_REG_MPCNT = 0x9051, MLX5_REG_MTPPS = 0x9053, MLX5_REG_MTPPSE = 0x9054, + MLX5_REG_MPEGC = 0x9056, MLX5_REG_MCQI = 0x9061, MLX5_REG_MCC = 0x9062, MLX5_REG_MCDA = 0x9063, diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index bd7b71f54d59..2de5feaeb74a 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -8049,6 +8049,19 @@ struct mlx5_ifc_peir_reg_bits { u8 error_type[0x8]; }; +struct mlx5_ifc_mpegc_reg_bits { + u8 reserved_at_0[0x30]; + u8 field_select[0x10]; + + u8 tx_overflow_sense[0x1]; + u8 mark_cqe[0x1]; + u8 mark_cnp[0x1]; + u8 reserved_at_43[0x1b]; + u8 tx_lossy_overflow_oper[0x2]; + + u8 reserved_at_60[0x100]; +}; + struct mlx5_ifc_pcam_enhanced_features_bits { u8 reserved_at_0[0x6d]; u8 rx_icrc_encapsulated_counter[0x1]; @@ -8097,7 +8110,11 @@ struct mlx5_ifc_pcam_reg_bits { }; struct mlx5_ifc_mcam_enhanced_features_bits { - u8 reserved_at_0[0x7b]; + u8 reserved_at_0[0x74]; + u8 mark_tx_action_cnp[0x1]; + u8 mark_tx_action_cqe[0x1]; + u8 dynamic_tx_overflow[0x1]; + u8 reserved_at_77[0x4]; u8 pcie_outbound_stalled[0x1]; u8 tx_overflow_buffer_pkt[0x1]; u8 mtpps_enh_out_per_adj[0x1]; @@ -8112,7 +8129,9 @@ struct mlx5_ifc_mcam_access_reg_bits { u8 mcqi[0x1]; u8 reserved_at_1f[0x1]; - u8 regs_95_to_68[0x1c]; + u8 regs_95_to_87[0x9]; + u8 mpegc[0x1]; + u8 regs_85_to_68[0x12]; u8 tracer_registers[0x4]; u8 regs_63_to_32[0x20]; -- cgit v1.2.3-59-g8ed1b From 8da6fe2a18505b9bd977e573d62d33f836c6903c Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Mon, 16 Jul 2018 18:35:32 -0700 Subject: net/mlx5: Add core support for double vlan push/pop steering action As newer firmware supports double push/pop in a single FTE, we add core bits and extend vlan action logic for it. Signed-off-by: Jianbo Liu Reviewed-by: Or Gerlitz Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h | 2 ++ drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 6 +++--- drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c | 12 +++++++++--- drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 4 +++- include/linux/mlx5/fs.h | 4 +++- include/linux/mlx5/mlx5_ifc.h | 11 +++++++++-- 6 files changed, 29 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h b/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h index 09f178a3fcab..0240aee9189e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h @@ -138,6 +138,8 @@ TRACE_EVENT(mlx5_fs_del_fg, {MLX5_FLOW_CONTEXT_ACTION_MOD_HDR, "MOD_HDR"},\ {MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH, "VLAN_PUSH"},\ {MLX5_FLOW_CONTEXT_ACTION_VLAN_POP, "VLAN_POP"},\ + {MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH_2, "VLAN_PUSH_2"},\ + {MLX5_FLOW_CONTEXT_ACTION_VLAN_POP_2, "VLAN_POP_2"},\ {MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO, "NEXT_PRIO"} TRACE_EVENT(mlx5_fs_set_fte, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index cecd201f0b73..8f50ce80ff66 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -70,9 +70,9 @@ mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw, flow_act.action &= ~(MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH | MLX5_FLOW_CONTEXT_ACTION_VLAN_POP); else if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH) { - flow_act.vlan.ethtype = ntohs(attr->vlan_proto); - flow_act.vlan.vid = attr->vlan_vid; - flow_act.vlan.prio = attr->vlan_prio; + flow_act.vlan[0].ethtype = ntohs(attr->vlan_proto); + flow_act.vlan[0].vid = attr->vlan_vid; + flow_act.vlan[0].prio = attr->vlan_prio; } if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c index 5a00deff5457..6a62b84e57f4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c @@ -349,9 +349,15 @@ static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev, vlan = MLX5_ADDR_OF(flow_context, in_flow_context, push_vlan); - MLX5_SET(vlan, vlan, ethtype, fte->action.vlan.ethtype); - MLX5_SET(vlan, vlan, vid, fte->action.vlan.vid); - MLX5_SET(vlan, vlan, prio, fte->action.vlan.prio); + MLX5_SET(vlan, vlan, ethtype, fte->action.vlan[0].ethtype); + MLX5_SET(vlan, vlan, vid, fte->action.vlan[0].vid); + MLX5_SET(vlan, vlan, prio, fte->action.vlan[0].prio); + + vlan = MLX5_ADDR_OF(flow_context, in_flow_context, push_vlan_2); + + MLX5_SET(vlan, vlan, ethtype, fte->action.vlan[1].ethtype); + MLX5_SET(vlan, vlan, vid, fte->action.vlan[1].vid); + MLX5_SET(vlan, vlan, prio, fte->action.vlan[1].prio); in_match_value = MLX5_ADDR_OF(flow_context, in_flow_context, match_value); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index 49a75d31185e..05e7a5112b74 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -1464,7 +1464,9 @@ static bool check_conflicting_actions(u32 action1, u32 action2) MLX5_FLOW_CONTEXT_ACTION_DECAP | MLX5_FLOW_CONTEXT_ACTION_MOD_HDR | MLX5_FLOW_CONTEXT_ACTION_VLAN_POP | - MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH)) + MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH | + MLX5_FLOW_CONTEXT_ACTION_VLAN_POP_2 | + MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH_2)) return true; return false; diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 757b4a30281e..c40f2fc68655 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -152,6 +152,8 @@ struct mlx5_fs_vlan { u8 prio; }; +#define MLX5_FS_VLAN_DEPTH 2 + struct mlx5_flow_act { u32 action; bool has_flow_tag; @@ -159,7 +161,7 @@ struct mlx5_flow_act { u32 encap_id; u32 modify_id; uintptr_t esp_id; - struct mlx5_fs_vlan vlan; + struct mlx5_fs_vlan vlan[MLX5_FS_VLAN_DEPTH]; struct ib_counters *counters; }; diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 2de5feaeb74a..ae12120ef021 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -337,7 +337,10 @@ struct mlx5_ifc_flow_table_prop_layout_bits { u8 reserved_at_9[0x1]; u8 pop_vlan[0x1]; u8 push_vlan[0x1]; - u8 reserved_at_c[0x14]; + u8 reserved_at_c[0x1]; + u8 pop_vlan_2[0x1]; + u8 push_vlan_2[0x1]; + u8 reserved_at_f[0x11]; u8 reserved_at_20[0x2]; u8 log_max_ft_size[0x6]; @@ -2386,6 +2389,8 @@ enum { MLX5_FLOW_CONTEXT_ACTION_MOD_HDR = 0x40, MLX5_FLOW_CONTEXT_ACTION_VLAN_POP = 0x80, MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH = 0x100, + MLX5_FLOW_CONTEXT_ACTION_VLAN_POP_2 = 0x400, + MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH_2 = 0x800, }; struct mlx5_ifc_vlan_bits { @@ -2416,7 +2421,9 @@ struct mlx5_ifc_flow_context_bits { u8 modify_header_id[0x20]; - u8 reserved_at_100[0x100]; + struct mlx5_ifc_vlan_bits push_vlan_2; + + u8 reserved_at_120[0xe0]; struct mlx5_ifc_fte_match_param_bits match_value; -- cgit v1.2.3-59-g8ed1b From e2abdcf1d2dd59d7164cf975c50e1587b96b9d04 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Mon, 16 Jul 2018 18:35:36 -0700 Subject: net/mlx5: Better return types for CQE API Reduce sizes of return types. Use bool for binary indication. Signed-off-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- include/linux/mlx5/device.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index f8671c0a43aa..0566c6a94805 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -750,7 +750,7 @@ enum { #define MLX5_MINI_CQE_ARRAY_SIZE 8 -static inline int mlx5_get_cqe_format(struct mlx5_cqe64 *cqe) +static inline u8 mlx5_get_cqe_format(struct mlx5_cqe64 *cqe) { return (cqe->op_own >> 2) & 0x3; } @@ -770,14 +770,14 @@ static inline u8 get_cqe_l3_hdr_type(struct mlx5_cqe64 *cqe) return (cqe->l4_l3_hdr_type >> 2) & 0x3; } -static inline u8 cqe_is_tunneled(struct mlx5_cqe64 *cqe) +static inline bool cqe_is_tunneled(struct mlx5_cqe64 *cqe) { return cqe->outer_l3_tunneled & 0x1; } -static inline int cqe_has_vlan(struct mlx5_cqe64 *cqe) +static inline bool cqe_has_vlan(struct mlx5_cqe64 *cqe) { - return !!(cqe->l4_l3_hdr_type & 0x1); + return cqe->l4_l3_hdr_type & 0x1; } static inline u64 get_cqe_ts(struct mlx5_cqe64 *cqe) -- cgit v1.2.3-59-g8ed1b From b51dab46c6adfbb7e80cd0f59ae17b8a30d94b1a Mon Sep 17 00:00:00 2001 From: Sudarsana Reddy Kalluru Date: Wed, 18 Jul 2018 06:27:22 -0700 Subject: qed: Add qed APIs for PHY module query. This patch adds qed APIs for reading the PHY module. Signed-off-by: Sudarsana Reddy Kalluru Signed-off-by: Ariel Elior Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_hsi.h | 16 ++++++++++ drivers/net/ethernet/qlogic/qed/qed_main.c | 23 ++++++++++++++ drivers/net/ethernet/qlogic/qed/qed_mcp.c | 49 ++++++++++++++++++++++++++++++ drivers/net/ethernet/qlogic/qed/qed_mcp.h | 16 ++++++++++ include/linux/qed/qed_if.h | 15 +++++++++ 5 files changed, 119 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/ethernet/qlogic/qed/qed_hsi.h b/drivers/net/ethernet/qlogic/qed/qed_hsi.h index bee10c1781fb..8faceb691657 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_hsi.h +++ b/drivers/net/ethernet/qlogic/qed/qed_hsi.h @@ -12444,6 +12444,8 @@ struct public_drv_mb { #define DRV_MSG_CODE_STATS_TYPE_ISCSI 3 #define DRV_MSG_CODE_STATS_TYPE_RDMA 4 +#define DRV_MSG_CODE_TRANSCEIVER_READ 0x00160000 + #define DRV_MSG_CODE_MASK_PARITIES 0x001a0000 #define DRV_MSG_CODE_BIST_TEST 0x001e0000 @@ -12543,6 +12545,15 @@ struct public_drv_mb { #define DRV_MB_PARAM_SET_LED_MODE_ON 0x1 #define DRV_MB_PARAM_SET_LED_MODE_OFF 0x2 +#define DRV_MB_PARAM_TRANSCEIVER_PORT_OFFSET 0 +#define DRV_MB_PARAM_TRANSCEIVER_PORT_MASK 0x00000003 +#define DRV_MB_PARAM_TRANSCEIVER_SIZE_OFFSET 2 +#define DRV_MB_PARAM_TRANSCEIVER_SIZE_MASK 0x000000FC +#define DRV_MB_PARAM_TRANSCEIVER_I2C_ADDRESS_OFFSET 8 +#define DRV_MB_PARAM_TRANSCEIVER_I2C_ADDRESS_MASK 0x0000FF00 +#define DRV_MB_PARAM_TRANSCEIVER_OFFSET_OFFSET 16 +#define DRV_MB_PARAM_TRANSCEIVER_OFFSET_MASK 0xFFFF0000 + /* Resource Allocation params - Driver version support */ #define DRV_MB_PARAM_RESOURCE_ALLOC_VERSION_MAJOR_MASK 0xFFFF0000 #define DRV_MB_PARAM_RESOURCE_ALLOC_VERSION_MAJOR_SHIFT 16 @@ -12596,6 +12607,9 @@ struct public_drv_mb { #define FW_MSG_CODE_PHY_OK 0x00110000 #define FW_MSG_CODE_OK 0x00160000 #define FW_MSG_CODE_ERROR 0x00170000 +#define FW_MSG_CODE_TRANSCEIVER_DIAG_OK 0x00160000 +#define FW_MSG_CODE_TRANSCEIVER_DIAG_ERROR 0x00170000 +#define FW_MSG_CODE_TRANSCEIVER_NOT_PRESENT 0x00020000 #define FW_MSG_CODE_OS_WOL_SUPPORTED 0x00800000 #define FW_MSG_CODE_OS_WOL_NOT_SUPPORTED 0x00810000 @@ -12687,6 +12701,8 @@ struct mcp_public_data { struct public_func func[MCP_GLOB_FUNC_MAX]; }; +#define MAX_I2C_TRANSACTION_SIZE 16 + /* OCBB definitions */ enum tlvs { /* Category 1: Device Properties */ diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c index 0cbc74d6ca8b..158944aa6097 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_main.c +++ b/drivers/net/ethernet/qlogic/qed/qed_main.c @@ -2102,6 +2102,28 @@ out: return status; } +static int qed_read_module_eeprom(struct qed_dev *cdev, char *buf, + u8 dev_addr, u32 offset, u32 len) +{ + struct qed_hwfn *hwfn = QED_LEADING_HWFN(cdev); + struct qed_ptt *ptt; + int rc = 0; + + if (IS_VF(cdev)) + return 0; + + ptt = qed_ptt_acquire(hwfn); + if (!ptt) + return -EAGAIN; + + rc = qed_mcp_phy_sfp_read(hwfn, ptt, MFW_PORT(hwfn), dev_addr, + offset, len, buf); + + qed_ptt_release(hwfn, ptt); + + return rc; +} + static struct qed_selftest_ops qed_selftest_ops_pass = { .selftest_memory = &qed_selftest_memory, .selftest_interrupt = &qed_selftest_interrupt, @@ -2144,6 +2166,7 @@ const struct qed_common_ops qed_common_ops_pass = { .update_mac = &qed_update_mac, .update_mtu = &qed_update_mtu, .update_wol = &qed_update_wol, + .read_module_eeprom = &qed_read_module_eeprom, }; void qed_get_protocol_stats(struct qed_dev *cdev, diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.c b/drivers/net/ethernet/qlogic/qed/qed_mcp.c index 4e0b443c9519..62a220fce6e1 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_mcp.c +++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.c @@ -2463,6 +2463,55 @@ out: return rc; } +int qed_mcp_phy_sfp_read(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, + u32 port, u32 addr, u32 offset, u32 len, u8 *p_buf) +{ + u32 bytes_left, bytes_to_copy, buf_size, nvm_offset = 0; + u32 resp, param; + int rc; + + nvm_offset |= (port << DRV_MB_PARAM_TRANSCEIVER_PORT_OFFSET) & + DRV_MB_PARAM_TRANSCEIVER_PORT_MASK; + nvm_offset |= (addr << DRV_MB_PARAM_TRANSCEIVER_I2C_ADDRESS_OFFSET) & + DRV_MB_PARAM_TRANSCEIVER_I2C_ADDRESS_MASK; + + addr = offset; + offset = 0; + bytes_left = len; + while (bytes_left > 0) { + bytes_to_copy = min_t(u32, bytes_left, + MAX_I2C_TRANSACTION_SIZE); + nvm_offset &= (DRV_MB_PARAM_TRANSCEIVER_I2C_ADDRESS_MASK | + DRV_MB_PARAM_TRANSCEIVER_PORT_MASK); + nvm_offset |= ((addr + offset) << + DRV_MB_PARAM_TRANSCEIVER_OFFSET_OFFSET) & + DRV_MB_PARAM_TRANSCEIVER_OFFSET_MASK; + nvm_offset |= (bytes_to_copy << + DRV_MB_PARAM_TRANSCEIVER_SIZE_OFFSET) & + DRV_MB_PARAM_TRANSCEIVER_SIZE_MASK; + rc = qed_mcp_nvm_rd_cmd(p_hwfn, p_ptt, + DRV_MSG_CODE_TRANSCEIVER_READ, + nvm_offset, &resp, ¶m, &buf_size, + (u32 *)(p_buf + offset)); + if (rc) { + DP_NOTICE(p_hwfn, + "Failed to send a transceiver read command to the MFW. rc = %d.\n", + rc); + return rc; + } + + if (resp == FW_MSG_CODE_TRANSCEIVER_NOT_PRESENT) + return -ENODEV; + else if (resp != FW_MSG_CODE_TRANSCEIVER_DIAG_OK) + return -EINVAL; + + offset += buf_size; + bytes_left -= buf_size; + } + + return 0; +} + int qed_mcp_bist_register_test(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) { u32 drv_mb_param = 0, rsp, param; diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.h b/drivers/net/ethernet/qlogic/qed/qed_mcp.h index 632a838f1fe3..047976d5c6e9 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_mcp.h +++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.h @@ -839,6 +839,22 @@ int qed_mcp_nvm_rd_cmd(struct qed_hwfn *p_hwfn, u32 *o_mcp_resp, u32 *o_mcp_param, u32 *o_txn_size, u32 *o_buf); +/** + * @brief Read from sfp + * + * @param p_hwfn - hw function + * @param p_ptt - PTT required for register access + * @param port - transceiver port + * @param addr - I2C address + * @param offset - offset in sfp + * @param len - buffer length + * @param p_buf - buffer to read into + * + * @return int - 0 - operation was successful. + */ +int qed_mcp_phy_sfp_read(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, + u32 port, u32 addr, u32 offset, u32 len, u8 *p_buf); + /** * @brief indicates whether the MFW objects [under mcp_info] are accessible * diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index b4040023cbfb..8cd34645e892 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -759,6 +759,9 @@ struct qed_generic_tlvs { u8 mac[QED_TLV_MAC_COUNT][ETH_ALEN]; }; +#define QED_I2C_DEV_ADDR_A0 0xA0 +#define QED_I2C_DEV_ADDR_A2 0xA2 + #define QED_NVM_SIGNATURE 0x12435687 enum qed_nvm_flash_cmd { @@ -1026,6 +1029,18 @@ struct qed_common_ops { * @param enabled - true iff WoL should be enabled. */ int (*update_wol) (struct qed_dev *cdev, bool enabled); + +/** + * @brief read_module_eeprom + * + * @param cdev + * @param buf - buffer + * @param dev_addr - PHY device memory region + * @param offset - offset into eeprom contents to be read + * @param len - buffer length, i.e., max bytes to be read + */ + int (*read_module_eeprom)(struct qed_dev *cdev, + char *buf, u8 dev_addr, u32 offset, u32 len); }; #define MASK_FIELD(_name, _value) \ -- cgit v1.2.3-59-g8ed1b From 488dee96bb62f0b3d9e678cf42574034d5b033a5 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Fri, 20 Jul 2018 21:56:47 +0000 Subject: kernfs: allow creating kernfs objects with arbitrary uid/gid This change allows creating kernfs files and directories with arbitrary uid/gid instead of always using GLOBAL_ROOT_UID/GID by extending kernfs_create_dir_ns() and kernfs_create_file_ns() with uid/gid arguments. The "simple" kernfs_create_file() and kernfs_create_dir() are left alone and always create objects belonging to the global root. When creating symlinks ownership (uid/gid) is taken from the target kernfs object. Co-Developed-by: Tyler Hicks Signed-off-by: Dmitry Torokhov Signed-off-by: Tyler Hicks Signed-off-by: David S. Miller --- arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 4 +++- fs/kernfs/dir.c | 29 ++++++++++++++++++++++++++--- fs/kernfs/file.c | 8 ++++++-- fs/kernfs/inode.c | 2 +- fs/kernfs/kernfs-internal.h | 2 ++ fs/kernfs/symlink.c | 11 ++++++++++- fs/sysfs/dir.c | 4 +++- fs/sysfs/file.c | 5 +++-- include/linux/kernfs.h | 28 +++++++++++++++++++--------- kernel/cgroup/cgroup.c | 4 +++- 10 files changed, 76 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 749856a2e736..9af1a21265d3 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -146,6 +146,7 @@ static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft) int ret; kn = __kernfs_create_file(parent_kn, rft->name, rft->mode, + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0, rft->kf_ops, rft, NULL, NULL); if (IS_ERR(kn)) return PTR_ERR(kn); @@ -1503,7 +1504,8 @@ static int mon_addfile(struct kernfs_node *parent_kn, const char *name, struct kernfs_node *kn; int ret = 0; - kn = __kernfs_create_file(parent_kn, name, 0444, 0, + kn = __kernfs_create_file(parent_kn, name, 0444, + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0, &kf_mondata_ops, priv, NULL, NULL); if (IS_ERR(kn)) return PTR_ERR(kn); diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index d66cc0777303..4ca0b5c18192 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -619,6 +619,7 @@ struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry) static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, const char *name, umode_t mode, + kuid_t uid, kgid_t gid, unsigned flags) { struct kernfs_node *kn; @@ -661,8 +662,22 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, kn->mode = mode; kn->flags = flags; + if (!uid_eq(uid, GLOBAL_ROOT_UID) || !gid_eq(gid, GLOBAL_ROOT_GID)) { + struct iattr iattr = { + .ia_valid = ATTR_UID | ATTR_GID, + .ia_uid = uid, + .ia_gid = gid, + }; + + ret = __kernfs_setattr(kn, &iattr); + if (ret < 0) + goto err_out3; + } + return kn; + err_out3: + idr_remove(&root->ino_idr, kn->id.ino); err_out2: kmem_cache_free(kernfs_node_cache, kn); err_out1: @@ -672,11 +687,13 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, const char *name, umode_t mode, + kuid_t uid, kgid_t gid, unsigned flags) { struct kernfs_node *kn; - kn = __kernfs_new_node(kernfs_root(parent), name, mode, flags); + kn = __kernfs_new_node(kernfs_root(parent), + name, mode, uid, gid, flags); if (kn) { kernfs_get(parent); kn->parent = parent; @@ -946,6 +963,7 @@ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops, root->next_generation = 1; kn = __kernfs_new_node(root, "", S_IFDIR | S_IRUGO | S_IXUGO, + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, KERNFS_DIR); if (!kn) { idr_destroy(&root->ino_idr); @@ -984,6 +1002,8 @@ void kernfs_destroy_root(struct kernfs_root *root) * @parent: parent in which to create a new directory * @name: name of the new directory * @mode: mode of the new directory + * @uid: uid of the new directory + * @gid: gid of the new directory * @priv: opaque data associated with the new directory * @ns: optional namespace tag of the directory * @@ -991,13 +1011,15 @@ void kernfs_destroy_root(struct kernfs_root *root) */ struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent, const char *name, umode_t mode, + kuid_t uid, kgid_t gid, void *priv, const void *ns) { struct kernfs_node *kn; int rc; /* allocate */ - kn = kernfs_new_node(parent, name, mode | S_IFDIR, KERNFS_DIR); + kn = kernfs_new_node(parent, name, mode | S_IFDIR, + uid, gid, KERNFS_DIR); if (!kn) return ERR_PTR(-ENOMEM); @@ -1028,7 +1050,8 @@ struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent, int rc; /* allocate */ - kn = kernfs_new_node(parent, name, S_IRUGO|S_IXUGO|S_IFDIR, KERNFS_DIR); + kn = kernfs_new_node(parent, name, S_IRUGO|S_IXUGO|S_IFDIR, + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, KERNFS_DIR); if (!kn) return ERR_PTR(-ENOMEM); diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 2015d8c45e4a..dbf5bc250bfd 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -965,6 +965,8 @@ const struct file_operations kernfs_file_fops = { * @parent: directory to create the file in * @name: name of the file * @mode: mode of the file + * @uid: uid of the file + * @gid: gid of the file * @size: size of the file * @ops: kernfs operations for the file * @priv: private data for the file @@ -975,7 +977,8 @@ const struct file_operations kernfs_file_fops = { */ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent, const char *name, - umode_t mode, loff_t size, + umode_t mode, kuid_t uid, kgid_t gid, + loff_t size, const struct kernfs_ops *ops, void *priv, const void *ns, struct lock_class_key *key) @@ -986,7 +989,8 @@ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent, flags = KERNFS_FILE; - kn = kernfs_new_node(parent, name, (mode & S_IALLUGO) | S_IFREG, flags); + kn = kernfs_new_node(parent, name, (mode & S_IALLUGO) | S_IFREG, + uid, gid, flags); if (!kn) return ERR_PTR(-ENOMEM); diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index 3d73fe9d56e2..80cebcd94c90 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -63,7 +63,7 @@ out_unlock: return ret; } -static int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) +int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) { struct kernfs_iattrs *attrs; struct iattr *iattrs; diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index 0f260dcca177..3d83b114bb08 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -90,6 +90,7 @@ int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr); int kernfs_iop_getattr(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags); ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size); +int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr); /* * dir.c @@ -104,6 +105,7 @@ void kernfs_put_active(struct kernfs_node *kn); int kernfs_add_one(struct kernfs_node *kn); struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, const char *name, umode_t mode, + kuid_t uid, kgid_t gid, unsigned flags); struct kernfs_node *kernfs_find_and_get_node_by_ino(struct kernfs_root *root, unsigned int ino); diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c index 08ccabd7047f..5ffed48f3d0e 100644 --- a/fs/kernfs/symlink.c +++ b/fs/kernfs/symlink.c @@ -21,6 +21,7 @@ * @target: target node for the symlink to point to * * Returns the created node on success, ERR_PTR() value on error. + * Ownership of the link matches ownership of the target. */ struct kernfs_node *kernfs_create_link(struct kernfs_node *parent, const char *name, @@ -28,8 +29,16 @@ struct kernfs_node *kernfs_create_link(struct kernfs_node *parent, { struct kernfs_node *kn; int error; + kuid_t uid = GLOBAL_ROOT_UID; + kgid_t gid = GLOBAL_ROOT_GID; - kn = kernfs_new_node(parent, name, S_IFLNK|S_IRWXUGO, KERNFS_LINK); + if (target->iattr) { + uid = target->iattr->ia_iattr.ia_uid; + gid = target->iattr->ia_iattr.ia_gid; + } + + kn = kernfs_new_node(parent, name, S_IFLNK|S_IRWXUGO, uid, gid, + KERNFS_LINK); if (!kn) return ERR_PTR(-ENOMEM); diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 58eba92a0e41..e39b884f0867 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -52,7 +52,9 @@ int sysfs_create_dir_ns(struct kobject *kobj, const void *ns) return -ENOENT; kn = kernfs_create_dir_ns(parent, kobject_name(kobj), - S_IRWXU | S_IRUGO | S_IXUGO, kobj, ns); + S_IRWXU | S_IRUGO | S_IXUGO, + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, + kobj, ns); if (IS_ERR(kn)) { if (PTR_ERR(kn) == -EEXIST) sysfs_warn_dup(parent, kobject_name(kobj)); diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 5c13f29bfcdb..513fa691ecbd 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -302,8 +302,9 @@ int sysfs_add_file_mode_ns(struct kernfs_node *parent, if (!attr->ignore_lockdep) key = attr->key ?: (struct lock_class_key *)&attr->skey; #endif - kn = __kernfs_create_file(parent, attr->name, mode & 0777, size, ops, - (void *)attr, ns, key); + kn = __kernfs_create_file(parent, attr->name, + mode & 0777, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, + size, ops, (void *)attr, ns, key); if (IS_ERR(kn)) { if (PTR_ERR(kn) == -EEXIST) sysfs_warn_dup(parent, attr->name); diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index ab25c8b6d9e3..814643f7ee52 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -15,6 +15,7 @@ #include #include #include +#include #include struct file; @@ -325,12 +326,14 @@ void kernfs_destroy_root(struct kernfs_root *root); struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent, const char *name, umode_t mode, + kuid_t uid, kgid_t gid, void *priv, const void *ns); struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent, const char *name); struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent, - const char *name, - umode_t mode, loff_t size, + const char *name, umode_t mode, + kuid_t uid, kgid_t gid, + loff_t size, const struct kernfs_ops *ops, void *priv, const void *ns, struct lock_class_key *key); @@ -415,12 +418,14 @@ static inline void kernfs_destroy_root(struct kernfs_root *root) { } static inline struct kernfs_node * kernfs_create_dir_ns(struct kernfs_node *parent, const char *name, - umode_t mode, void *priv, const void *ns) + umode_t mode, kuid_t uid, kgid_t gid, + void *priv, const void *ns) { return ERR_PTR(-ENOSYS); } static inline struct kernfs_node * __kernfs_create_file(struct kernfs_node *parent, const char *name, - umode_t mode, loff_t size, const struct kernfs_ops *ops, + umode_t mode, kuid_t uid, kgid_t gid, + loff_t size, const struct kernfs_ops *ops, void *priv, const void *ns, struct lock_class_key *key) { return ERR_PTR(-ENOSYS); } @@ -498,12 +503,15 @@ static inline struct kernfs_node * kernfs_create_dir(struct kernfs_node *parent, const char *name, umode_t mode, void *priv) { - return kernfs_create_dir_ns(parent, name, mode, priv, NULL); + return kernfs_create_dir_ns(parent, name, mode, + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, + priv, NULL); } static inline struct kernfs_node * kernfs_create_file_ns(struct kernfs_node *parent, const char *name, - umode_t mode, loff_t size, const struct kernfs_ops *ops, + umode_t mode, kuid_t uid, kgid_t gid, + loff_t size, const struct kernfs_ops *ops, void *priv, const void *ns) { struct lock_class_key *key = NULL; @@ -511,15 +519,17 @@ kernfs_create_file_ns(struct kernfs_node *parent, const char *name, #ifdef CONFIG_DEBUG_LOCK_ALLOC key = (struct lock_class_key *)&ops->lockdep_key; #endif - return __kernfs_create_file(parent, name, mode, size, ops, priv, ns, - key); + return __kernfs_create_file(parent, name, mode, uid, gid, + size, ops, priv, ns, key); } static inline struct kernfs_node * kernfs_create_file(struct kernfs_node *parent, const char *name, umode_t mode, loff_t size, const struct kernfs_ops *ops, void *priv) { - return kernfs_create_file_ns(parent, name, mode, size, ops, priv, NULL); + return kernfs_create_file_ns(parent, name, mode, + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, + size, ops, priv, NULL); } static inline int kernfs_remove_by_name(struct kernfs_node *parent, diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 077370bf8964..35cf3d71f8aa 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3557,7 +3557,9 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, key = &cft->lockdep_key; #endif kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name), - cgroup_file_mode(cft), 0, cft->kf_ops, cft, + cgroup_file_mode(cft), + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, + 0, cft->kf_ops, cft, NULL, key); if (IS_ERR(kn)) return PTR_ERR(kn); -- cgit v1.2.3-59-g8ed1b From 5f81880d5204ee2388fd9a75bb850ccd526885b7 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Fri, 20 Jul 2018 21:56:48 +0000 Subject: sysfs, kobject: allow creating kobject belonging to arbitrary users Normally kobjects and their sysfs representation belong to global root, however it is not necessarily the case for objects in separate namespaces. For example, objects in separate network namespace logically belong to the container's root and not global root. This change lays groundwork for allowing network namespace objects ownership to be transferred to container's root user by defining get_ownership() callback in ktype structure and using it in sysfs code to retrieve desired uid/gid when creating sysfs objects for given kobject. Co-Developed-by: Tyler Hicks Signed-off-by: Dmitry Torokhov Signed-off-by: Tyler Hicks Signed-off-by: David S. Miller --- fs/sysfs/dir.c | 7 +++++-- fs/sysfs/file.c | 32 ++++++++++++++++++++------------ fs/sysfs/group.c | 23 +++++++++++++++++------ fs/sysfs/sysfs.h | 5 ++--- include/linux/kobject.h | 4 ++++ lib/kobject.c | 19 +++++++++++++++++++ 6 files changed, 67 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index e39b884f0867..feeae8081c22 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -40,6 +40,8 @@ void sysfs_warn_dup(struct kernfs_node *parent, const char *name) int sysfs_create_dir_ns(struct kobject *kobj, const void *ns) { struct kernfs_node *parent, *kn; + kuid_t uid; + kgid_t gid; BUG_ON(!kobj); @@ -51,9 +53,10 @@ int sysfs_create_dir_ns(struct kobject *kobj, const void *ns) if (!parent) return -ENOENT; + kobject_get_ownership(kobj, &uid, &gid); + kn = kernfs_create_dir_ns(parent, kobject_name(kobj), - S_IRWXU | S_IRUGO | S_IXUGO, - GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, + S_IRWXU | S_IRUGO | S_IXUGO, uid, gid, kobj, ns); if (IS_ERR(kn)) { if (PTR_ERR(kn) == -EEXIST) diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 513fa691ecbd..fa46216523cf 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -245,7 +245,7 @@ static const struct kernfs_ops sysfs_bin_kfops_mmap = { int sysfs_add_file_mode_ns(struct kernfs_node *parent, const struct attribute *attr, bool is_bin, - umode_t mode, const void *ns) + umode_t mode, kuid_t uid, kgid_t gid, const void *ns) { struct lock_class_key *key = NULL; const struct kernfs_ops *ops; @@ -302,8 +302,8 @@ int sysfs_add_file_mode_ns(struct kernfs_node *parent, if (!attr->ignore_lockdep) key = attr->key ?: (struct lock_class_key *)&attr->skey; #endif - kn = __kernfs_create_file(parent, attr->name, - mode & 0777, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, + + kn = __kernfs_create_file(parent, attr->name, mode & 0777, uid, gid, size, ops, (void *)attr, ns, key); if (IS_ERR(kn)) { if (PTR_ERR(kn) == -EEXIST) @@ -313,12 +313,6 @@ int sysfs_add_file_mode_ns(struct kernfs_node *parent, return 0; } -int sysfs_add_file(struct kernfs_node *parent, const struct attribute *attr, - bool is_bin) -{ - return sysfs_add_file_mode_ns(parent, attr, is_bin, attr->mode, NULL); -} - /** * sysfs_create_file_ns - create an attribute file for an object with custom ns * @kobj: object we're creating for @@ -328,9 +322,14 @@ int sysfs_add_file(struct kernfs_node *parent, const struct attribute *attr, int sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr, const void *ns) { + kuid_t uid; + kgid_t gid; + BUG_ON(!kobj || !kobj->sd || !attr); - return sysfs_add_file_mode_ns(kobj->sd, attr, false, attr->mode, ns); + kobject_get_ownership(kobj, &uid, &gid); + return sysfs_add_file_mode_ns(kobj->sd, attr, false, attr->mode, + uid, gid, ns); } EXPORT_SYMBOL_GPL(sysfs_create_file_ns); @@ -359,6 +358,8 @@ int sysfs_add_file_to_group(struct kobject *kobj, const struct attribute *attr, const char *group) { struct kernfs_node *parent; + kuid_t uid; + kgid_t gid; int error; if (group) { @@ -371,7 +372,9 @@ int sysfs_add_file_to_group(struct kobject *kobj, if (!parent) return -ENOENT; - error = sysfs_add_file(parent, attr, false); + kobject_get_ownership(kobj, &uid, &gid); + error = sysfs_add_file_mode_ns(kobj->sd, attr, false, + attr->mode, uid, gid, NULL); kernfs_put(parent); return error; @@ -487,9 +490,14 @@ EXPORT_SYMBOL_GPL(sysfs_remove_file_from_group); int sysfs_create_bin_file(struct kobject *kobj, const struct bin_attribute *attr) { + kuid_t uid; + kgid_t gid; + BUG_ON(!kobj || !kobj->sd || !attr); - return sysfs_add_file(kobj->sd, &attr->attr, true); + kobject_get_ownership(kobj, &uid, &gid); + return sysfs_add_file_mode_ns(kobj->sd, &attr->attr, true, + attr->attr.mode, uid, gid, NULL); } EXPORT_SYMBOL_GPL(sysfs_create_bin_file); diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index 4802ec0e1e3a..c7a716c4acc9 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -31,6 +31,7 @@ static void remove_files(struct kernfs_node *parent, } static int create_files(struct kernfs_node *parent, struct kobject *kobj, + kuid_t uid, kgid_t gid, const struct attribute_group *grp, int update) { struct attribute *const *attr; @@ -60,7 +61,7 @@ static int create_files(struct kernfs_node *parent, struct kobject *kobj, mode &= SYSFS_PREALLOC | 0664; error = sysfs_add_file_mode_ns(parent, *attr, false, - mode, NULL); + mode, uid, gid, NULL); if (unlikely(error)) break; } @@ -90,7 +91,8 @@ static int create_files(struct kernfs_node *parent, struct kobject *kobj, mode &= SYSFS_PREALLOC | 0664; error = sysfs_add_file_mode_ns(parent, &(*bin_attr)->attr, true, - mode, NULL); + mode, + uid, gid, NULL); if (error) break; } @@ -106,6 +108,8 @@ static int internal_create_group(struct kobject *kobj, int update, const struct attribute_group *grp) { struct kernfs_node *kn; + kuid_t uid; + kgid_t gid; int error; BUG_ON(!kobj || (!update && !kobj->sd)); @@ -118,9 +122,11 @@ static int internal_create_group(struct kobject *kobj, int update, kobj->name, grp->name ?: ""); return -EINVAL; } + kobject_get_ownership(kobj, &uid, &gid); if (grp->name) { - kn = kernfs_create_dir(kobj->sd, grp->name, - S_IRWXU | S_IRUGO | S_IXUGO, kobj); + kn = kernfs_create_dir_ns(kobj->sd, grp->name, + S_IRWXU | S_IRUGO | S_IXUGO, + uid, gid, kobj, NULL); if (IS_ERR(kn)) { if (PTR_ERR(kn) == -EEXIST) sysfs_warn_dup(kobj->sd, grp->name); @@ -129,7 +135,7 @@ static int internal_create_group(struct kobject *kobj, int update, } else kn = kobj->sd; kernfs_get(kn); - error = create_files(kn, kobj, grp, update); + error = create_files(kn, kobj, uid, gid, grp, update); if (error) { if (grp->name) kernfs_remove(kn); @@ -281,6 +287,8 @@ int sysfs_merge_group(struct kobject *kobj, const struct attribute_group *grp) { struct kernfs_node *parent; + kuid_t uid; + kgid_t gid; int error = 0; struct attribute *const *attr; int i; @@ -289,8 +297,11 @@ int sysfs_merge_group(struct kobject *kobj, if (!parent) return -ENOENT; + kobject_get_ownership(kobj, &uid, &gid); + for ((i = 0, attr = grp->attrs); *attr && !error; (++i, ++attr)) - error = sysfs_add_file(parent, *attr, false); + error = sysfs_add_file_mode_ns(parent, *attr, false, + (*attr)->mode, uid, gid, NULL); if (error) { while (--i >= 0) kernfs_remove_by_name(parent, (*--attr)->name); diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index d098e015fcc9..0050cc0c0236 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -27,11 +27,10 @@ void sysfs_warn_dup(struct kernfs_node *parent, const char *name); /* * file.c */ -int sysfs_add_file(struct kernfs_node *parent, - const struct attribute *attr, bool is_bin); int sysfs_add_file_mode_ns(struct kernfs_node *parent, const struct attribute *attr, bool is_bin, - umode_t amode, const void *ns); + umode_t amode, kuid_t uid, kgid_t gid, + const void *ns); /* * symlink.c diff --git a/include/linux/kobject.h b/include/linux/kobject.h index 7f6f93c3df9c..b49ff230beba 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -26,6 +26,7 @@ #include #include #include +#include #define UEVENT_HELPER_PATH_LEN 256 #define UEVENT_NUM_ENVP 32 /* number of env pointers */ @@ -114,6 +115,8 @@ extern struct kobject * __must_check kobject_get_unless_zero( extern void kobject_put(struct kobject *kobj); extern const void *kobject_namespace(struct kobject *kobj); +extern void kobject_get_ownership(struct kobject *kobj, + kuid_t *uid, kgid_t *gid); extern char *kobject_get_path(struct kobject *kobj, gfp_t flag); struct kobj_type { @@ -122,6 +125,7 @@ struct kobj_type { struct attribute **default_attrs; const struct kobj_ns_type_operations *(*child_ns_type)(struct kobject *kobj); const void *(*namespace)(struct kobject *kobj); + void (*get_ownership)(struct kobject *kobj, kuid_t *uid, kgid_t *gid); }; struct kobj_uevent_env { diff --git a/lib/kobject.c b/lib/kobject.c index 18989b5b3b56..f2dc1f756007 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -35,6 +35,25 @@ const void *kobject_namespace(struct kobject *kobj) return kobj->ktype->namespace(kobj); } +/** + * kobject_get_ownership - get sysfs ownership data for @kobj + * @kobj: kobject in question + * @uid: kernel user ID for sysfs objects + * @gid: kernel group ID for sysfs objects + * + * Returns initial uid/gid pair that should be used when creating sysfs + * representation of given kobject. Normally used to adjust ownership of + * objects in a container. + */ +void kobject_get_ownership(struct kobject *kobj, kuid_t *uid, kgid_t *gid) +{ + *uid = GLOBAL_ROOT_UID; + *gid = GLOBAL_ROOT_GID; + + if (kobj->ktype->get_ownership) + kobj->ktype->get_ownership(kobj, uid, gid); +} + /* * populate_dir - populate directory with attributes. * @kobj: object we're working on. -- cgit v1.2.3-59-g8ed1b From 9944e894c1266dc8515c82d1ff752d681215526b Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Fri, 20 Jul 2018 21:56:50 +0000 Subject: driver core: set up ownership of class devices in sysfs Plumb in get_ownership() callback for devices belonging to a class so that they can be created with uid/gid different from global root. This will allow network devices in a container to belong to container's root and not global root. Signed-off-by: Dmitry Torokhov Reviewed-by: Tyler Hicks Signed-off-by: David S. Miller --- drivers/base/core.c | 9 +++++++++ include/linux/device.h | 5 +++++ 2 files changed, 14 insertions(+) (limited to 'include/linux') diff --git a/drivers/base/core.c b/drivers/base/core.c index df3e1a44707a..276c7e3f754c 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -866,10 +866,19 @@ static const void *device_namespace(struct kobject *kobj) return ns; } +static void device_get_ownership(struct kobject *kobj, kuid_t *uid, kgid_t *gid) +{ + struct device *dev = kobj_to_dev(kobj); + + if (dev->class && dev->class->get_ownership) + dev->class->get_ownership(dev, uid, gid); +} + static struct kobj_type device_ktype = { .release = device_release, .sysfs_ops = &dev_sysfs_ops, .namespace = device_namespace, + .get_ownership = device_get_ownership, }; diff --git a/include/linux/device.h b/include/linux/device.h index 055a69dbcd18..fe6ccb6dc119 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -384,6 +384,9 @@ int subsys_virtual_register(struct bus_type *subsys, * @shutdown_pre: Called at shut-down time before driver shutdown. * @ns_type: Callbacks so sysfs can detemine namespaces. * @namespace: Namespace of the device belongs to this class. + * @get_ownership: Allows class to specify uid/gid of the sysfs directories + * for the devices belonging to the class. Usually tied to + * device's namespace. * @pm: The default device power management operations of this class. * @p: The private data of the driver core, no one other than the * driver core can touch this. @@ -413,6 +416,8 @@ struct class { const struct kobj_ns_type_operations *ns_type; const void *(*namespace)(struct device *dev); + void (*get_ownership)(struct device *dev, kuid_t *uid, kgid_t *gid); + const struct dev_pm_ops *pm; struct subsys_private *p; -- cgit v1.2.3-59-g8ed1b From f53aaa31cce7b543e407da7e97690a700206f7b9 Mon Sep 17 00:00:00 2001 From: Feras Daoud Date: Mon, 16 Jul 2018 15:22:01 -0700 Subject: net/mlx5: FW tracer, implement tracer logic Implement FW tracer logic and registers access, initialization and cleanup flows. Initializing the tracer will be part of load one flow, as multiple PFs will try to acquire ownership but only one will succeed and will be the tracer owner. Signed-off-by: Feras Daoud Signed-off-by: Saeed Mahameed --- .../ethernet/mellanox/mlx5/core/diag/fw_tracer.c | 196 +++++++++++++++++++++ .../ethernet/mellanox/mlx5/core/diag/fw_tracer.h | 66 +++++++ include/linux/mlx5/driver.h | 3 + 3 files changed, 265 insertions(+) create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.h (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c b/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c new file mode 100644 index 000000000000..3ecbf06b4d71 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c @@ -0,0 +1,196 @@ +/* + * Copyright (c) 2018, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "fw_tracer.h" + +static int mlx5_query_mtrc_caps(struct mlx5_fw_tracer *tracer) +{ + u32 *string_db_base_address_out = tracer->str_db.base_address_out; + u32 *string_db_size_out = tracer->str_db.size_out; + struct mlx5_core_dev *dev = tracer->dev; + u32 out[MLX5_ST_SZ_DW(mtrc_cap)] = {0}; + u32 in[MLX5_ST_SZ_DW(mtrc_cap)] = {0}; + void *mtrc_cap_sp; + int err, i; + + err = mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out), + MLX5_REG_MTRC_CAP, 0, 0); + if (err) { + mlx5_core_warn(dev, "FWTracer: Error reading tracer caps %d\n", + err); + return err; + } + + if (!MLX5_GET(mtrc_cap, out, trace_to_memory)) { + mlx5_core_dbg(dev, "FWTracer: Device does not support logging traces to memory\n"); + return -ENOTSUPP; + } + + tracer->trc_ver = MLX5_GET(mtrc_cap, out, trc_ver); + tracer->str_db.first_string_trace = + MLX5_GET(mtrc_cap, out, first_string_trace); + tracer->str_db.num_string_trace = + MLX5_GET(mtrc_cap, out, num_string_trace); + tracer->str_db.num_string_db = MLX5_GET(mtrc_cap, out, num_string_db); + tracer->owner = !!MLX5_GET(mtrc_cap, out, trace_owner); + + for (i = 0; i < tracer->str_db.num_string_db; i++) { + mtrc_cap_sp = MLX5_ADDR_OF(mtrc_cap, out, string_db_param[i]); + string_db_base_address_out[i] = MLX5_GET(mtrc_string_db_param, + mtrc_cap_sp, + string_db_base_address); + string_db_size_out[i] = MLX5_GET(mtrc_string_db_param, + mtrc_cap_sp, string_db_size); + } + + return err; +} + +static int mlx5_set_mtrc_caps_trace_owner(struct mlx5_fw_tracer *tracer, + u32 *out, u32 out_size, + u8 trace_owner) +{ + struct mlx5_core_dev *dev = tracer->dev; + u32 in[MLX5_ST_SZ_DW(mtrc_cap)] = {0}; + + MLX5_SET(mtrc_cap, in, trace_owner, trace_owner); + + return mlx5_core_access_reg(dev, in, sizeof(in), out, out_size, + MLX5_REG_MTRC_CAP, 0, 1); +} + +static int mlx5_fw_tracer_ownership_acquire(struct mlx5_fw_tracer *tracer) +{ + struct mlx5_core_dev *dev = tracer->dev; + u32 out[MLX5_ST_SZ_DW(mtrc_cap)] = {0}; + int err; + + err = mlx5_set_mtrc_caps_trace_owner(tracer, out, sizeof(out), + MLX5_FW_TRACER_ACQUIRE_OWNERSHIP); + if (err) { + mlx5_core_warn(dev, "FWTracer: Acquire tracer ownership failed %d\n", + err); + return err; + } + + tracer->owner = !!MLX5_GET(mtrc_cap, out, trace_owner); + + if (!tracer->owner) + return -EBUSY; + + return 0; +} + +static void mlx5_fw_tracer_ownership_release(struct mlx5_fw_tracer *tracer) +{ + u32 out[MLX5_ST_SZ_DW(mtrc_cap)] = {0}; + + mlx5_set_mtrc_caps_trace_owner(tracer, out, sizeof(out), + MLX5_FW_TRACER_RELEASE_OWNERSHIP); + tracer->owner = false; +} + +static void mlx5_fw_tracer_ownership_change(struct work_struct *work) +{ + struct mlx5_fw_tracer *tracer = container_of(work, struct mlx5_fw_tracer, + ownership_change_work); + struct mlx5_core_dev *dev = tracer->dev; + int err; + + if (tracer->owner) { + mlx5_fw_tracer_ownership_release(tracer); + return; + } + + err = mlx5_fw_tracer_ownership_acquire(tracer); + if (err) { + mlx5_core_dbg(dev, "FWTracer: Ownership was not granted %d\n", err); + return; + } +} + +struct mlx5_fw_tracer *mlx5_fw_tracer_create(struct mlx5_core_dev *dev) +{ + struct mlx5_fw_tracer *tracer = NULL; + int err; + + if (!MLX5_CAP_MCAM_REG(dev, tracer_registers)) { + mlx5_core_dbg(dev, "FWTracer: Tracer capability not present\n"); + return NULL; + } + + tracer = kzalloc(sizeof(*tracer), GFP_KERNEL); + if (!tracer) + return ERR_PTR(-ENOMEM); + + tracer->work_queue = create_singlethread_workqueue("mlx5_fw_tracer"); + if (!tracer->work_queue) { + err = -ENOMEM; + goto free_tracer; + } + + tracer->dev = dev; + + INIT_WORK(&tracer->ownership_change_work, mlx5_fw_tracer_ownership_change); + + err = mlx5_query_mtrc_caps(tracer); + if (err) { + mlx5_core_dbg(dev, "FWTracer: Failed to query capabilities %d\n", err); + goto destroy_workqueue; + } + + mlx5_fw_tracer_ownership_change(&tracer->ownership_change_work); + + return tracer; + +destroy_workqueue: + tracer->dev = NULL; + destroy_workqueue(tracer->work_queue); +free_tracer: + kfree(tracer); + return ERR_PTR(err); +} + +void mlx5_fw_tracer_destroy(struct mlx5_fw_tracer *tracer) +{ + if (!tracer) + return; + + cancel_work_sync(&tracer->ownership_change_work); + + if (tracer->owner) + mlx5_fw_tracer_ownership_release(tracer); + + flush_workqueue(tracer->work_queue); + destroy_workqueue(tracer->work_queue); + kfree(tracer); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.h b/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.h new file mode 100644 index 000000000000..721c41a5e827 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2018, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __LIB_TRACER_H__ +#define __LIB_TRACER_H__ + +#include +#include "mlx5_core.h" + +#define STRINGS_DB_SECTIONS_NUM 8 + +struct mlx5_fw_tracer { + struct mlx5_core_dev *dev; + bool owner; + u8 trc_ver; + struct workqueue_struct *work_queue; + struct work_struct ownership_change_work; + + /* Strings DB */ + struct { + u8 first_string_trace; + u8 num_string_trace; + u32 num_string_db; + u32 base_address_out[STRINGS_DB_SECTIONS_NUM]; + u32 size_out[STRINGS_DB_SECTIONS_NUM]; + } str_db; +}; + +enum mlx5_fw_tracer_ownership_state { + MLX5_FW_TRACER_RELEASE_OWNERSHIP, + MLX5_FW_TRACER_ACQUIRE_OWNERSHIP, +}; + +struct mlx5_fw_tracer *mlx5_fw_tracer_create(struct mlx5_core_dev *dev); +void mlx5_fw_tracer_destroy(struct mlx5_fw_tracer *tracer); + +#endif diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 957199c20a0f..86cb0ebf92fa 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -816,6 +816,8 @@ struct mlx5_clock { struct mlx5_pps pps_info; }; +struct mlx5_fw_tracer; + struct mlx5_core_dev { struct pci_dev *pdev; /* sync pci state */ @@ -860,6 +862,7 @@ struct mlx5_core_dev { struct mlx5_clock clock; struct mlx5_ib_clock_info *clock_info; struct page *clock_info_page; + struct mlx5_fw_tracer *tracer; }; struct mlx5_db { -- cgit v1.2.3-59-g8ed1b From c71ad41ccb0c29fce95149b74786574b354c9dda Mon Sep 17 00:00:00 2001 From: Feras Daoud Date: Wed, 7 Feb 2018 11:08:56 +0200 Subject: net/mlx5: FW tracer, events handling The tracer has one event, event 0x26, with two subtypes: - Subtype 0: Ownership change - Subtype 1: Traces available An ownership change occurs in the following cases: 1- Owner releases his ownership, in this case, an event will be sent to inform others to reattempt acquire ownership. 2- Ownership was taken by a higher priority tool, in this case the owner should understand that it lost ownership, and go through tear down flow. The second subtype indicates that there are traces in the trace buffer, in this case, the driver polls the tracer buffer for new traces, parse them and prepares the messages for printing. The HW starts tracing from the first address in the tracer buffer. Driver receives an event notifying that new trace block exists. HW posts a timestamp event at the last 8B of every 256B block. Comparing the timestamp to the last handled timestamp would indicate that this is a new trace block. Once the new timestamp is detected, the entire block is considered valid. Block validation and parsing, should be done after copying the current block to a different location, in order to avoid block overwritten during processing. Signed-off-by: Feras Daoud Signed-off-by: Saeed Mahameed --- .../ethernet/mellanox/mlx5/core/diag/fw_tracer.c | 268 ++++++++++++++++++++- .../ethernet/mellanox/mlx5/core/diag/fw_tracer.h | 71 +++++- drivers/net/ethernet/mellanox/mlx5/core/eq.c | 11 + include/linux/mlx5/device.h | 7 + 4 files changed, 347 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c b/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c index d6cc27b0ff34..bd887d1d3396 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c @@ -318,25 +318,244 @@ out: return; } -static void mlx5_fw_tracer_ownership_change(struct work_struct *work) +static void mlx5_fw_tracer_arm(struct mlx5_core_dev *dev) { - struct mlx5_fw_tracer *tracer = container_of(work, struct mlx5_fw_tracer, - ownership_change_work); - struct mlx5_core_dev *dev = tracer->dev; + u32 out[MLX5_ST_SZ_DW(mtrc_ctrl)] = {0}; + u32 in[MLX5_ST_SZ_DW(mtrc_ctrl)] = {0}; int err; - if (tracer->owner) { - mlx5_fw_tracer_ownership_release(tracer); + MLX5_SET(mtrc_ctrl, in, arm_event, 1); + + err = mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out), + MLX5_REG_MTRC_CTRL, 0, 1); + if (err) + mlx5_core_warn(dev, "FWTracer: Failed to arm tracer event %d\n", err); +} + +static void poll_trace(struct mlx5_fw_tracer *tracer, + struct tracer_event *tracer_event, u64 *trace) +{ + u32 timestamp_low, timestamp_mid, timestamp_high, urts; + + tracer_event->event_id = MLX5_GET(tracer_event, trace, event_id); + tracer_event->lost_event = MLX5_GET(tracer_event, trace, lost); + + switch (tracer_event->event_id) { + case TRACER_EVENT_TYPE_TIMESTAMP: + tracer_event->type = TRACER_EVENT_TYPE_TIMESTAMP; + urts = MLX5_GET(tracer_timestamp_event, trace, urts); + if (tracer->trc_ver == 0) + tracer_event->timestamp_event.unreliable = !!(urts >> 2); + else + tracer_event->timestamp_event.unreliable = !!(urts & 1); + + timestamp_low = MLX5_GET(tracer_timestamp_event, + trace, timestamp7_0); + timestamp_mid = MLX5_GET(tracer_timestamp_event, + trace, timestamp39_8); + timestamp_high = MLX5_GET(tracer_timestamp_event, + trace, timestamp52_40); + + tracer_event->timestamp_event.timestamp = + ((u64)timestamp_high << 40) | + ((u64)timestamp_mid << 8) | + (u64)timestamp_low; + break; + default: + if (tracer_event->event_id >= tracer->str_db.first_string_trace || + tracer_event->event_id <= tracer->str_db.first_string_trace + + tracer->str_db.num_string_trace) { + tracer_event->type = TRACER_EVENT_TYPE_STRING; + tracer_event->string_event.timestamp = + MLX5_GET(tracer_string_event, trace, timestamp); + tracer_event->string_event.string_param = + MLX5_GET(tracer_string_event, trace, string_param); + tracer_event->string_event.tmsn = + MLX5_GET(tracer_string_event, trace, tmsn); + tracer_event->string_event.tdsn = + MLX5_GET(tracer_string_event, trace, tdsn); + } else { + tracer_event->type = TRACER_EVENT_TYPE_UNRECOGNIZED; + } + break; + } +} + +static u64 get_block_timestamp(struct mlx5_fw_tracer *tracer, u64 *ts_event) +{ + struct tracer_event tracer_event; + u8 event_id; + + event_id = MLX5_GET(tracer_event, ts_event, event_id); + + if (event_id == TRACER_EVENT_TYPE_TIMESTAMP) + poll_trace(tracer, &tracer_event, ts_event); + else + tracer_event.timestamp_event.timestamp = 0; + + return tracer_event.timestamp_event.timestamp; +} + +static void mlx5_fw_tracer_handle_traces(struct work_struct *work) +{ + struct mlx5_fw_tracer *tracer = + container_of(work, struct mlx5_fw_tracer, handle_traces_work); + u64 block_timestamp, last_block_timestamp, tmp_trace_block[TRACES_PER_BLOCK]; + u32 block_count, start_offset, prev_start_offset, prev_consumer_index; + u32 trace_event_size = MLX5_ST_SZ_BYTES(tracer_event); + struct tracer_event tracer_event; + struct mlx5_core_dev *dev; + int i; + + if (!tracer->owner) return; + + dev = tracer->dev; + block_count = tracer->buff.size / TRACER_BLOCK_SIZE_BYTE; + start_offset = tracer->buff.consumer_index * TRACER_BLOCK_SIZE_BYTE; + + /* Copy the block to local buffer to avoid HW override while being processed*/ + memcpy(tmp_trace_block, tracer->buff.log_buf + start_offset, + TRACER_BLOCK_SIZE_BYTE); + + block_timestamp = + get_block_timestamp(tracer, &tmp_trace_block[TRACES_PER_BLOCK - 1]); + + while (block_timestamp > tracer->last_timestamp) { + /* Check block override if its not the first block */ + if (!tracer->last_timestamp) { + u64 *ts_event; + /* To avoid block override be the HW in case of buffer + * wraparound, the time stamp of the previous block + * should be compared to the last timestamp handled + * by the driver. + */ + prev_consumer_index = + (tracer->buff.consumer_index - 1) & (block_count - 1); + prev_start_offset = prev_consumer_index * TRACER_BLOCK_SIZE_BYTE; + + ts_event = tracer->buff.log_buf + prev_start_offset + + (TRACES_PER_BLOCK - 1) * trace_event_size; + last_block_timestamp = get_block_timestamp(tracer, ts_event); + /* If previous timestamp different from last stored + * timestamp then there is a good chance that the + * current buffer is overwritten and therefore should + * not be parsed. + */ + if (tracer->last_timestamp != last_block_timestamp) { + mlx5_core_warn(dev, "FWTracer: Events were lost\n"); + tracer->last_timestamp = block_timestamp; + tracer->buff.consumer_index = + (tracer->buff.consumer_index + 1) & (block_count - 1); + break; + } + } + + /* Parse events */ + for (i = 0; i < TRACES_PER_BLOCK ; i++) + poll_trace(tracer, &tracer_event, &tmp_trace_block[i]); + + tracer->buff.consumer_index = + (tracer->buff.consumer_index + 1) & (block_count - 1); + + tracer->last_timestamp = block_timestamp; + start_offset = tracer->buff.consumer_index * TRACER_BLOCK_SIZE_BYTE; + memcpy(tmp_trace_block, tracer->buff.log_buf + start_offset, + TRACER_BLOCK_SIZE_BYTE); + block_timestamp = get_block_timestamp(tracer, + &tmp_trace_block[TRACES_PER_BLOCK - 1]); } + mlx5_fw_tracer_arm(dev); +} + +static int mlx5_fw_tracer_set_mtrc_conf(struct mlx5_fw_tracer *tracer) +{ + struct mlx5_core_dev *dev = tracer->dev; + u32 out[MLX5_ST_SZ_DW(mtrc_conf)] = {0}; + u32 in[MLX5_ST_SZ_DW(mtrc_conf)] = {0}; + int err; + + MLX5_SET(mtrc_conf, in, trace_mode, TRACE_TO_MEMORY); + MLX5_SET(mtrc_conf, in, log_trace_buffer_size, + ilog2(TRACER_BUFFER_PAGE_NUM)); + MLX5_SET(mtrc_conf, in, trace_mkey, tracer->buff.mkey.key); + + err = mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out), + MLX5_REG_MTRC_CONF, 0, 1); + if (err) + mlx5_core_warn(dev, "FWTracer: Failed to set tracer configurations %d\n", err); + + return err; +} + +static int mlx5_fw_tracer_set_mtrc_ctrl(struct mlx5_fw_tracer *tracer, u8 status, u8 arm) +{ + struct mlx5_core_dev *dev = tracer->dev; + u32 out[MLX5_ST_SZ_DW(mtrc_ctrl)] = {0}; + u32 in[MLX5_ST_SZ_DW(mtrc_ctrl)] = {0}; + int err; + + MLX5_SET(mtrc_ctrl, in, modify_field_select, TRACE_STATUS); + MLX5_SET(mtrc_ctrl, in, trace_status, status); + MLX5_SET(mtrc_ctrl, in, arm_event, arm); + + err = mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out), + MLX5_REG_MTRC_CTRL, 0, 1); + + if (!err && status) + tracer->last_timestamp = 0; + + return err; +} + +static int mlx5_fw_tracer_start(struct mlx5_fw_tracer *tracer) +{ + struct mlx5_core_dev *dev = tracer->dev; + int err; + err = mlx5_fw_tracer_ownership_acquire(tracer); if (err) { mlx5_core_dbg(dev, "FWTracer: Ownership was not granted %d\n", err); + /* Don't fail since ownership can be acquired on a later FW event */ + return 0; + } + + err = mlx5_fw_tracer_set_mtrc_conf(tracer); + if (err) { + mlx5_core_warn(dev, "FWTracer: Failed to set tracer configuration %d\n", err); + goto release_ownership; + } + + /* enable tracer & trace events */ + err = mlx5_fw_tracer_set_mtrc_ctrl(tracer, 1, 1); + if (err) { + mlx5_core_warn(dev, "FWTracer: Failed to enable tracer %d\n", err); + goto release_ownership; + } + + return 0; + +release_ownership: + mlx5_fw_tracer_ownership_release(tracer); + return err; +} + +static void mlx5_fw_tracer_ownership_change(struct work_struct *work) +{ + struct mlx5_fw_tracer *tracer = + container_of(work, struct mlx5_fw_tracer, ownership_change_work); + + if (tracer->owner) { + tracer->owner = false; + tracer->buff.consumer_index = 0; return; } + + mlx5_fw_tracer_start(tracer); } +/* Create software resources (Buffers, etc ..) */ struct mlx5_fw_tracer *mlx5_fw_tracer_create(struct mlx5_core_dev *dev) { struct mlx5_fw_tracer *tracer = NULL; @@ -361,6 +580,8 @@ struct mlx5_fw_tracer *mlx5_fw_tracer_create(struct mlx5_core_dev *dev) INIT_WORK(&tracer->ownership_change_work, mlx5_fw_tracer_ownership_change); INIT_WORK(&tracer->read_fw_strings_work, mlx5_tracer_read_strings_db); + INIT_WORK(&tracer->handle_traces_work, mlx5_fw_tracer_handle_traces); + err = mlx5_query_mtrc_caps(tracer); if (err) { @@ -392,6 +613,9 @@ free_tracer: return ERR_PTR(err); } +/* Create HW resources + start tracer + * must be called before Async EQ is created + */ int mlx5_fw_tracer_init(struct mlx5_fw_tracer *tracer) { struct mlx5_core_dev *dev; @@ -417,22 +641,25 @@ int mlx5_fw_tracer_init(struct mlx5_fw_tracer *tracer) goto err_dealloc_pd; } - err = mlx5_fw_tracer_ownership_acquire(tracer); - if (err) /* Don't fail since ownership can be acquired on a later FW event */ - mlx5_core_dbg(dev, "FWTracer: Ownership was not granted %d\n", err); + mlx5_fw_tracer_start(tracer); return 0; + err_dealloc_pd: mlx5_core_dealloc_pd(dev, tracer->buff.pdn); return err; } +/* Stop tracer + Cleanup HW resources + * must be called after Async EQ is destroyed + */ void mlx5_fw_tracer_cleanup(struct mlx5_fw_tracer *tracer) { if (IS_ERR_OR_NULL(tracer)) return; cancel_work_sync(&tracer->ownership_change_work); + cancel_work_sync(&tracer->handle_traces_work); if (tracer->owner) mlx5_fw_tracer_ownership_release(tracer); @@ -441,6 +668,7 @@ void mlx5_fw_tracer_cleanup(struct mlx5_fw_tracer *tracer) mlx5_core_dealloc_pd(tracer->dev, tracer->buff.pdn); } +/* Free software resources (Buffers, etc ..) */ void mlx5_fw_tracer_destroy(struct mlx5_fw_tracer *tracer) { if (IS_ERR_OR_NULL(tracer)) @@ -454,4 +682,26 @@ void mlx5_fw_tracer_destroy(struct mlx5_fw_tracer *tracer) kfree(tracer); } +void mlx5_fw_tracer_event(struct mlx5_core_dev *dev, struct mlx5_eqe *eqe) +{ + struct mlx5_fw_tracer *tracer = dev->tracer; + + if (!tracer) + return; + + switch (eqe->sub_type) { + case MLX5_TRACER_SUBTYPE_OWNERSHIP_CHANGE: + if (test_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state)) + queue_work(tracer->work_queue, &tracer->ownership_change_work); + break; + case MLX5_TRACER_SUBTYPE_TRACES_AVAILABLE: + if (likely(tracer->str_db.loaded)) + queue_work(tracer->work_queue, &tracer->handle_traces_work); + break; + default: + mlx5_core_dbg(dev, "FWTracer: Event with unrecognized subtype: sub_type %d\n", + eqe->sub_type); + } +} + EXPORT_TRACEPOINT_SYMBOL(mlx5_fw); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.h b/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.h index 66cb7e7ada28..3915e91486b2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.h @@ -43,6 +43,9 @@ #define TRACER_BUFFER_CHUNK 4096 #define TRACE_BUFFER_SIZE_BYTE (TRACER_BUFFER_PAGE_NUM * TRACER_BUFFER_CHUNK) +#define TRACER_BLOCK_SIZE_BYTE 256 +#define TRACES_PER_BLOCK 32 + struct mlx5_fw_tracer { struct mlx5_core_dev *dev; bool owner; @@ -69,8 +72,11 @@ struct mlx5_fw_tracer { dma_addr_t dma; u32 size; struct mlx5_core_mkey mkey; - + u32 consumer_index; } buff; + + u64 last_timestamp; + struct work_struct handle_traces_work; }; enum mlx5_fw_tracer_ownership_state { @@ -78,7 +84,70 @@ enum mlx5_fw_tracer_ownership_state { MLX5_FW_TRACER_ACQUIRE_OWNERSHIP, }; +enum tracer_ctrl_fields_select { + TRACE_STATUS = 1 << 0, +}; + +enum tracer_event_type { + TRACER_EVENT_TYPE_STRING, + TRACER_EVENT_TYPE_TIMESTAMP = 0xFF, + TRACER_EVENT_TYPE_UNRECOGNIZED, +}; + +enum tracing_mode { + TRACE_TO_MEMORY = 1 << 0, +}; + +struct tracer_timestamp_event { + u64 timestamp; + u8 unreliable; +}; + +struct tracer_string_event { + u32 timestamp; + u32 tmsn; + u32 tdsn; + u32 string_param; +}; + +struct tracer_event { + bool lost_event; + u32 type; + u8 event_id; + union { + struct tracer_string_event string_event; + struct tracer_timestamp_event timestamp_event; + }; +}; + +struct mlx5_ifc_tracer_event_bits { + u8 lost[0x1]; + u8 timestamp[0x7]; + u8 event_id[0x8]; + u8 event_data[0x30]; +}; + +struct mlx5_ifc_tracer_string_event_bits { + u8 lost[0x1]; + u8 timestamp[0x7]; + u8 event_id[0x8]; + u8 tmsn[0xd]; + u8 tdsn[0x3]; + u8 string_param[0x20]; +}; + +struct mlx5_ifc_tracer_timestamp_event_bits { + u8 timestamp7_0[0x8]; + u8 event_id[0x8]; + u8 urts[0x3]; + u8 timestamp52_40[0xd]; + u8 timestamp39_8[0x20]; +}; + struct mlx5_fw_tracer *mlx5_fw_tracer_create(struct mlx5_core_dev *dev); +int mlx5_fw_tracer_init(struct mlx5_fw_tracer *tracer); +void mlx5_fw_tracer_cleanup(struct mlx5_fw_tracer *tracer); void mlx5_fw_tracer_destroy(struct mlx5_fw_tracer *tracer); +void mlx5_fw_tracer_event(struct mlx5_core_dev *dev, struct mlx5_eqe *eqe) { return; } #endif diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index 406c23862f5f..7669b4380779 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -40,6 +40,7 @@ #include "mlx5_core.h" #include "fpga/core.h" #include "eswitch.h" +#include "diag/fw_tracer.h" enum { MLX5_EQE_SIZE = sizeof(struct mlx5_eqe), @@ -168,6 +169,8 @@ static const char *eqe_type_str(u8 type) return "MLX5_EVENT_TYPE_FPGA_QP_ERROR"; case MLX5_EVENT_TYPE_GENERAL_EVENT: return "MLX5_EVENT_TYPE_GENERAL_EVENT"; + case MLX5_EVENT_TYPE_DEVICE_TRACER: + return "MLX5_EVENT_TYPE_DEVICE_TRACER"; default: return "Unrecognized event"; } @@ -576,6 +579,11 @@ static irqreturn_t mlx5_eq_int(int irq, void *eq_ptr) case MLX5_EVENT_TYPE_GENERAL_EVENT: general_event_handler(dev, eqe); break; + + case MLX5_EVENT_TYPE_DEVICE_TRACER: + mlx5_fw_tracer_event(dev, eqe); + break; + default: mlx5_core_warn(dev, "Unhandled event 0x%x on EQ 0x%x\n", eqe->type, eq->eqn); @@ -853,6 +861,9 @@ int mlx5_start_eqs(struct mlx5_core_dev *dev) if (MLX5_CAP_GEN(dev, temp_warn_event)) async_event_mask |= (1ull << MLX5_EVENT_TYPE_TEMP_WARN_EVENT); + if (MLX5_CAP_MCAM_REG(dev, tracer_registers)) + async_event_mask |= (1ull << MLX5_EVENT_TYPE_DEVICE_TRACER); + err = mlx5_create_map_eq(dev, &table->cmd_eq, MLX5_EQ_VEC_CMD, MLX5_NUM_CMD_EQE, 1ull << MLX5_EVENT_TYPE_CMD, "mlx5_cmd_eq", MLX5_EQ_TYPE_ASYNC); diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 0566c6a94805..d489494b0a84 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -332,6 +332,13 @@ enum mlx5_event { MLX5_EVENT_TYPE_FPGA_ERROR = 0x20, MLX5_EVENT_TYPE_FPGA_QP_ERROR = 0x21, + + MLX5_EVENT_TYPE_DEVICE_TRACER = 0x26, +}; + +enum { + MLX5_TRACER_SUBTYPE_OWNERSHIP_CHANGE = 0x0, + MLX5_TRACER_SUBTYPE_TRACES_AVAILABLE = 0x1, }; enum { -- cgit v1.2.3-59-g8ed1b From 3730cf4dd70b6a36e48d58a862120311411b77f5 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 24 Jul 2018 12:47:56 +0200 Subject: netlink: do not store start function in netlink_cb ->start() is called once when dump is being initialized, there is no need to store it in netlink_cb. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/linux/netlink.h | 1 - net/netlink/af_netlink.c | 5 ++--- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index f3075d6c7e82..71f121b66ca8 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -170,7 +170,6 @@ netlink_skb_clone(struct sk_buff *skb, gfp_t gfp_mask) struct netlink_callback { struct sk_buff *skb; const struct nlmsghdr *nlh; - int (*start)(struct netlink_callback *); int (*dump)(struct sk_buff * skb, struct netlink_callback *cb); int (*done)(struct netlink_callback *cb); diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 393573a99a5a..f6ac7693d2cc 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2300,7 +2300,6 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, cb = &nlk->cb; memset(cb, 0, sizeof(*cb)); - cb->start = control->start; cb->dump = control->dump; cb->done = control->done; cb->nlh = nlh; @@ -2309,8 +2308,8 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, cb->min_dump_alloc = control->min_dump_alloc; cb->skb = skb; - if (cb->start) { - ret = cb->start(cb); + if (control->start) { + ret = control->start(cb); if (ret) goto error_put; } -- cgit v1.2.3-59-g8ed1b From 3c507b8af638c67d4a80d70091d2057ecb01e8a6 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 23 Jul 2018 21:40:07 +0200 Subject: net: phy: add helper phy_polling_mode Add a helper for checking whether polling is used to detect PHY status changes. Signed-off-by: Heiner Kallweit Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/phy.c | 8 ++++---- include/linux/phy.h | 10 ++++++++++ 2 files changed, 14 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 04780db4c963..1ee25877c4d1 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -525,7 +525,7 @@ static int phy_start_aneg_priv(struct phy_device *phydev, bool sync) * negotiation may already be done and aneg interrupt may not be * generated. */ - if (phydev->irq != PHY_POLL && phydev->state == PHY_AN) { + if (!phy_polling_mode(phydev) && phydev->state == PHY_AN) { err = phy_aneg_done(phydev); if (err > 0) { trigger = true; @@ -983,7 +983,7 @@ void phy_state_machine(struct work_struct *work) needs_aneg = true; break; case PHY_NOLINK: - if (phydev->irq != PHY_POLL) + if (!phy_polling_mode(phydev)) break; err = phy_read_status(phydev); @@ -1024,7 +1024,7 @@ void phy_state_machine(struct work_struct *work) /* Only register a CHANGE if we are polling and link changed * since latest checking. */ - if (phydev->irq == PHY_POLL) { + if (phy_polling_mode(phydev)) { old_link = phydev->link; err = phy_read_status(phydev); if (err) @@ -1123,7 +1123,7 @@ void phy_state_machine(struct work_struct *work) * PHY, if PHY_IGNORE_INTERRUPT is set, then we will be moving * between states from phy_mac_interrupt() */ - if (phydev->irq == PHY_POLL) + if (phy_polling_mode(phydev)) queue_delayed_work(system_power_efficient_wq, &phydev->state_queue, PHY_STATE_TIME * HZ); } diff --git a/include/linux/phy.h b/include/linux/phy.h index 075c2f770d3e..cd6f637cbbfb 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -824,6 +824,16 @@ static inline bool phy_interrupt_is_valid(struct phy_device *phydev) return phydev->irq != PHY_POLL && phydev->irq != PHY_IGNORE_INTERRUPT; } +/** + * phy_polling_mode - Convenience function for testing whether polling is + * used to detect PHY status changes + * @phydev: the phy_device struct + */ +static inline bool phy_polling_mode(struct phy_device *phydev) +{ + return phydev->irq == PHY_POLL; +} + /** * phy_is_internal - Convenience function for testing if a PHY is internal * @phydev: the phy_device struct -- cgit v1.2.3-59-g8ed1b From 038709071328ff68a936c6b8c33a24a805eea3c5 Mon Sep 17 00:00:00 2001 From: Zhu Yi Date: Wed, 13 Jun 2018 16:37:17 +0200 Subject: can: dev: enable multi-queue for SocketCAN devices The existing SocketCAN implementation provides alloc_candev() to allocate a CAN device using a single Tx and Rx queue. This can lead to priority inversion in case the single Tx queue is already full with low priority messages and a high priority message needs to be sent while the bus is fully loaded with medium priority messages. This problem can be solved by using the existing multi-queue support of the network subsytem. The commit makes it possible to use multi-queue in the CAN subsystem in the same way it is used in the Ethernet subsystem by adding an alloc_candev_mqs() call and accompanying macros. With this support a CAN device can use multi-queue qdisc (e.g. mqprio) to avoid the aforementioned priority inversion. The exisiting functionality of alloc_candev() is the same as before. CAN devices need to have prioritized multiple hardware queues or are able to abort waiting for arbitration to make sensible use of multi-queues. Signed-off-by: Zhu Yi Signed-off-by: Mark Jonas Reviewed-by: Heiko Schocher Signed-off-by: Marc Kleine-Budde --- drivers/net/can/dev.c | 8 +++++--- include/linux/can/dev.h | 7 ++++++- 2 files changed, 11 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/can/dev.c b/drivers/net/can/dev.c index 5bb6e8896601..49163570a63a 100644 --- a/drivers/net/can/dev.c +++ b/drivers/net/can/dev.c @@ -701,7 +701,8 @@ EXPORT_SYMBOL_GPL(alloc_can_err_skb); /* * Allocate and setup space for the CAN network device */ -struct net_device *alloc_candev(int sizeof_priv, unsigned int echo_skb_max) +struct net_device *alloc_candev_mqs(int sizeof_priv, unsigned int echo_skb_max, + unsigned int txqs, unsigned int rxqs) { struct net_device *dev; struct can_priv *priv; @@ -713,7 +714,8 @@ struct net_device *alloc_candev(int sizeof_priv, unsigned int echo_skb_max) else size = sizeof_priv; - dev = alloc_netdev(size, "can%d", NET_NAME_UNKNOWN, can_setup); + dev = alloc_netdev_mqs(size, "can%d", NET_NAME_UNKNOWN, can_setup, + txqs, rxqs); if (!dev) return NULL; @@ -732,7 +734,7 @@ struct net_device *alloc_candev(int sizeof_priv, unsigned int echo_skb_max) return dev; } -EXPORT_SYMBOL_GPL(alloc_candev); +EXPORT_SYMBOL_GPL(alloc_candev_mqs); /* * Free space of the CAN network device diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index 055aaf5ed9af..a83e1f632eb7 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -143,7 +143,12 @@ u8 can_dlc2len(u8 can_dlc); /* map the sanitized data length to an appropriate data length code */ u8 can_len2dlc(u8 len); -struct net_device *alloc_candev(int sizeof_priv, unsigned int echo_skb_max); +struct net_device *alloc_candev_mqs(int sizeof_priv, unsigned int echo_skb_max, + unsigned int txqs, unsigned int rxqs); +#define alloc_candev(sizeof_priv, echo_skb_max) \ + alloc_candev_mqs(sizeof_priv, echo_skb_max, 1, 1) +#define alloc_candev_mq(sizeof_priv, echo_skb_max, count) \ + alloc_candev_mqs(sizeof_priv, echo_skb_max, count, count) void free_candev(struct net_device *dev); /* a candev safe wrapper around netdev_priv */ -- cgit v1.2.3-59-g8ed1b From 22a65aa8b1a84ca429c0fc8415dee5681ab36eb3 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Mon, 25 Dec 2017 18:40:52 +0200 Subject: net/mlx5e: Vxlan, check maximum number of UDP ports The NIC has a limited number of offloaded VXLAN UDP ports (usually 4). Instead of letting the firmware fail when trying to add more ports than it can handle, let the driver check it on its own. Signed-off-by: Gal Pressman Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 1 + drivers/net/ethernet/mellanox/mlx5/core/vxlan.c | 14 ++++++++++++++ include/linux/mlx5/mlx5_ifc.h | 4 +++- 3 files changed, 18 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index c41cfc2a4b70..c4d4db8722f5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -657,6 +657,7 @@ enum { struct mlx5e_vxlan_db { spinlock_t lock; /* protect vxlan table */ struct radix_tree_root tree; + int num_ports; }; struct mlx5e_l2_rule { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c index 2f699998d13e..e3af2efe18ce 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c @@ -52,6 +52,11 @@ void mlx5e_vxlan_init(struct mlx5e_priv *priv) mlx5e_vxlan_add_port(priv, 4789); } +static inline u8 mlx5e_vxlan_max_udp_ports(struct mlx5_core_dev *mdev) +{ + return MLX5_CAP_ETH(mdev, max_vxlan_udp_ports) ?: 4; +} + static int mlx5e_vxlan_core_add_port_cmd(struct mlx5_core_dev *mdev, u16 port) { u32 in[MLX5_ST_SZ_DW(add_vxlan_udp_dport_in)] = {0}; @@ -98,6 +103,13 @@ static void mlx5e_vxlan_add_port(struct mlx5e_priv *priv, u16 port) return; } + if (vxlan_db->num_ports >= mlx5e_vxlan_max_udp_ports(priv->mdev)) { + netdev_info(priv->netdev, + "UDP port (%d) not offloaded, max number of UDP ports (%d) are already offloaded\n", + port, mlx5e_vxlan_max_udp_ports(priv->mdev)); + return; + } + if (mlx5e_vxlan_core_add_port_cmd(priv->mdev, port)) return; @@ -114,6 +126,7 @@ static void mlx5e_vxlan_add_port(struct mlx5e_priv *priv, u16 port) if (err) goto err_free; + vxlan_db->num_ports++; return; err_free: @@ -163,6 +176,7 @@ out_unlock: if (remove) { mlx5e_vxlan_core_del_port_cmd(priv->mdev, port); kfree(vxlan); + vxlan_db->num_ports--; } mutex_unlock(&priv->state_lock); kfree(vxlan_work); diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 22f54bedfaae..60c2308fe062 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -668,7 +668,9 @@ struct mlx5_ifc_per_protocol_networking_offload_caps_bits { u8 swp[0x1]; u8 swp_csum[0x1]; u8 swp_lso[0x1]; - u8 reserved_at_23[0x1b]; + u8 reserved_at_23[0xd]; + u8 max_vxlan_udp_ports[0x8]; + u8 reserved_at_38[0x6]; u8 max_geneve_opt_len[0x1]; u8 tunnel_stateless_geneve_rx[0x1]; -- cgit v1.2.3-59-g8ed1b From 358aa5ce288aa1085f0f3ef9f315119563fa6541 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Wed, 9 May 2018 13:28:00 -0700 Subject: net/mlx5e: Vxlan, move vxlan logic to core driver Move vxlan logic and objects to mlx5 core dirver. Since it going to be used from different mlx5 interfaces. e.g. mlx5e PF NIC netdev and mlx5e E-Switch representors. Signed-off-by: Saeed Mahameed Reviewed-by: Or Gerlitz --- drivers/net/ethernet/mellanox/mlx5/core/Makefile | 4 +- drivers/net/ethernet/mellanox/mlx5/core/en.h | 2 - drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 21 +- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 6 +- .../net/ethernet/mellanox/mlx5/core/lib/vxlan.c | 230 +++++++++++++++++++++ .../net/ethernet/mellanox/mlx5/core/lib/vxlan.h | 64 ++++++ drivers/net/ethernet/mellanox/mlx5/core/main.c | 5 + drivers/net/ethernet/mellanox/mlx5/core/vxlan.c | 230 --------------------- drivers/net/ethernet/mellanox/mlx5/core/vxlan.h | 64 ------ include/linux/mlx5/driver.h | 2 + 10 files changed, 315 insertions(+), 313 deletions(-) create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.h delete mode 100644 drivers/net/ethernet/mellanox/mlx5/core/vxlan.c delete mode 100644 drivers/net/ethernet/mellanox/mlx5/core/vxlan.h (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile index ae2bdcb1647c..f20fda1ced4f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile @@ -14,8 +14,8 @@ mlx5_core-$(CONFIG_MLX5_FPGA) += fpga/cmd.o fpga/core.o fpga/conn.o fpga/sdk.o \ fpga/ipsec.o fpga/tls.o mlx5_core-$(CONFIG_MLX5_CORE_EN) += en_main.o en_common.o en_fs.o en_ethtool.o \ - en_tx.o en_rx.o en_dim.o en_txrx.o en/xdp.o en_stats.o vxlan.o \ - en_arfs.o en_fs_ethtool.o en_selftest.o en/port.o + en_tx.o en_rx.o en_dim.o en_txrx.o en/xdp.o en_stats.o \ + en_arfs.o en_fs_ethtool.o en_selftest.o en/port.o lib/vxlan.o mlx5_core-$(CONFIG_MLX5_MPFS) += lib/mpfs.o diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 1bd4536b9061..c7ed3d20fd54 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -52,7 +52,6 @@ #include "wq.h" #include "mlx5_core.h" #include "en_stats.h" -#include "vxlan.h" struct page_pool; @@ -812,7 +811,6 @@ struct mlx5e_priv { u32 tx_rates[MLX5E_MAX_NUM_SQS]; struct mlx5e_flow_steering fs; - struct mlx5_vxlan *vxlan; struct workqueue_struct *wq; struct work_struct update_carrier_work; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index ef4b2f0c427c..fde35021a257 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -45,7 +45,7 @@ #include "en_accel/tls.h" #include "accel/ipsec.h" #include "accel/tls.h" -#include "vxlan.h" +#include "lib/vxlan.h" #include "en/port.h" #include "en/xdp.h" @@ -2974,7 +2974,7 @@ int mlx5e_open(struct net_device *netdev) mlx5_set_port_admin_status(priv->mdev, MLX5_PORT_UP); mutex_unlock(&priv->state_lock); - if (mlx5_vxlan_allowed(priv->vxlan)) + if (mlx5_vxlan_allowed(priv->mdev->vxlan)) udp_tunnel_get_rx_info(netdev); return err; @@ -3983,7 +3983,7 @@ static void mlx5e_vxlan_add_work(struct work_struct *work) u16 port = vxlan_work->port; mutex_lock(&priv->state_lock); - mlx5_vxlan_add_port(priv->vxlan, port); + mlx5_vxlan_add_port(priv->mdev->vxlan, port); mutex_unlock(&priv->state_lock); kfree(vxlan_work); @@ -3997,7 +3997,7 @@ static void mlx5e_vxlan_del_work(struct work_struct *work) u16 port = vxlan_work->port; mutex_lock(&priv->state_lock); - mlx5_vxlan_del_port(priv->vxlan, port); + mlx5_vxlan_del_port(priv->mdev->vxlan, port); mutex_unlock(&priv->state_lock); kfree(vxlan_work); } @@ -4028,7 +4028,7 @@ static void mlx5e_add_vxlan_port(struct net_device *netdev, if (ti->type != UDP_TUNNEL_TYPE_VXLAN) return; - if (!mlx5_vxlan_allowed(priv->vxlan)) + if (!mlx5_vxlan_allowed(priv->mdev->vxlan)) return; mlx5e_vxlan_queue_work(priv, be16_to_cpu(ti->port), 1); @@ -4042,7 +4042,7 @@ static void mlx5e_del_vxlan_port(struct net_device *netdev, if (ti->type != UDP_TUNNEL_TYPE_VXLAN) return; - if (!mlx5_vxlan_allowed(priv->vxlan)) + if (!mlx5_vxlan_allowed(priv->mdev->vxlan)) return; mlx5e_vxlan_queue_work(priv, be16_to_cpu(ti->port), 0); @@ -4076,7 +4076,7 @@ static netdev_features_t mlx5e_tunnel_features_check(struct mlx5e_priv *priv, port = be16_to_cpu(udph->dest); /* Verify if UDP port is being offloaded by HW */ - if (mlx5_vxlan_lookup_port(priv->vxlan, port)) + if (mlx5_vxlan_lookup_port(priv->mdev->vxlan, port)) return features; } @@ -4648,7 +4648,7 @@ static void mlx5e_build_nic_netdev(struct net_device *netdev) netdev->hw_features |= NETIF_F_HW_VLAN_CTAG_FILTER; netdev->hw_features |= NETIF_F_HW_VLAN_STAG_TX; - if (mlx5_vxlan_allowed(priv->vxlan) || MLX5_CAP_ETH(mdev, tunnel_stateless_gre)) { + if (mlx5_vxlan_allowed(mdev->vxlan) || MLX5_CAP_ETH(mdev, tunnel_stateless_gre)) { netdev->hw_enc_features |= NETIF_F_IP_CSUM; netdev->hw_enc_features |= NETIF_F_IPV6_CSUM; netdev->hw_enc_features |= NETIF_F_TSO; @@ -4656,7 +4656,7 @@ static void mlx5e_build_nic_netdev(struct net_device *netdev) netdev->hw_enc_features |= NETIF_F_GSO_PARTIAL; } - if (mlx5_vxlan_allowed(priv->vxlan)) { + if (mlx5_vxlan_allowed(mdev->vxlan)) { netdev->hw_features |= NETIF_F_GSO_UDP_TUNNEL | NETIF_F_GSO_UDP_TUNNEL_CSUM; netdev->hw_enc_features |= NETIF_F_GSO_UDP_TUNNEL | @@ -4758,8 +4758,6 @@ static void mlx5e_nic_init(struct mlx5_core_dev *mdev, struct mlx5e_priv *priv = netdev_priv(netdev); int err; - priv->vxlan = mlx5_vxlan_create(mdev); - mlx5e_build_nic_netdev_priv(mdev, netdev, profile, ppriv); err = mlx5e_ipsec_init(priv); if (err) @@ -4773,7 +4771,6 @@ static void mlx5e_nic_init(struct mlx5_core_dev *mdev, static void mlx5e_nic_cleanup(struct mlx5e_priv *priv) { - mlx5_vxlan_destroy(priv->vxlan); mlx5e_tls_cleanup(priv); mlx5e_ipsec_cleanup(priv); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 1b4931b62094..288a57f76e84 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -50,7 +50,7 @@ #include "en_rep.h" #include "en_tc.h" #include "eswitch.h" -#include "vxlan.h" +#include "lib/vxlan.h" #include "fs_core.h" #include "en/port.h" @@ -1133,7 +1133,7 @@ static int parse_tunnel_attr(struct mlx5e_priv *priv, if (memchr_inv(&mask->dst, 0xff, sizeof(mask->dst))) goto vxlan_match_offload_err; - if (mlx5_vxlan_lookup_port(up_priv->vxlan, be16_to_cpu(key->dst)) && + if (mlx5_vxlan_lookup_port(up_priv->mdev->vxlan, be16_to_cpu(key->dst)) && MLX5_CAP_ESW(priv->mdev, vxlan_encap_decap)) parse_vxlan_attr(spec, f); else { @@ -2557,7 +2557,7 @@ vxlan_encap_offload_err: return -EOPNOTSUPP; } - if (mlx5_vxlan_lookup_port(up_priv->vxlan, be16_to_cpu(key->tp_dst)) && + if (mlx5_vxlan_lookup_port(up_priv->mdev->vxlan, be16_to_cpu(key->tp_dst)) && MLX5_CAP_ESW(priv->mdev, vxlan_encap_decap)) { tunnel_type = MLX5_HEADER_TYPE_VXLAN; } else { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.c new file mode 100644 index 000000000000..9a8fd762167b --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.c @@ -0,0 +1,230 @@ +/* + * Copyright (c) 2016, Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include "mlx5_core.h" +#include "vxlan.h" + +struct mlx5_vxlan { + struct mlx5_core_dev *mdev; + spinlock_t lock; /* protect vxlan table */ + /* max_num_ports is usuallly 4, 16 buckets is more than enough */ + DECLARE_HASHTABLE(htable, 4); + int num_ports; + struct mutex sync_lock; /* sync add/del port HW operations */ +}; + +struct mlx5_vxlan_port { + struct hlist_node hlist; + atomic_t refcount; + u16 udp_port; +}; + +static inline u8 mlx5_vxlan_max_udp_ports(struct mlx5_core_dev *mdev) +{ + return MLX5_CAP_ETH(mdev, max_vxlan_udp_ports) ?: 4; +} + +static int mlx5_vxlan_core_add_port_cmd(struct mlx5_core_dev *mdev, u16 port) +{ + u32 in[MLX5_ST_SZ_DW(add_vxlan_udp_dport_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(add_vxlan_udp_dport_out)] = {0}; + + MLX5_SET(add_vxlan_udp_dport_in, in, opcode, + MLX5_CMD_OP_ADD_VXLAN_UDP_DPORT); + MLX5_SET(add_vxlan_udp_dport_in, in, vxlan_udp_port, port); + return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); +} + +static int mlx5_vxlan_core_del_port_cmd(struct mlx5_core_dev *mdev, u16 port) +{ + u32 in[MLX5_ST_SZ_DW(delete_vxlan_udp_dport_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(delete_vxlan_udp_dport_out)] = {0}; + + MLX5_SET(delete_vxlan_udp_dport_in, in, opcode, + MLX5_CMD_OP_DELETE_VXLAN_UDP_DPORT); + MLX5_SET(delete_vxlan_udp_dport_in, in, vxlan_udp_port, port); + return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); +} + +static struct mlx5_vxlan_port* +mlx5_vxlan_lookup_port_locked(struct mlx5_vxlan *vxlan, u16 port) +{ + struct mlx5_vxlan_port *vxlanp; + + hash_for_each_possible(vxlan->htable, vxlanp, hlist, port) { + if (vxlanp->udp_port == port) + return vxlanp; + } + + return NULL; +} + +struct mlx5_vxlan_port *mlx5_vxlan_lookup_port(struct mlx5_vxlan *vxlan, u16 port) +{ + struct mlx5_vxlan_port *vxlanp; + + if (!mlx5_vxlan_allowed(vxlan)) + return NULL; + + spin_lock_bh(&vxlan->lock); + vxlanp = mlx5_vxlan_lookup_port_locked(vxlan, port); + spin_unlock_bh(&vxlan->lock); + + return vxlanp; +} + +int mlx5_vxlan_add_port(struct mlx5_vxlan *vxlan, u16 port) +{ + struct mlx5_vxlan_port *vxlanp; + int ret = -ENOSPC; + + vxlanp = mlx5_vxlan_lookup_port(vxlan, port); + if (vxlanp) { + atomic_inc(&vxlanp->refcount); + return 0; + } + + mutex_lock(&vxlan->sync_lock); + if (vxlan->num_ports >= mlx5_vxlan_max_udp_ports(vxlan->mdev)) { + mlx5_core_info(vxlan->mdev, + "UDP port (%d) not offloaded, max number of UDP ports (%d) are already offloaded\n", + port, mlx5_vxlan_max_udp_ports(vxlan->mdev)); + ret = -ENOSPC; + goto unlock; + } + + ret = mlx5_vxlan_core_add_port_cmd(vxlan->mdev, port); + if (ret) + goto unlock; + + vxlanp = kzalloc(sizeof(*vxlanp), GFP_KERNEL); + if (!vxlanp) { + ret = -ENOMEM; + goto err_delete_port; + } + + vxlanp->udp_port = port; + atomic_set(&vxlanp->refcount, 1); + + spin_lock_bh(&vxlan->lock); + hash_add(vxlan->htable, &vxlanp->hlist, port); + spin_unlock_bh(&vxlan->lock); + + vxlan->num_ports++; + mutex_unlock(&vxlan->sync_lock); + return 0; + +err_delete_port: + mlx5_vxlan_core_del_port_cmd(vxlan->mdev, port); + +unlock: + mutex_unlock(&vxlan->sync_lock); + return ret; +} + +int mlx5_vxlan_del_port(struct mlx5_vxlan *vxlan, u16 port) +{ + struct mlx5_vxlan_port *vxlanp; + bool remove = false; + int ret = 0; + + mutex_lock(&vxlan->sync_lock); + + spin_lock_bh(&vxlan->lock); + vxlanp = mlx5_vxlan_lookup_port_locked(vxlan, port); + if (!vxlanp) { + ret = -ENOENT; + goto out_unlock; + } + + if (atomic_dec_and_test(&vxlanp->refcount)) { + hash_del(&vxlanp->hlist); + remove = true; + } + +out_unlock: + spin_unlock_bh(&vxlan->lock); + + if (remove) { + mlx5_vxlan_core_del_port_cmd(vxlan->mdev, port); + kfree(vxlanp); + vxlan->num_ports--; + } + + mutex_unlock(&vxlan->sync_lock); + + return ret; +} + +struct mlx5_vxlan *mlx5_vxlan_create(struct mlx5_core_dev *mdev) +{ + struct mlx5_vxlan *vxlan; + + if (!MLX5_CAP_ETH(mdev, tunnel_stateless_vxlan) || !mlx5_core_is_pf(mdev)) + return ERR_PTR(-ENOTSUPP); + + vxlan = kzalloc(sizeof(*vxlan), GFP_KERNEL); + if (!vxlan) + return ERR_PTR(-ENOMEM); + + vxlan->mdev = mdev; + mutex_init(&vxlan->sync_lock); + spin_lock_init(&vxlan->lock); + hash_init(vxlan->htable); + + /* Hardware adds 4789 by default */ + mlx5_vxlan_add_port(vxlan, 4789); + + return vxlan; +} + +void mlx5_vxlan_destroy(struct mlx5_vxlan *vxlan) +{ + struct mlx5_vxlan_port *vxlanp; + struct hlist_node *tmp; + int bkt; + + if (!mlx5_vxlan_allowed(vxlan)) + return; + + /* Lockless since we are the only hash table consumers*/ + hash_for_each_safe(vxlan->htable, bkt, tmp, vxlanp, hlist) { + hash_del(&vxlanp->hlist); + mlx5_vxlan_core_del_port_cmd(vxlan->mdev, vxlanp->udp_port); + kfree(vxlanp); + } + + kfree(vxlan); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.h new file mode 100644 index 000000000000..fd874a30c4d0 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2016, Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __MLX5_VXLAN_H__ +#define __MLX5_VXLAN_H__ + +#include + +struct mlx5_vxlan; +struct mlx5_vxlan_port; + +#ifdef CONFIG_MLX5_CORE_EN + +static inline bool mlx5_vxlan_allowed(struct mlx5_vxlan *vxlan) +{ + /* not allowed reason is encoded in vxlan pointer as error, + * on mlx5_vxlan_create + */ + return !IS_ERR_OR_NULL(vxlan); +} + +struct mlx5_vxlan *mlx5_vxlan_create(struct mlx5_core_dev *mdev); +void mlx5_vxlan_destroy(struct mlx5_vxlan *vxlan); +int mlx5_vxlan_add_port(struct mlx5_vxlan *vxlan, u16 port); +int mlx5_vxlan_del_port(struct mlx5_vxlan *vxlan, u16 port); +struct mlx5_vxlan_port *mlx5_vxlan_lookup_port(struct mlx5_vxlan *vxlan, u16 port); + +#else + +static inline struct mlx5_vxlan* +mlx5_vxlan_create(struct mlx5_core_dev *mdev) { return ERR_PTR(-ENOTSUPP); } +static inline void mlx5_vxlan_destroy(struct mlx5_vxlan *vxlan) { return; } + +#endif + +#endif /* __MLX5_VXLAN_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 6ddbb70e95de..03b9c6733eed 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -62,6 +62,7 @@ #include "accel/ipsec.h" #include "accel/tls.h" #include "lib/clock.h" +#include "lib/vxlan.h" #include "diag/fw_tracer.h" MODULE_AUTHOR("Eli Cohen "); @@ -961,6 +962,8 @@ static int mlx5_init_once(struct mlx5_core_dev *dev, struct mlx5_priv *priv) mlx5_init_clock(dev); + dev->vxlan = mlx5_vxlan_create(dev); + err = mlx5_init_rl_table(dev); if (err) { dev_err(&pdev->dev, "Failed to init rate limiting\n"); @@ -1004,6 +1007,7 @@ err_mpfs_cleanup: err_rl_cleanup: mlx5_cleanup_rl_table(dev); err_tables_cleanup: + mlx5_vxlan_destroy(dev->vxlan); mlx5_cleanup_mkey_table(dev); mlx5_cleanup_srq_table(dev); mlx5_cleanup_qp_table(dev); @@ -1024,6 +1028,7 @@ static void mlx5_cleanup_once(struct mlx5_core_dev *dev) mlx5_eswitch_cleanup(dev->priv.eswitch); mlx5_mpfs_cleanup(dev); mlx5_cleanup_rl_table(dev); + mlx5_vxlan_destroy(dev->vxlan); mlx5_cleanup_clock(dev); mlx5_cleanup_reserved_gids(dev); mlx5_cleanup_mkey_table(dev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c deleted file mode 100644 index 9a8fd762167b..000000000000 --- a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c +++ /dev/null @@ -1,230 +0,0 @@ -/* - * Copyright (c) 2016, Mellanox Technologies, Ltd. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include -#include -#include -#include "mlx5_core.h" -#include "vxlan.h" - -struct mlx5_vxlan { - struct mlx5_core_dev *mdev; - spinlock_t lock; /* protect vxlan table */ - /* max_num_ports is usuallly 4, 16 buckets is more than enough */ - DECLARE_HASHTABLE(htable, 4); - int num_ports; - struct mutex sync_lock; /* sync add/del port HW operations */ -}; - -struct mlx5_vxlan_port { - struct hlist_node hlist; - atomic_t refcount; - u16 udp_port; -}; - -static inline u8 mlx5_vxlan_max_udp_ports(struct mlx5_core_dev *mdev) -{ - return MLX5_CAP_ETH(mdev, max_vxlan_udp_ports) ?: 4; -} - -static int mlx5_vxlan_core_add_port_cmd(struct mlx5_core_dev *mdev, u16 port) -{ - u32 in[MLX5_ST_SZ_DW(add_vxlan_udp_dport_in)] = {0}; - u32 out[MLX5_ST_SZ_DW(add_vxlan_udp_dport_out)] = {0}; - - MLX5_SET(add_vxlan_udp_dport_in, in, opcode, - MLX5_CMD_OP_ADD_VXLAN_UDP_DPORT); - MLX5_SET(add_vxlan_udp_dport_in, in, vxlan_udp_port, port); - return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); -} - -static int mlx5_vxlan_core_del_port_cmd(struct mlx5_core_dev *mdev, u16 port) -{ - u32 in[MLX5_ST_SZ_DW(delete_vxlan_udp_dport_in)] = {0}; - u32 out[MLX5_ST_SZ_DW(delete_vxlan_udp_dport_out)] = {0}; - - MLX5_SET(delete_vxlan_udp_dport_in, in, opcode, - MLX5_CMD_OP_DELETE_VXLAN_UDP_DPORT); - MLX5_SET(delete_vxlan_udp_dport_in, in, vxlan_udp_port, port); - return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); -} - -static struct mlx5_vxlan_port* -mlx5_vxlan_lookup_port_locked(struct mlx5_vxlan *vxlan, u16 port) -{ - struct mlx5_vxlan_port *vxlanp; - - hash_for_each_possible(vxlan->htable, vxlanp, hlist, port) { - if (vxlanp->udp_port == port) - return vxlanp; - } - - return NULL; -} - -struct mlx5_vxlan_port *mlx5_vxlan_lookup_port(struct mlx5_vxlan *vxlan, u16 port) -{ - struct mlx5_vxlan_port *vxlanp; - - if (!mlx5_vxlan_allowed(vxlan)) - return NULL; - - spin_lock_bh(&vxlan->lock); - vxlanp = mlx5_vxlan_lookup_port_locked(vxlan, port); - spin_unlock_bh(&vxlan->lock); - - return vxlanp; -} - -int mlx5_vxlan_add_port(struct mlx5_vxlan *vxlan, u16 port) -{ - struct mlx5_vxlan_port *vxlanp; - int ret = -ENOSPC; - - vxlanp = mlx5_vxlan_lookup_port(vxlan, port); - if (vxlanp) { - atomic_inc(&vxlanp->refcount); - return 0; - } - - mutex_lock(&vxlan->sync_lock); - if (vxlan->num_ports >= mlx5_vxlan_max_udp_ports(vxlan->mdev)) { - mlx5_core_info(vxlan->mdev, - "UDP port (%d) not offloaded, max number of UDP ports (%d) are already offloaded\n", - port, mlx5_vxlan_max_udp_ports(vxlan->mdev)); - ret = -ENOSPC; - goto unlock; - } - - ret = mlx5_vxlan_core_add_port_cmd(vxlan->mdev, port); - if (ret) - goto unlock; - - vxlanp = kzalloc(sizeof(*vxlanp), GFP_KERNEL); - if (!vxlanp) { - ret = -ENOMEM; - goto err_delete_port; - } - - vxlanp->udp_port = port; - atomic_set(&vxlanp->refcount, 1); - - spin_lock_bh(&vxlan->lock); - hash_add(vxlan->htable, &vxlanp->hlist, port); - spin_unlock_bh(&vxlan->lock); - - vxlan->num_ports++; - mutex_unlock(&vxlan->sync_lock); - return 0; - -err_delete_port: - mlx5_vxlan_core_del_port_cmd(vxlan->mdev, port); - -unlock: - mutex_unlock(&vxlan->sync_lock); - return ret; -} - -int mlx5_vxlan_del_port(struct mlx5_vxlan *vxlan, u16 port) -{ - struct mlx5_vxlan_port *vxlanp; - bool remove = false; - int ret = 0; - - mutex_lock(&vxlan->sync_lock); - - spin_lock_bh(&vxlan->lock); - vxlanp = mlx5_vxlan_lookup_port_locked(vxlan, port); - if (!vxlanp) { - ret = -ENOENT; - goto out_unlock; - } - - if (atomic_dec_and_test(&vxlanp->refcount)) { - hash_del(&vxlanp->hlist); - remove = true; - } - -out_unlock: - spin_unlock_bh(&vxlan->lock); - - if (remove) { - mlx5_vxlan_core_del_port_cmd(vxlan->mdev, port); - kfree(vxlanp); - vxlan->num_ports--; - } - - mutex_unlock(&vxlan->sync_lock); - - return ret; -} - -struct mlx5_vxlan *mlx5_vxlan_create(struct mlx5_core_dev *mdev) -{ - struct mlx5_vxlan *vxlan; - - if (!MLX5_CAP_ETH(mdev, tunnel_stateless_vxlan) || !mlx5_core_is_pf(mdev)) - return ERR_PTR(-ENOTSUPP); - - vxlan = kzalloc(sizeof(*vxlan), GFP_KERNEL); - if (!vxlan) - return ERR_PTR(-ENOMEM); - - vxlan->mdev = mdev; - mutex_init(&vxlan->sync_lock); - spin_lock_init(&vxlan->lock); - hash_init(vxlan->htable); - - /* Hardware adds 4789 by default */ - mlx5_vxlan_add_port(vxlan, 4789); - - return vxlan; -} - -void mlx5_vxlan_destroy(struct mlx5_vxlan *vxlan) -{ - struct mlx5_vxlan_port *vxlanp; - struct hlist_node *tmp; - int bkt; - - if (!mlx5_vxlan_allowed(vxlan)) - return; - - /* Lockless since we are the only hash table consumers*/ - hash_for_each_safe(vxlan->htable, bkt, tmp, vxlanp, hlist) { - hash_del(&vxlanp->hlist); - mlx5_vxlan_core_del_port_cmd(vxlan->mdev, vxlanp->udp_port); - kfree(vxlanp); - } - - kfree(vxlan); -} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.h b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.h deleted file mode 100644 index fd874a30c4d0..000000000000 --- a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2016, Mellanox Technologies, Ltd. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef __MLX5_VXLAN_H__ -#define __MLX5_VXLAN_H__ - -#include - -struct mlx5_vxlan; -struct mlx5_vxlan_port; - -#ifdef CONFIG_MLX5_CORE_EN - -static inline bool mlx5_vxlan_allowed(struct mlx5_vxlan *vxlan) -{ - /* not allowed reason is encoded in vxlan pointer as error, - * on mlx5_vxlan_create - */ - return !IS_ERR_OR_NULL(vxlan); -} - -struct mlx5_vxlan *mlx5_vxlan_create(struct mlx5_core_dev *mdev); -void mlx5_vxlan_destroy(struct mlx5_vxlan *vxlan); -int mlx5_vxlan_add_port(struct mlx5_vxlan *vxlan, u16 port); -int mlx5_vxlan_del_port(struct mlx5_vxlan *vxlan, u16 port); -struct mlx5_vxlan_port *mlx5_vxlan_lookup_port(struct mlx5_vxlan *vxlan, u16 port); - -#else - -static inline struct mlx5_vxlan* -mlx5_vxlan_create(struct mlx5_core_dev *mdev) { return ERR_PTR(-ENOTSUPP); } -static inline void mlx5_vxlan_destroy(struct mlx5_vxlan *vxlan) { return; } - -#endif - -#endif /* __MLX5_VXLAN_H__ */ diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index fd0aaa5568fe..54f385cc8811 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -818,6 +818,7 @@ struct mlx5_clock { }; struct mlx5_fw_tracer; +struct mlx5_vxlan; struct mlx5_core_dev { struct pci_dev *pdev; @@ -850,6 +851,7 @@ struct mlx5_core_dev { atomic_t num_qps; u32 issi; struct mlx5e_resources mlx5e_res; + struct mlx5_vxlan *vxlan; struct { struct mlx5_rsvd_gids reserved_gids; u32 roce_en; -- cgit v1.2.3-59-g8ed1b From 5cbf777cfdf6e5a7b7149006e4881a255da78fdd Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 27 Jul 2018 16:37:28 +0800 Subject: route: add support for directed broadcast forwarding This patch implements the feature described in rfc1812#section-5.3.5.2 and rfc2644. It allows the router to forward directed broadcast when sysctl bc_forwarding is enabled. Note that this feature could be done by iptables -j TEE, but it would cause some problems: - target TEE's gateway param has to be set with a specific address, and it's not flexible especially when the route wants forward all directed broadcasts. - this duplicates the directed broadcasts so this may cause side effects to applications. Besides, to keep consistent with other os router like BSD, it's also necessary to implement it in the route rx path. Note that route cache needs to be flushed when bc_forwarding is changed. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/linux/inetdevice.h | 1 + include/uapi/linux/ip.h | 1 + include/uapi/linux/netconf.h | 1 + net/ipv4/devinet.c | 11 +++++++++++ net/ipv4/route.c | 6 +++++- 5 files changed, 19 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h index 27650f1bff3d..c759d1cbcedd 100644 --- a/include/linux/inetdevice.h +++ b/include/linux/inetdevice.h @@ -93,6 +93,7 @@ static inline void ipv4_devconf_setall(struct in_device *in_dev) #define IN_DEV_FORWARD(in_dev) IN_DEV_CONF_GET((in_dev), FORWARDING) #define IN_DEV_MFORWARD(in_dev) IN_DEV_ANDCONF((in_dev), MC_FORWARDING) +#define IN_DEV_BFORWARD(in_dev) IN_DEV_ANDCONF((in_dev), BC_FORWARDING) #define IN_DEV_RPFILTER(in_dev) IN_DEV_MAXCONF((in_dev), RP_FILTER) #define IN_DEV_SRC_VMARK(in_dev) IN_DEV_ORCONF((in_dev), SRC_VMARK) #define IN_DEV_SOURCE_ROUTE(in_dev) IN_DEV_ANDCONF((in_dev), \ diff --git a/include/uapi/linux/ip.h b/include/uapi/linux/ip.h index b24a742beae5..e42d13b55cf3 100644 --- a/include/uapi/linux/ip.h +++ b/include/uapi/linux/ip.h @@ -168,6 +168,7 @@ enum IPV4_DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN, IPV4_DEVCONF_DROP_UNICAST_IN_L2_MULTICAST, IPV4_DEVCONF_DROP_GRATUITOUS_ARP, + IPV4_DEVCONF_BC_FORWARDING, __IPV4_DEVCONF_MAX }; diff --git a/include/uapi/linux/netconf.h b/include/uapi/linux/netconf.h index c84fcdfca862..fac4edd55379 100644 --- a/include/uapi/linux/netconf.h +++ b/include/uapi/linux/netconf.h @@ -18,6 +18,7 @@ enum { NETCONFA_PROXY_NEIGH, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, NETCONFA_INPUT, + NETCONFA_BC_FORWARDING, __NETCONFA_MAX }; #define NETCONFA_MAX (__NETCONFA_MAX - 1) diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index d7585ab1a77a..ea4bd8a52422 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1827,6 +1827,8 @@ static int inet_netconf_msgsize_devconf(int type) size += nla_total_size(4); if (all || type == NETCONFA_MC_FORWARDING) size += nla_total_size(4); + if (all || type == NETCONFA_BC_FORWARDING) + size += nla_total_size(4); if (all || type == NETCONFA_PROXY_NEIGH) size += nla_total_size(4); if (all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) @@ -1873,6 +1875,10 @@ static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex, nla_put_s32(skb, NETCONFA_MC_FORWARDING, IPV4_DEVCONF(*devconf, MC_FORWARDING)) < 0) goto nla_put_failure; + if ((all || type == NETCONFA_BC_FORWARDING) && + nla_put_s32(skb, NETCONFA_BC_FORWARDING, + IPV4_DEVCONF(*devconf, BC_FORWARDING)) < 0) + goto nla_put_failure; if ((all || type == NETCONFA_PROXY_NEIGH) && nla_put_s32(skb, NETCONFA_PROXY_NEIGH, IPV4_DEVCONF(*devconf, PROXY_ARP)) < 0) @@ -2143,6 +2149,10 @@ static int devinet_conf_proc(struct ctl_table *ctl, int write, if ((new_value == 0) && (old_value != 0)) rt_cache_flush(net); + if (i == IPV4_DEVCONF_BC_FORWARDING - 1 && + new_value != old_value) + rt_cache_flush(net); + if (i == IPV4_DEVCONF_RP_FILTER - 1 && new_value != old_value) { ifindex = devinet_conf_ifindex(net, cnf); @@ -2259,6 +2269,7 @@ static struct devinet_sysctl_table { DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding", devinet_sysctl_forward), DEVINET_SYSCTL_RO_ENTRY(MC_FORWARDING, "mc_forwarding"), + DEVINET_SYSCTL_RW_ENTRY(BC_FORWARDING, "bc_forwarding"), DEVINET_SYSCTL_RW_ENTRY(ACCEPT_REDIRECTS, "accept_redirects"), DEVINET_SYSCTL_RW_ENTRY(SECURE_REDIRECTS, "secure_redirects"), diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 1df6e97106d7..b678466da451 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1996,8 +1996,11 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, goto no_route; } - if (res->type == RTN_BROADCAST) + if (res->type == RTN_BROADCAST) { + if (IN_DEV_BFORWARD(in_dev)) + goto make_route; goto brd_input; + } if (res->type == RTN_LOCAL) { err = fib_validate_source(skb, saddr, daddr, tos, @@ -2014,6 +2017,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (res->type != RTN_UNICAST) goto martian_destination; +make_route: err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys); out: return err; -- cgit v1.2.3-59-g8ed1b From 7a4c53bee3324ac00bf964aa2f82d15d279e86e4 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Fri, 27 Jul 2018 13:43:23 -0700 Subject: net: report invalid mtu value via netlink extack If an invalid MTU value is set through rtnetlink return extra error information instead of putting message in kernel log. For other cases where there is no visible API, keep the error report in the log. Example: # ip li set dev enp12s0 mtu 10000 Error: mtu greater than device maximum. # ifconfig enp12s0 mtu 10000 SIOCSIFMTU: Invalid argument # dmesg | tail -1 [ 2047.795467] enp12s0: mtu greater than device maximum Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 ++ net/core/dev.c | 23 +++++++++++++++++------ net/core/rtnetlink.c | 2 +- 3 files changed, 20 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c1295c7a452e..9c917467a2c7 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3546,6 +3546,8 @@ int dev_set_alias(struct net_device *, const char *, size_t); int dev_get_alias(const struct net_device *, char *, size_t); int dev_change_net_namespace(struct net_device *, struct net *, const char *); int __dev_set_mtu(struct net_device *, int); +int dev_set_mtu_ext(struct net_device *dev, int mtu, + struct netlink_ext_ack *extack); int dev_set_mtu(struct net_device *, int); int dev_change_tx_queue_len(struct net_device *, unsigned long); void dev_set_group(struct net_device *, int); diff --git a/net/core/dev.c b/net/core/dev.c index 87c42c8249ae..89031b5fef9f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7523,13 +7523,15 @@ int __dev_set_mtu(struct net_device *dev, int new_mtu) EXPORT_SYMBOL(__dev_set_mtu); /** - * dev_set_mtu - Change maximum transfer unit + * dev_set_mtu_ext - Change maximum transfer unit * @dev: device * @new_mtu: new transfer unit + * @extack: netlink extended ack * * Change the maximum transfer size of the network device. */ -int dev_set_mtu(struct net_device *dev, int new_mtu) +int dev_set_mtu_ext(struct net_device *dev, int new_mtu, + struct netlink_ext_ack *extack) { int err, orig_mtu; @@ -7538,14 +7540,12 @@ int dev_set_mtu(struct net_device *dev, int new_mtu) /* MTU must be positive, and in range */ if (new_mtu < 0 || new_mtu < dev->min_mtu) { - net_err_ratelimited("%s: Invalid MTU %d requested, hw min %d\n", - dev->name, new_mtu, dev->min_mtu); + NL_SET_ERR_MSG(extack, "mtu less than device minimum"); return -EINVAL; } if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) { - net_err_ratelimited("%s: Invalid MTU %d requested, hw max %d\n", - dev->name, new_mtu, dev->max_mtu); + NL_SET_ERR_MSG(extack, "mtu greater than device maximum"); return -EINVAL; } @@ -7573,6 +7573,17 @@ int dev_set_mtu(struct net_device *dev, int new_mtu) } return err; } + +int dev_set_mtu(struct net_device *dev, int new_mtu) +{ + struct netlink_ext_ack extack; + int err; + + err = dev_set_mtu_ext(dev, new_mtu, &extack); + if (err) + net_err_ratelimited("%s: %s\n", dev->name, extack._msg); + return err; +} EXPORT_SYMBOL(dev_set_mtu); /** diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 510d4f765a13..24431e578310 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2382,7 +2382,7 @@ static int do_setlink(const struct sk_buff *skb, } if (tb[IFLA_MTU]) { - err = dev_set_mtu(dev, nla_get_u32(tb[IFLA_MTU])); + err = dev_set_mtu_ext(dev, nla_get_u32(tb[IFLA_MTU]), extack); if (err < 0) goto errout; status |= DO_SETLINK_MODIFIED; -- cgit v1.2.3-59-g8ed1b From 51c23b47e6b8590ea7a6a6776ffb21810ece73bf Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 13 Jul 2018 14:54:45 +0200 Subject: netfilter: nf_osf: add nf_osf_find() This new function returns the OS genre as a string. Plan is to use to from the new nft_osf extension. Note that this doesn't yet support ttl options, but it could be easily extended to do so. Tested-by: Fernando Fernandez Mancera Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nf_osf.h | 9 +++++++++ net/netfilter/nf_osf.c | 30 ++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netfilter/nf_osf.h b/include/linux/netfilter/nf_osf.h index 0e114c492fb8..aee460fcbd31 100644 --- a/include/linux/netfilter/nf_osf.h +++ b/include/linux/netfilter/nf_osf.h @@ -1,3 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _NFOSF_H +#define _NFOSF_H + #include /* Initial window size option state machine: multiple of mss, mtu or @@ -31,3 +35,8 @@ bool nf_osf_match(const struct sk_buff *skb, u_int8_t family, int hooknum, struct net_device *in, struct net_device *out, const struct nf_osf_info *info, struct net *net, const struct list_head *nf_osf_fingers); + +const char *nf_osf_find(const struct sk_buff *skb, + const struct list_head *nf_osf_fingers); + +#endif /* _NFOSF_H */ diff --git a/net/netfilter/nf_osf.c b/net/netfilter/nf_osf.c index b44d62d5d9a9..f4c75e982902 100644 --- a/net/netfilter/nf_osf.c +++ b/net/netfilter/nf_osf.c @@ -249,4 +249,34 @@ nf_osf_match(const struct sk_buff *skb, u_int8_t family, } EXPORT_SYMBOL_GPL(nf_osf_match); +const char *nf_osf_find(const struct sk_buff *skb, + const struct list_head *nf_osf_fingers) +{ + const struct iphdr *ip = ip_hdr(skb); + const struct nf_osf_user_finger *f; + unsigned char opts[MAX_IPOPTLEN]; + const struct nf_osf_finger *kf; + struct nf_osf_hdr_ctx ctx; + const struct tcphdr *tcp; + const char *genre = NULL; + + memset(&ctx, 0, sizeof(ctx)); + + tcp = nf_osf_hdr_ctx_init(&ctx, skb, ip, opts); + if (!tcp) + return false; + + list_for_each_entry_rcu(kf, &nf_osf_fingers[ctx.df], finger_entry) { + f = &kf->finger; + if (!nf_osf_match_one(skb, f, -1, &ctx)) + continue; + + genre = f->genre; + break; + } + + return genre; +} +EXPORT_SYMBOL_GPL(nf_osf_find); + MODULE_LICENSE("GPL"); -- cgit v1.2.3-59-g8ed1b From c29c2ebd2ae0066c026045a21aa33ccbfcd8bb3c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 30 Jul 2018 20:43:51 -0700 Subject: net: update real_num_rx_queues even when !CONFIG_SYSFS We used to depend on real_num_rx_queues as a upper bound for sanity checks. For AF_XDP socket validation it's useful if the check behaves the same regardless of CONFIG_SYSFS setting. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9c917467a2c7..3bf7e93c9e96 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3431,8 +3431,9 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq); int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq); #else static inline int netif_set_real_num_rx_queues(struct net_device *dev, - unsigned int rxq) + unsigned int rxqs) { + dev->real_num_rx_queues = rxqs; return 0; } #endif -- cgit v1.2.3-59-g8ed1b From 84c6b86875e01a08a0daa6fdd4a01b36bf0bf0b2 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 30 Jul 2018 20:43:53 -0700 Subject: xsk: don't allow umem replace at stack level MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently drivers have to check if they already have a umem installed for a given queue and return an error if so. Make better use of XDP_QUERY_XSK_UMEM and move this functionality to the core. We need to keep rtnl across the calls now. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Björn Töpel Signed-off-by: David S. Miller --- include/linux/netdevice.h | 7 ++++--- net/xdp/xdp_umem.c | 37 ++++++++++++++++++++++++++++--------- 2 files changed, 32 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3bf7e93c9e96..282e2e95ad5b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -872,10 +872,10 @@ struct netdev_bpf { struct { struct bpf_offloaded_map *offmap; }; - /* XDP_SETUP_XSK_UMEM */ + /* XDP_QUERY_XSK_UMEM, XDP_SETUP_XSK_UMEM */ struct { - struct xdp_umem *umem; - u16 queue_id; + struct xdp_umem *umem; /* out for query*/ + u16 queue_id; /* in for query */ } xsk; }; }; @@ -3568,6 +3568,7 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, int fd, u32 flags); u32 __dev_xdp_query(struct net_device *dev, bpf_op_t xdp_op, enum bpf_netdev_command cmd); +int xdp_umem_query(struct net_device *dev, u16 queue_id); int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb); int dev_forward_skb(struct net_device *dev, struct sk_buff *skb); diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index c199d66b5f3f..911ca6d3cb5a 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -11,6 +11,8 @@ #include #include #include +#include +#include #include "xdp_umem.h" #include "xsk_queue.h" @@ -40,6 +42,21 @@ void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs) } } +int xdp_umem_query(struct net_device *dev, u16 queue_id) +{ + struct netdev_bpf bpf; + + ASSERT_RTNL(); + + memset(&bpf, 0, sizeof(bpf)); + bpf.command = XDP_QUERY_XSK_UMEM; + bpf.xsk.queue_id = queue_id; + + if (!dev->netdev_ops->ndo_bpf) + return 0; + return dev->netdev_ops->ndo_bpf(dev, &bpf) ?: !!bpf.xsk.umem; +} + int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, u32 queue_id, u16 flags) { @@ -62,28 +79,30 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, bpf.command = XDP_QUERY_XSK_UMEM; rtnl_lock(); - err = dev->netdev_ops->ndo_bpf(dev, &bpf); - rtnl_unlock(); - - if (err) - return force_zc ? -ENOTSUPP : 0; + err = xdp_umem_query(dev, queue_id); + if (err) { + err = err < 0 ? -ENOTSUPP : -EBUSY; + goto err_rtnl_unlock; + } bpf.command = XDP_SETUP_XSK_UMEM; bpf.xsk.umem = umem; bpf.xsk.queue_id = queue_id; - rtnl_lock(); err = dev->netdev_ops->ndo_bpf(dev, &bpf); - rtnl_unlock(); - if (err) - return force_zc ? err : 0; /* fail or fallback */ + goto err_rtnl_unlock; + rtnl_unlock(); dev_hold(dev); umem->dev = dev; umem->queue_id = queue_id; umem->zc = true; return 0; + +err_rtnl_unlock: + rtnl_unlock(); + return force_zc ? err : 0; /* fail or fallback */ } static void xdp_umem_clear_dev(struct xdp_umem *umem) -- cgit v1.2.3-59-g8ed1b From e6476c21447c4b17c47e476aade6facf050f31e8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 30 Jul 2018 09:45:07 +0200 Subject: net: remove bogus RCU annotations on socket.wq We never use RCU protection for it, just a lot of cargo-cult rcu_deference_protects calls. Note that we do keep the kfree_rcu call for it, as the references through struct sock are RCU protected and thus might require a grace period before freeing. Signed-off-by: Christoph Hellwig Reviewed-by: Eric Dumazet Acked-by: Paul E. McKenney Signed-off-by: David S. Miller --- include/linux/net.h | 2 +- include/net/sock.h | 2 +- net/socket.c | 10 ++++------ 3 files changed, 6 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/net.h b/include/linux/net.h index 6554d3ba4396..e0930678c8bf 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -114,7 +114,7 @@ struct socket { unsigned long flags; - struct socket_wq __rcu *wq; + struct socket_wq *wq; struct file *file; struct sock *sk; diff --git a/include/net/sock.h b/include/net/sock.h index 2afea5d1bdfe..433f45fc2d68 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1788,7 +1788,7 @@ static inline void sock_graft(struct sock *sk, struct socket *parent) { WARN_ON(parent->sk); write_lock_bh(&sk->sk_callback_lock); - sk->sk_wq = parent->wq; + rcu_assign_pointer(sk->sk_wq, parent->wq); parent->sk = sk; sk_set_socket(sk, parent); sk->sk_uid = SOCK_INODE(parent)->i_uid; diff --git a/net/socket.c b/net/socket.c index 5b7df6695f4f..475247e347ae 100644 --- a/net/socket.c +++ b/net/socket.c @@ -251,7 +251,7 @@ static struct inode *sock_alloc_inode(struct super_block *sb) init_waitqueue_head(&wq->wait); wq->fasync_list = NULL; wq->flags = 0; - RCU_INIT_POINTER(ei->socket.wq, wq); + ei->socket.wq = wq; ei->socket.state = SS_UNCONNECTED; ei->socket.flags = 0; @@ -265,11 +265,9 @@ static struct inode *sock_alloc_inode(struct super_block *sb) static void sock_destroy_inode(struct inode *inode) { struct socket_alloc *ei; - struct socket_wq *wq; ei = container_of(inode, struct socket_alloc, vfs_inode); - wq = rcu_dereference_protected(ei->socket.wq, 1); - kfree_rcu(wq, rcu); + kfree_rcu(ei->socket.wq, rcu); kmem_cache_free(sock_inode_cachep, ei); } @@ -603,7 +601,7 @@ static void __sock_release(struct socket *sock, struct inode *inode) module_put(owner); } - if (rcu_dereference_protected(sock->wq, 1)->fasync_list) + if (sock->wq->fasync_list) pr_err("%s: fasync list not empty!\n", __func__); if (!sock->file) { @@ -1181,7 +1179,7 @@ static int sock_fasync(int fd, struct file *filp, int on) return -EINVAL; lock_sock(sk); - wq = rcu_dereference_protected(sock->wq, lockdep_sock_is_held(sk)); + wq = sock->wq; fasync_helper(fd, filp, on, &wq->fasync_list); if (!wq->fasync_list) -- cgit v1.2.3-59-g8ed1b From ba113c3aa79a7f941ac162d05a3620bdc985c58d Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Tue, 31 Jul 2018 17:46:21 -0700 Subject: tcp: add data bytes sent stats Introduce a new TCP stat to record the number of bytes sent (RFC4898 tcpEStatsPerfHCDataOctetsOut) and expose it in both tcp_info (TCP_INFO) and opt_stats (SOF_TIMESTAMPING_OPT_STATS). Signed-off-by: Wei Wang Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Acked-by: Soheil Hassas Yeganeh Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- include/linux/tcp.h | 3 +++ include/uapi/linux/tcp.h | 4 +++- net/ipv4/tcp.c | 6 ++++++ net/ipv4/tcp_output.c | 1 + 4 files changed, 13 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 58a8d7d71354..d0798dcd2cab 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -181,6 +181,9 @@ struct tcp_sock { u32 data_segs_out; /* RFC4898 tcpEStatsPerfDataSegsOut * total number of data segments sent. */ + u64 bytes_sent; /* RFC4898 tcpEStatsPerfHCDataOctetsOut + * total number of data bytes sent. + */ u64 bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked * sum(delta(snd_una)), or how many bytes * were acked. diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index e3f6ed8a7064..1c70ed287c3b 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -235,6 +235,8 @@ struct tcp_info { __u32 tcpi_delivered; __u32 tcpi_delivered_ce; + + __u64 tcpi_bytes_sent; /* RFC4898 tcpEStatsPerfHCDataOctetsOut */ }; /* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */ @@ -257,7 +259,7 @@ enum { TCP_NLA_SND_SSTHRESH, /* Slow start size threshold */ TCP_NLA_DELIVERED, /* Data pkts delivered incl. out-of-order */ TCP_NLA_DELIVERED_CE, /* Like above but only ones w/ CE marks */ - + TCP_NLA_BYTES_SENT, /* Data bytes sent including retransmission */ }; /* for TCP_MD5SIG socket option */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 27bbe6a792b7..873cb9968ff5 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2594,6 +2594,7 @@ int tcp_disconnect(struct sock *sk, int flags) sk->sk_rx_dst = NULL; tcp_saved_syn_free(tp); tp->compressed_ack = 0; + tp->bytes_sent = 0; /* Clean up fastopen related fields */ tcp_free_fastopen_req(tp); @@ -3201,6 +3202,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_delivery_rate = rate64; info->tcpi_delivered = tp->delivered; info->tcpi_delivered_ce = tp->delivered_ce; + info->tcpi_bytes_sent = tp->bytes_sent; unlock_sock_fast(sk, slow); } EXPORT_SYMBOL_GPL(tcp_get_info); @@ -3225,6 +3227,7 @@ static size_t tcp_opt_stats_get_size(void) nla_total_size(sizeof(u32)) + /* TCP_NLA_SND_SSTHRESH */ nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED */ nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED_CE */ + nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_SENT */ 0; } @@ -3272,6 +3275,9 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una); nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state); + nla_put_u64_64bit(stats, TCP_NLA_BYTES_SENT, tp->bytes_sent, + TCP_NLA_PAD); + return stats; } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 490df62f26d4..861531fe0e97 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1136,6 +1136,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, if (skb->len != tcp_header_size) { tcp_event_data_sent(tp, sk); tp->data_segs_out += tcp_skb_pcount(skb); + tp->bytes_sent += skb->len - tcp_header_size; tcp_internal_pacing(sk, skb); } -- cgit v1.2.3-59-g8ed1b From fb31c9b9f6c85b1bad569ecedbde78d9e37cd87b Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Tue, 31 Jul 2018 17:46:22 -0700 Subject: tcp: add data bytes retransmitted stats Introduce a new TCP stat to record the number of bytes retransmitted (RFC4898 tcpEStatsPerfOctetsRetrans) and expose it in both tcp_info (TCP_INFO) and opt_stats (SOF_TIMESTAMPING_OPT_STATS). Signed-off-by: Wei Wang Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Acked-by: Soheil Hassas Yeganeh Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- include/linux/tcp.h | 3 +++ include/uapi/linux/tcp.h | 2 ++ net/ipv4/tcp.c | 5 +++++ net/ipv4/tcp_output.c | 1 + 4 files changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index d0798dcd2cab..fb67f9a51b95 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -333,6 +333,9 @@ struct tcp_sock { * the first SYN. */ u32 undo_marker; /* snd_una upon a new recovery episode. */ int undo_retrans; /* number of undoable retransmissions. */ + u64 bytes_retrans; /* RFC4898 tcpEStatsPerfOctetsRetrans + * Total data bytes retransmitted + */ u32 total_retrans; /* Total retransmits for entire connection */ u32 urg_seq; /* Seq of received urgent pointer */ diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 1c70ed287c3b..c31f5100b744 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -237,6 +237,7 @@ struct tcp_info { __u32 tcpi_delivered_ce; __u64 tcpi_bytes_sent; /* RFC4898 tcpEStatsPerfHCDataOctetsOut */ + __u64 tcpi_bytes_retrans; /* RFC4898 tcpEStatsPerfOctetsRetrans */ }; /* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */ @@ -260,6 +261,7 @@ enum { TCP_NLA_DELIVERED, /* Data pkts delivered incl. out-of-order */ TCP_NLA_DELIVERED_CE, /* Like above but only ones w/ CE marks */ TCP_NLA_BYTES_SENT, /* Data bytes sent including retransmission */ + TCP_NLA_BYTES_RETRANS, /* Data bytes retransmitted */ }; /* for TCP_MD5SIG socket option */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 873cb9968ff5..5ed1be88e922 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2595,6 +2595,7 @@ int tcp_disconnect(struct sock *sk, int flags) tcp_saved_syn_free(tp); tp->compressed_ack = 0; tp->bytes_sent = 0; + tp->bytes_retrans = 0; /* Clean up fastopen related fields */ tcp_free_fastopen_req(tp); @@ -3203,6 +3204,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_delivered = tp->delivered; info->tcpi_delivered_ce = tp->delivered_ce; info->tcpi_bytes_sent = tp->bytes_sent; + info->tcpi_bytes_retrans = tp->bytes_retrans; unlock_sock_fast(sk, slow); } EXPORT_SYMBOL_GPL(tcp_get_info); @@ -3228,6 +3230,7 @@ static size_t tcp_opt_stats_get_size(void) nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED */ nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED_CE */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_SENT */ + nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_RETRANS */ 0; } @@ -3277,6 +3280,8 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) nla_put_u64_64bit(stats, TCP_NLA_BYTES_SENT, tp->bytes_sent, TCP_NLA_PAD); + nla_put_u64_64bit(stats, TCP_NLA_BYTES_RETRANS, tp->bytes_retrans, + TCP_NLA_PAD); return stats; } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 861531fe0e97..50cabf7656f3 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2871,6 +2871,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); tp->total_retrans += segs; + tp->bytes_retrans += skb->len; /* make sure skb->data is aligned on arches that require it * and check if ack-trimming & collapsing extended the headroom -- cgit v1.2.3-59-g8ed1b From 7e10b6554ff2ce7f86d5d3eec3af5db8db482caa Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Tue, 31 Jul 2018 17:46:23 -0700 Subject: tcp: add dsack blocks received stats Introduce a new TCP stat to record the number of DSACK blocks received (RFC4989 tcpEStatsStackDSACKDups) and expose it in both tcp_info (TCP_INFO) and opt_stats (SOF_TIMESTAMPING_OPT_STATS). Signed-off-by: Wei Wang Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Acked-by: Soheil Hassas Yeganeh Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- include/linux/tcp.h | 3 +++ include/uapi/linux/tcp.h | 2 ++ net/ipv4/tcp.c | 4 ++++ net/ipv4/tcp_input.c | 1 + 4 files changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index fb67f9a51b95..da6281c549a5 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -188,6 +188,9 @@ struct tcp_sock { * sum(delta(snd_una)), or how many bytes * were acked. */ + u32 dsack_dups; /* RFC4898 tcpEStatsStackDSACKDups + * total number of DSACK blocks received + */ u32 snd_una; /* First byte we want an ack for */ u32 snd_sml; /* Last byte of the most recently transmitted small packet */ u32 rcv_tstamp; /* timestamp of last received ACK (for keepalives) */ diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index c31f5100b744..0e1c0aec0153 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -238,6 +238,7 @@ struct tcp_info { __u64 tcpi_bytes_sent; /* RFC4898 tcpEStatsPerfHCDataOctetsOut */ __u64 tcpi_bytes_retrans; /* RFC4898 tcpEStatsPerfOctetsRetrans */ + __u32 tcpi_dsack_dups; /* RFC4898 tcpEStatsStackDSACKDups */ }; /* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */ @@ -262,6 +263,7 @@ enum { TCP_NLA_DELIVERED_CE, /* Like above but only ones w/ CE marks */ TCP_NLA_BYTES_SENT, /* Data bytes sent including retransmission */ TCP_NLA_BYTES_RETRANS, /* Data bytes retransmitted */ + TCP_NLA_DSACK_DUPS, /* DSACK blocks received */ }; /* for TCP_MD5SIG socket option */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 5ed1be88e922..d6232b598cae 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2596,6 +2596,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->compressed_ack = 0; tp->bytes_sent = 0; tp->bytes_retrans = 0; + tp->dsack_dups = 0; /* Clean up fastopen related fields */ tcp_free_fastopen_req(tp); @@ -3205,6 +3206,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_delivered_ce = tp->delivered_ce; info->tcpi_bytes_sent = tp->bytes_sent; info->tcpi_bytes_retrans = tp->bytes_retrans; + info->tcpi_dsack_dups = tp->dsack_dups; unlock_sock_fast(sk, slow); } EXPORT_SYMBOL_GPL(tcp_get_info); @@ -3231,6 +3233,7 @@ static size_t tcp_opt_stats_get_size(void) nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED_CE */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_SENT */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_RETRANS */ + nla_total_size(sizeof(u32)) + /* TCP_NLA_DSACK_DUPS */ 0; } @@ -3282,6 +3285,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) TCP_NLA_PAD); nla_put_u64_64bit(stats, TCP_NLA_BYTES_RETRANS, tp->bytes_retrans, TCP_NLA_PAD); + nla_put_u32(stats, TCP_NLA_DSACK_DUPS, tp->dsack_dups); return stats; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d51fa358b2b1..fbc85ff7d71d 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -874,6 +874,7 @@ static void tcp_dsack_seen(struct tcp_sock *tp) { tp->rx_opt.sack_ok |= TCP_DSACK_SEEN; tp->rack.dsack_seen = 1; + tp->dsack_dups++; } /* It's reordering when higher sequence was delivered (i.e. sacked) before -- cgit v1.2.3-59-g8ed1b From 7ec65372ca534217b53fd208500cf7aac223a383 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Tue, 31 Jul 2018 17:46:24 -0700 Subject: tcp: add stat of data packet reordering events Introduce a new TCP stats to record the number of reordering events seen and expose it in both tcp_info (TCP_INFO) and opt_stats (SOF_TIMESTAMPING_OPT_STATS). Application can use this stats to track the frequency of the reordering events in addition to the existing reordering stats which tracks the magnitude of the latest reordering event. Note: this new stats tracks reordering events triggered by ACKs, which could often be fewer than the actual number of packets being delivered out-of-order. Signed-off-by: Wei Wang Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Acked-by: Soheil Hassas Yeganeh Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- include/linux/tcp.h | 4 ++-- include/uapi/linux/tcp.h | 2 ++ net/ipv4/tcp.c | 4 ++++ net/ipv4/tcp_input.c | 3 ++- net/ipv4/tcp_recovery.c | 2 +- 5 files changed, 11 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index da6281c549a5..263e37271afd 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -220,8 +220,7 @@ struct tcp_sock { #define TCP_RACK_RECOVERY_THRESH 16 u8 reo_wnd_persist:5, /* No. of recovery since last adj */ dsack_seen:1, /* Whether DSACK seen after last adj */ - advanced:1, /* mstamp advanced since last lost marking */ - reord:1; /* reordering detected */ + advanced:1; /* mstamp advanced since last lost marking */ } rack; u16 advmss; /* Advertised MSS */ u8 compressed_ack; @@ -267,6 +266,7 @@ struct tcp_sock { u8 ecn_flags; /* ECN status bits. */ u8 keepalive_probes; /* num of allowed keep alive probes */ u32 reordering; /* Packet reordering metric. */ + u32 reord_seen; /* number of data packet reordering events */ u32 snd_up; /* Urgent pointer */ /* diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 0e1c0aec0153..e02d31986ff9 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -239,6 +239,7 @@ struct tcp_info { __u64 tcpi_bytes_sent; /* RFC4898 tcpEStatsPerfHCDataOctetsOut */ __u64 tcpi_bytes_retrans; /* RFC4898 tcpEStatsPerfOctetsRetrans */ __u32 tcpi_dsack_dups; /* RFC4898 tcpEStatsStackDSACKDups */ + __u32 tcpi_reord_seen; /* reordering events seen */ }; /* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */ @@ -264,6 +265,7 @@ enum { TCP_NLA_BYTES_SENT, /* Data bytes sent including retransmission */ TCP_NLA_BYTES_RETRANS, /* Data bytes retransmitted */ TCP_NLA_DSACK_DUPS, /* DSACK blocks received */ + TCP_NLA_REORD_SEEN, /* reordering events seen */ }; /* for TCP_MD5SIG socket option */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index d6232b598cae..31fa1c080f28 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2597,6 +2597,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->bytes_sent = 0; tp->bytes_retrans = 0; tp->dsack_dups = 0; + tp->reord_seen = 0; /* Clean up fastopen related fields */ tcp_free_fastopen_req(tp); @@ -3207,6 +3208,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_bytes_sent = tp->bytes_sent; info->tcpi_bytes_retrans = tp->bytes_retrans; info->tcpi_dsack_dups = tp->dsack_dups; + info->tcpi_reord_seen = tp->reord_seen; unlock_sock_fast(sk, slow); } EXPORT_SYMBOL_GPL(tcp_get_info); @@ -3234,6 +3236,7 @@ static size_t tcp_opt_stats_get_size(void) nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_SENT */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_RETRANS */ nla_total_size(sizeof(u32)) + /* TCP_NLA_DSACK_DUPS */ + nla_total_size(sizeof(u32)) + /* TCP_NLA_REORD_SEEN */ 0; } @@ -3286,6 +3289,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) nla_put_u64_64bit(stats, TCP_NLA_BYTES_RETRANS, tp->bytes_retrans, TCP_NLA_PAD); nla_put_u32(stats, TCP_NLA_DSACK_DUPS, tp->dsack_dups); + nla_put_u32(stats, TCP_NLA_REORD_SEEN, tp->reord_seen); return stats; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index fbc85ff7d71d..3d6156f07a8d 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -906,8 +906,8 @@ static void tcp_check_sack_reordering(struct sock *sk, const u32 low_seq, sock_net(sk)->ipv4.sysctl_tcp_max_reordering); } - tp->rack.reord = 1; /* This exciting event is worth to be remembered. 8) */ + tp->reord_seen++; NET_INC_STATS(sock_net(sk), ts ? LINUX_MIB_TCPTSREORDER : LINUX_MIB_TCPSACKREORDER); } @@ -1871,6 +1871,7 @@ static void tcp_check_reno_reordering(struct sock *sk, const int addend) tp->reordering = min_t(u32, tp->packets_out + addend, sock_net(sk)->ipv4.sysctl_tcp_max_reordering); + tp->reord_seen++; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRENOREORDER); } diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index 71593e4400ab..c81aadff769b 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -25,7 +25,7 @@ static u32 tcp_rack_reo_wnd(const struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - if (!tp->rack.reord) { + if (!tp->reord_seen) { /* If reordering has not been observed, be aggressive during * the recovery or starting the recovery by DUPACK threshold. */ -- cgit v1.2.3-59-g8ed1b From 0a4c58f5702858822621fa1177c7d3475f181ccb Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 2 Aug 2018 14:27:17 -0700 Subject: bpf: add ability to charge bpf maps memory dynamically This commits extends existing bpf maps memory charging API to support dynamic charging/uncharging. This is required to account memory used by maps, if all entries are created dynamically after the map initialization. Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 2 ++ kernel/bpf/syscall.c | 58 ++++++++++++++++++++++++++++++++++++++-------------- 2 files changed, 45 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5b5ad95cf339..5a4a256473c3 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -435,6 +435,8 @@ struct bpf_map * __must_check bpf_map_inc(struct bpf_map *map, bool uref); void bpf_map_put_with_uref(struct bpf_map *map); void bpf_map_put(struct bpf_map *map); int bpf_map_precharge_memlock(u32 pages); +int bpf_map_charge_memlock(struct bpf_map *map, u32 pages); +void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages); void *bpf_map_area_alloc(size_t size, int numa_node); void bpf_map_area_free(void *base); void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a31a1ba0f8ea..7958252a4d29 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -181,32 +181,60 @@ int bpf_map_precharge_memlock(u32 pages) return 0; } -static int bpf_map_charge_memlock(struct bpf_map *map) +static int bpf_charge_memlock(struct user_struct *user, u32 pages) { - struct user_struct *user = get_current_user(); - unsigned long memlock_limit; + unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + if (atomic_long_add_return(pages, &user->locked_vm) > memlock_limit) { + atomic_long_sub(pages, &user->locked_vm); + return -EPERM; + } + return 0; +} - atomic_long_add(map->pages, &user->locked_vm); +static void bpf_uncharge_memlock(struct user_struct *user, u32 pages) +{ + atomic_long_sub(pages, &user->locked_vm); +} + +static int bpf_map_init_memlock(struct bpf_map *map) +{ + struct user_struct *user = get_current_user(); + int ret; - if (atomic_long_read(&user->locked_vm) > memlock_limit) { - atomic_long_sub(map->pages, &user->locked_vm); + ret = bpf_charge_memlock(user, map->pages); + if (ret) { free_uid(user); - return -EPERM; + return ret; } map->user = user; - return 0; + return ret; } -static void bpf_map_uncharge_memlock(struct bpf_map *map) +static void bpf_map_release_memlock(struct bpf_map *map) { struct user_struct *user = map->user; - - atomic_long_sub(map->pages, &user->locked_vm); + bpf_uncharge_memlock(user, map->pages); free_uid(user); } +int bpf_map_charge_memlock(struct bpf_map *map, u32 pages) +{ + int ret; + + ret = bpf_charge_memlock(map->user, pages); + if (ret) + return ret; + map->pages += pages; + return ret; +} + +void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages) +{ + bpf_uncharge_memlock(map->user, pages); + map->pages -= pages; +} + static int bpf_map_alloc_id(struct bpf_map *map) { int id; @@ -256,7 +284,7 @@ static void bpf_map_free_deferred(struct work_struct *work) { struct bpf_map *map = container_of(work, struct bpf_map, work); - bpf_map_uncharge_memlock(map); + bpf_map_release_memlock(map); security_bpf_map_free(map); /* implementation dependent freeing */ map->ops->map_free(map); @@ -492,7 +520,7 @@ static int map_create(union bpf_attr *attr) if (err) goto free_map_nouncharge; - err = bpf_map_charge_memlock(map); + err = bpf_map_init_memlock(map); if (err) goto free_map_sec; @@ -515,7 +543,7 @@ static int map_create(union bpf_attr *attr) return err; free_map: - bpf_map_uncharge_memlock(map); + bpf_map_release_memlock(map); free_map_sec: security_bpf_map_free(map); free_map_nouncharge: -- cgit v1.2.3-59-g8ed1b From de9cbbaadba5adf88a19e46df61f7054000838f6 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 2 Aug 2018 14:27:18 -0700 Subject: bpf: introduce cgroup storage maps This commit introduces BPF_MAP_TYPE_CGROUP_STORAGE maps: a special type of maps which are implementing the cgroup storage. >From the userspace point of view it's almost a generic hash map with the (cgroup inode id, attachment type) pair used as a key. The only difference is that some operations are restricted: 1) a user can't create new entries, 2) a user can't remove existing entries. The lookup from userspace is o(log(n)). Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- include/linux/bpf-cgroup.h | 38 +++++ include/linux/bpf.h | 1 + include/linux/bpf_types.h | 3 + include/uapi/linux/bpf.h | 6 + kernel/bpf/Makefile | 1 + kernel/bpf/local_storage.c | 376 +++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 3 + kernel/bpf/verifier.c | 12 ++ 8 files changed, 440 insertions(+) create mode 100644 kernel/bpf/local_storage.c (limited to 'include/linux') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index d50c2f0a655a..7d00d58869ed 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -4,19 +4,39 @@ #include #include +#include #include struct sock; struct sockaddr; struct cgroup; struct sk_buff; +struct bpf_map; +struct bpf_prog; struct bpf_sock_ops_kern; +struct bpf_cgroup_storage; #ifdef CONFIG_CGROUP_BPF extern struct static_key_false cgroup_bpf_enabled_key; #define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key) +struct bpf_cgroup_storage_map; + +struct bpf_storage_buffer { + struct rcu_head rcu; + char data[0]; +}; + +struct bpf_cgroup_storage { + struct bpf_storage_buffer *buf; + struct bpf_cgroup_storage_map *map; + struct bpf_cgroup_storage_key key; + struct list_head list; + struct rb_node node; + struct rcu_head rcu; +}; + struct bpf_prog_list { struct list_head node; struct bpf_prog *prog; @@ -77,6 +97,15 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, short access, enum bpf_attach_type type); +struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog); +void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage); +void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, + struct cgroup *cgroup, + enum bpf_attach_type type); +void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage); +int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *map); +void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *map); + /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */ #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb) \ ({ \ @@ -221,6 +250,15 @@ static inline int cgroup_bpf_prog_query(const union bpf_attr *attr, return -EINVAL; } +static inline int bpf_cgroup_storage_assign(struct bpf_prog *prog, + struct bpf_map *map) { return 0; } +static inline void bpf_cgroup_storage_release(struct bpf_prog *prog, + struct bpf_map *map) {} +static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc( + struct bpf_prog *prog) { return 0; } +static inline void bpf_cgroup_storage_free( + struct bpf_cgroup_storage *storage) {} + #define cgroup_bpf_enabled (0) #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0) #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5a4a256473c3..9d1e4727495e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -282,6 +282,7 @@ struct bpf_prog_aux { struct bpf_prog *prog; struct user_struct *user; u64 load_time; /* ns since boottime */ + struct bpf_map *cgroup_storage; char name[BPF_OBJ_NAME_LEN]; #ifdef CONFIG_SECURITY void *security; diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index c5700c2d5549..add08be53b6f 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -37,6 +37,9 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_PERF_EVENT_ARRAY, perf_event_array_map_ops) #ifdef CONFIG_CGROUPS BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_ARRAY, cgroup_array_map_ops) #endif +#ifdef CONFIG_CGROUP_BPF +BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_STORAGE, cgroup_storage_map_ops) +#endif BPF_MAP_TYPE(BPF_MAP_TYPE_HASH, htab_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_HASH, htab_percpu_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_LRU_HASH, htab_lru_map_ops) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0ebaaf7f3568..b10118ee5afe 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -75,6 +75,11 @@ struct bpf_lpm_trie_key { __u8 data[0]; /* Arbitrary size */ }; +struct bpf_cgroup_storage_key { + __u64 cgroup_inode_id; /* cgroup inode id */ + __u32 attach_type; /* program attach type */ +}; + /* BPF syscall commands, see bpf(2) man-page for details. */ enum bpf_cmd { BPF_MAP_CREATE, @@ -120,6 +125,7 @@ enum bpf_map_type { BPF_MAP_TYPE_CPUMAP, BPF_MAP_TYPE_XSKMAP, BPF_MAP_TYPE_SOCKHASH, + BPF_MAP_TYPE_CGROUP_STORAGE, }; enum bpf_prog_type { diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index f27f5496d6fe..e8906cbad81f 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -3,6 +3,7 @@ obj-y := core.o obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o +obj-$(CONFIG_BPF_SYSCALL) += local_storage.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o obj-$(CONFIG_BPF_SYSCALL) += btf.o ifeq ($(CONFIG_NET),y) diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c new file mode 100644 index 000000000000..f23d3fdeba23 --- /dev/null +++ b/kernel/bpf/local_storage.c @@ -0,0 +1,376 @@ +//SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_CGROUP_BPF + +#define LOCAL_STORAGE_CREATE_FLAG_MASK \ + (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) + +struct bpf_cgroup_storage_map { + struct bpf_map map; + + spinlock_t lock; + struct bpf_prog *prog; + struct rb_root root; + struct list_head list; +}; + +static struct bpf_cgroup_storage_map *map_to_storage(struct bpf_map *map) +{ + return container_of(map, struct bpf_cgroup_storage_map, map); +} + +static int bpf_cgroup_storage_key_cmp( + const struct bpf_cgroup_storage_key *key1, + const struct bpf_cgroup_storage_key *key2) +{ + if (key1->cgroup_inode_id < key2->cgroup_inode_id) + return -1; + else if (key1->cgroup_inode_id > key2->cgroup_inode_id) + return 1; + else if (key1->attach_type < key2->attach_type) + return -1; + else if (key1->attach_type > key2->attach_type) + return 1; + return 0; +} + +static struct bpf_cgroup_storage *cgroup_storage_lookup( + struct bpf_cgroup_storage_map *map, struct bpf_cgroup_storage_key *key, + bool locked) +{ + struct rb_root *root = &map->root; + struct rb_node *node; + + if (!locked) + spin_lock_bh(&map->lock); + + node = root->rb_node; + while (node) { + struct bpf_cgroup_storage *storage; + + storage = container_of(node, struct bpf_cgroup_storage, node); + + switch (bpf_cgroup_storage_key_cmp(key, &storage->key)) { + case -1: + node = node->rb_left; + break; + case 1: + node = node->rb_right; + break; + default: + if (!locked) + spin_unlock_bh(&map->lock); + return storage; + } + } + + if (!locked) + spin_unlock_bh(&map->lock); + + return NULL; +} + +static int cgroup_storage_insert(struct bpf_cgroup_storage_map *map, + struct bpf_cgroup_storage *storage) +{ + struct rb_root *root = &map->root; + struct rb_node **new = &(root->rb_node), *parent = NULL; + + while (*new) { + struct bpf_cgroup_storage *this; + + this = container_of(*new, struct bpf_cgroup_storage, node); + + parent = *new; + switch (bpf_cgroup_storage_key_cmp(&storage->key, &this->key)) { + case -1: + new = &((*new)->rb_left); + break; + case 1: + new = &((*new)->rb_right); + break; + default: + return -EEXIST; + } + } + + rb_link_node(&storage->node, parent, new); + rb_insert_color(&storage->node, root); + + return 0; +} + +static void *cgroup_storage_lookup_elem(struct bpf_map *_map, void *_key) +{ + struct bpf_cgroup_storage_map *map = map_to_storage(_map); + struct bpf_cgroup_storage_key *key = _key; + struct bpf_cgroup_storage *storage; + + storage = cgroup_storage_lookup(map, key, false); + if (!storage) + return NULL; + + return &READ_ONCE(storage->buf)->data[0]; +} + +static int cgroup_storage_update_elem(struct bpf_map *map, void *_key, + void *value, u64 flags) +{ + struct bpf_cgroup_storage_key *key = _key; + struct bpf_cgroup_storage *storage; + struct bpf_storage_buffer *new; + + if (flags & BPF_NOEXIST) + return -EINVAL; + + storage = cgroup_storage_lookup((struct bpf_cgroup_storage_map *)map, + key, false); + if (!storage) + return -ENOENT; + + new = kmalloc_node(sizeof(struct bpf_storage_buffer) + + map->value_size, __GFP_ZERO | GFP_USER, + map->numa_node); + if (!new) + return -ENOMEM; + + memcpy(&new->data[0], value, map->value_size); + + new = xchg(&storage->buf, new); + kfree_rcu(new, rcu); + + return 0; +} + +static int cgroup_storage_get_next_key(struct bpf_map *_map, void *_key, + void *_next_key) +{ + struct bpf_cgroup_storage_map *map = map_to_storage(_map); + struct bpf_cgroup_storage_key *key = _key; + struct bpf_cgroup_storage_key *next = _next_key; + struct bpf_cgroup_storage *storage; + + spin_lock_bh(&map->lock); + + if (list_empty(&map->list)) + goto enoent; + + if (key) { + storage = cgroup_storage_lookup(map, key, true); + if (!storage) + goto enoent; + + storage = list_next_entry(storage, list); + if (!storage) + goto enoent; + } else { + storage = list_first_entry(&map->list, + struct bpf_cgroup_storage, list); + } + + spin_unlock_bh(&map->lock); + next->attach_type = storage->key.attach_type; + next->cgroup_inode_id = storage->key.cgroup_inode_id; + return 0; + +enoent: + spin_unlock_bh(&map->lock); + return -ENOENT; +} + +static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) +{ + int numa_node = bpf_map_attr_numa_node(attr); + struct bpf_cgroup_storage_map *map; + + if (attr->key_size != sizeof(struct bpf_cgroup_storage_key)) + return ERR_PTR(-EINVAL); + + if (attr->value_size > PAGE_SIZE) + return ERR_PTR(-E2BIG); + + if (attr->map_flags & ~LOCAL_STORAGE_CREATE_FLAG_MASK) + /* reserved bits should not be used */ + return ERR_PTR(-EINVAL); + + if (attr->max_entries) + /* max_entries is not used and enforced to be 0 */ + return ERR_PTR(-EINVAL); + + map = kmalloc_node(sizeof(struct bpf_cgroup_storage_map), + __GFP_ZERO | GFP_USER, numa_node); + if (!map) + return ERR_PTR(-ENOMEM); + + map->map.pages = round_up(sizeof(struct bpf_cgroup_storage_map), + PAGE_SIZE) >> PAGE_SHIFT; + + /* copy mandatory map attributes */ + bpf_map_init_from_attr(&map->map, attr); + + spin_lock_init(&map->lock); + map->root = RB_ROOT; + INIT_LIST_HEAD(&map->list); + + return &map->map; +} + +static void cgroup_storage_map_free(struct bpf_map *_map) +{ + struct bpf_cgroup_storage_map *map = map_to_storage(_map); + + WARN_ON(!RB_EMPTY_ROOT(&map->root)); + WARN_ON(!list_empty(&map->list)); + + kfree(map); +} + +static int cgroup_storage_delete_elem(struct bpf_map *map, void *key) +{ + return -EINVAL; +} + +const struct bpf_map_ops cgroup_storage_map_ops = { + .map_alloc = cgroup_storage_map_alloc, + .map_free = cgroup_storage_map_free, + .map_get_next_key = cgroup_storage_get_next_key, + .map_lookup_elem = cgroup_storage_lookup_elem, + .map_update_elem = cgroup_storage_update_elem, + .map_delete_elem = cgroup_storage_delete_elem, +}; + +int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *_map) +{ + struct bpf_cgroup_storage_map *map = map_to_storage(_map); + int ret = -EBUSY; + + spin_lock_bh(&map->lock); + + if (map->prog && map->prog != prog) + goto unlock; + if (prog->aux->cgroup_storage && prog->aux->cgroup_storage != _map) + goto unlock; + + map->prog = prog; + prog->aux->cgroup_storage = _map; + ret = 0; +unlock: + spin_unlock_bh(&map->lock); + + return ret; +} + +void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *_map) +{ + struct bpf_cgroup_storage_map *map = map_to_storage(_map); + + spin_lock_bh(&map->lock); + if (map->prog == prog) { + WARN_ON(prog->aux->cgroup_storage != _map); + map->prog = NULL; + prog->aux->cgroup_storage = NULL; + } + spin_unlock_bh(&map->lock); +} + +struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog) +{ + struct bpf_cgroup_storage *storage; + struct bpf_map *map; + u32 pages; + + map = prog->aux->cgroup_storage; + if (!map) + return NULL; + + pages = round_up(sizeof(struct bpf_cgroup_storage) + + sizeof(struct bpf_storage_buffer) + + map->value_size, PAGE_SIZE) >> PAGE_SHIFT; + if (bpf_map_charge_memlock(map, pages)) + return ERR_PTR(-EPERM); + + storage = kmalloc_node(sizeof(struct bpf_cgroup_storage), + __GFP_ZERO | GFP_USER, map->numa_node); + if (!storage) { + bpf_map_uncharge_memlock(map, pages); + return ERR_PTR(-ENOMEM); + } + + storage->buf = kmalloc_node(sizeof(struct bpf_storage_buffer) + + map->value_size, __GFP_ZERO | GFP_USER, + map->numa_node); + if (!storage->buf) { + bpf_map_uncharge_memlock(map, pages); + kfree(storage); + return ERR_PTR(-ENOMEM); + } + + storage->map = (struct bpf_cgroup_storage_map *)map; + + return storage; +} + +void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage) +{ + u32 pages; + struct bpf_map *map; + + if (!storage) + return; + + map = &storage->map->map; + pages = round_up(sizeof(struct bpf_cgroup_storage) + + sizeof(struct bpf_storage_buffer) + + map->value_size, PAGE_SIZE) >> PAGE_SHIFT; + bpf_map_uncharge_memlock(map, pages); + + kfree_rcu(storage->buf, rcu); + kfree_rcu(storage, rcu); +} + +void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, + struct cgroup *cgroup, + enum bpf_attach_type type) +{ + struct bpf_cgroup_storage_map *map; + + if (!storage) + return; + + storage->key.attach_type = type; + storage->key.cgroup_inode_id = cgroup->kn->id.id; + + map = storage->map; + + spin_lock_bh(&map->lock); + WARN_ON(cgroup_storage_insert(map, storage)); + list_add(&storage->list, &map->list); + spin_unlock_bh(&map->lock); +} + +void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage) +{ + struct bpf_cgroup_storage_map *map; + struct rb_root *root; + + if (!storage) + return; + + map = storage->map; + + spin_lock_bh(&map->lock); + root = &map->root; + rb_erase(&storage->node, root); + + list_del(&storage->list); + spin_unlock_bh(&map->lock); +} + +#endif diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 7958252a4d29..5af4e9e2722d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -957,6 +957,9 @@ static void free_used_maps(struct bpf_prog_aux *aux) { int i; + if (aux->cgroup_storage) + bpf_cgroup_storage_release(aux->prog, aux->cgroup_storage); + for (i = 0; i < aux->used_map_cnt; i++) bpf_map_put(aux->used_maps[i]); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e948303a0ea8..7e75434a9e54 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5154,6 +5154,14 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) } env->used_maps[env->used_map_cnt++] = map; + if (map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE && + bpf_cgroup_storage_assign(env->prog, map)) { + verbose(env, + "only one cgroup storage is allowed\n"); + fdput(f); + return -EBUSY; + } + fdput(f); next_insn: insn++; @@ -5180,6 +5188,10 @@ static void release_maps(struct bpf_verifier_env *env) { int i; + if (env->prog->aux->cgroup_storage) + bpf_cgroup_storage_release(env->prog, + env->prog->aux->cgroup_storage); + for (i = 0; i < env->used_map_cnt; i++) bpf_map_put(env->used_maps[i]); } -- cgit v1.2.3-59-g8ed1b From aa0ad5b0391e268bdecf6cda31268388844f8afd Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 2 Aug 2018 14:27:19 -0700 Subject: bpf: pass a pointer to a cgroup storage using pcpu variable This commit introduces the bpf_cgroup_storage_set() helper, which will be used to pass a pointer to a cgroup storage to the bpf helper. Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- include/linux/bpf-cgroup.h | 15 +++++++++++++++ kernel/bpf/local_storage.c | 2 ++ 2 files changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 7d00d58869ed..9a144ddbbc8f 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -4,6 +4,7 @@ #include #include +#include #include #include @@ -21,6 +22,8 @@ struct bpf_cgroup_storage; extern struct static_key_false cgroup_bpf_enabled_key; #define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key) +DECLARE_PER_CPU(void*, bpf_cgroup_storage); + struct bpf_cgroup_storage_map; struct bpf_storage_buffer { @@ -97,6 +100,17 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, short access, enum bpf_attach_type type); +static inline void bpf_cgroup_storage_set(struct bpf_cgroup_storage *storage) +{ + struct bpf_storage_buffer *buf; + + if (!storage) + return; + + buf = READ_ONCE(storage->buf); + this_cpu_write(bpf_cgroup_storage, &buf->data[0]); +} + struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog); void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage); void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, @@ -250,6 +264,7 @@ static inline int cgroup_bpf_prog_query(const union bpf_attr *attr, return -EINVAL; } +static inline void bpf_cgroup_storage_set(struct bpf_cgroup_storage *storage) {} static inline int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *map) { return 0; } static inline void bpf_cgroup_storage_release(struct bpf_prog *prog, diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index f23d3fdeba23..fc4e37f68f2a 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -7,6 +7,8 @@ #include #include +DEFINE_PER_CPU(void*, bpf_cgroup_storage); + #ifdef CONFIG_CGROUP_BPF #define LOCAL_STORAGE_CREATE_FLAG_MASK \ -- cgit v1.2.3-59-g8ed1b From d7bf2c10af053191e931b58704cd862fccb7f9de Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 2 Aug 2018 14:27:20 -0700 Subject: bpf: allocate cgroup storage entries on attaching bpf programs If a bpf program is using cgroup local storage, allocate a bpf_cgroup_storage structure automatically on attaching the program to a cgroup and save the pointer into the corresponding bpf_prog_list entry. Analogically, release the cgroup local storage on detaching of the bpf program. Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- include/linux/bpf-cgroup.h | 1 + kernel/bpf/cgroup.c | 35 +++++++++++++++++++++++++++++++---- 2 files changed, 32 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 9a144ddbbc8f..f91b0f8ff3a9 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -43,6 +43,7 @@ struct bpf_cgroup_storage { struct bpf_prog_list { struct list_head node; struct bpf_prog *prog; + struct bpf_cgroup_storage *storage; }; struct bpf_prog_array; diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index badabb0b435c..935274c86bfe 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -34,6 +34,8 @@ void cgroup_bpf_put(struct cgroup *cgrp) list_for_each_entry_safe(pl, tmp, progs, node) { list_del(&pl->node); bpf_prog_put(pl->prog); + bpf_cgroup_storage_unlink(pl->storage); + bpf_cgroup_storage_free(pl->storage); kfree(pl); static_branch_dec(&cgroup_bpf_enabled_key); } @@ -188,6 +190,7 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, { struct list_head *progs = &cgrp->bpf.progs[type]; struct bpf_prog *old_prog = NULL; + struct bpf_cgroup_storage *storage, *old_storage = NULL; struct cgroup_subsys_state *css; struct bpf_prog_list *pl; bool pl_was_allocated; @@ -210,31 +213,47 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS) return -E2BIG; + storage = bpf_cgroup_storage_alloc(prog); + if (IS_ERR(storage)) + return -ENOMEM; + if (flags & BPF_F_ALLOW_MULTI) { - list_for_each_entry(pl, progs, node) - if (pl->prog == prog) + list_for_each_entry(pl, progs, node) { + if (pl->prog == prog) { /* disallow attaching the same prog twice */ + bpf_cgroup_storage_free(storage); return -EINVAL; + } + } pl = kmalloc(sizeof(*pl), GFP_KERNEL); - if (!pl) + if (!pl) { + bpf_cgroup_storage_free(storage); return -ENOMEM; + } + pl_was_allocated = true; pl->prog = prog; + pl->storage = storage; list_add_tail(&pl->node, progs); } else { if (list_empty(progs)) { pl = kmalloc(sizeof(*pl), GFP_KERNEL); - if (!pl) + if (!pl) { + bpf_cgroup_storage_free(storage); return -ENOMEM; + } pl_was_allocated = true; list_add_tail(&pl->node, progs); } else { pl = list_first_entry(progs, typeof(*pl), node); old_prog = pl->prog; + old_storage = pl->storage; + bpf_cgroup_storage_unlink(old_storage); pl_was_allocated = false; } pl->prog = prog; + pl->storage = storage; } cgrp->bpf.flags[type] = flags; @@ -257,10 +276,13 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, } static_branch_inc(&cgroup_bpf_enabled_key); + if (old_storage) + bpf_cgroup_storage_free(old_storage); if (old_prog) { bpf_prog_put(old_prog); static_branch_dec(&cgroup_bpf_enabled_key); } + bpf_cgroup_storage_link(storage, cgrp, type); return 0; cleanup: @@ -276,6 +298,9 @@ cleanup: /* and cleanup the prog list */ pl->prog = old_prog; + bpf_cgroup_storage_free(pl->storage); + pl->storage = old_storage; + bpf_cgroup_storage_link(old_storage, cgrp, type); if (pl_was_allocated) { list_del(&pl->node); kfree(pl); @@ -356,6 +381,8 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, /* now can actually delete it from this cgroup list */ list_del(&pl->node); + bpf_cgroup_storage_unlink(pl->storage); + bpf_cgroup_storage_free(pl->storage); kfree(pl); if (list_empty(progs)) /* last program was detached, reset flags to zero */ -- cgit v1.2.3-59-g8ed1b From 394e40a29788820c9c0526b1c3497c9e0ec2a126 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 2 Aug 2018 14:27:21 -0700 Subject: bpf: extend bpf_prog_array to store pointers to the cgroup storage This patch converts bpf_prog_array from an array of prog pointers to the array of struct bpf_prog_array_item elements. This allows to save a cgroup storage pointer for each bpf program efficiently attached to a cgroup. Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- drivers/media/rc/bpf-lirc.c | 10 +++--- include/linux/bpf.h | 19 ++++++++---- kernel/bpf/cgroup.c | 21 +++++++------ kernel/bpf/core.c | 76 +++++++++++++++++++++++---------------------- 4 files changed, 70 insertions(+), 56 deletions(-) (limited to 'include/linux') diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c index fcfab6635f9c..8c26df9b96c1 100644 --- a/drivers/media/rc/bpf-lirc.c +++ b/drivers/media/rc/bpf-lirc.c @@ -195,14 +195,16 @@ void lirc_bpf_run(struct rc_dev *rcdev, u32 sample) */ void lirc_bpf_free(struct rc_dev *rcdev) { - struct bpf_prog **progs; + struct bpf_prog_array_item *item; if (!rcdev->raw->progs) return; - progs = rcu_dereference(rcdev->raw->progs)->progs; - while (*progs) - bpf_prog_put(*progs++); + item = rcu_dereference(rcdev->raw->progs)->items; + while (item->prog) { + bpf_prog_put(item->prog); + item++; + } bpf_prog_array_free(rcdev->raw->progs); } diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9d1e4727495e..16be67888c30 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -349,9 +349,14 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, * The 'struct bpf_prog_array *' should only be replaced with xchg() * since other cpus are walking the array of pointers in parallel. */ +struct bpf_prog_array_item { + struct bpf_prog *prog; + struct bpf_cgroup_storage *cgroup_storage; +}; + struct bpf_prog_array { struct rcu_head rcu; - struct bpf_prog *progs[0]; + struct bpf_prog_array_item items[0]; }; struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags); @@ -372,7 +377,8 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, #define __BPF_PROG_RUN_ARRAY(array, ctx, func, check_non_null) \ ({ \ - struct bpf_prog **_prog, *__prog; \ + struct bpf_prog_array_item *_item; \ + struct bpf_prog *_prog; \ struct bpf_prog_array *_array; \ u32 _ret = 1; \ preempt_disable(); \ @@ -380,10 +386,11 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, _array = rcu_dereference(array); \ if (unlikely(check_non_null && !_array))\ goto _out; \ - _prog = _array->progs; \ - while ((__prog = READ_ONCE(*_prog))) { \ - _ret &= func(__prog, ctx); \ - _prog++; \ + _item = &_array->items[0]; \ + while ((_prog = READ_ONCE(_item->prog))) { \ + bpf_cgroup_storage_set(_item->cgroup_storage); \ + _ret &= func(_prog, ctx); \ + _item++; \ } \ _out: \ rcu_read_unlock(); \ diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 935274c86bfe..ddfa6cc13e57 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -117,15 +117,18 @@ static int compute_effective_progs(struct cgroup *cgrp, cnt = 0; p = cgrp; do { - if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI)) - list_for_each_entry(pl, - &p->bpf.progs[type], node) { - if (!pl->prog) - continue; - progs->progs[cnt++] = pl->prog; - } - p = cgroup_parent(p); - } while (p); + if (cnt > 0 && !(p->bpf.flags[type] & BPF_F_ALLOW_MULTI)) + continue; + + list_for_each_entry(pl, &p->bpf.progs[type], node) { + if (!pl->prog) + continue; + + progs->items[cnt].prog = pl->prog; + progs->items[cnt].cgroup_storage = pl->storage; + cnt++; + } + } while ((p = cgroup_parent(p))); rcu_assign_pointer(*array, progs); return 0; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 253aa8e79c7b..9abcf25ebf9f 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1542,7 +1542,8 @@ struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags) { if (prog_cnt) return kzalloc(sizeof(struct bpf_prog_array) + - sizeof(struct bpf_prog *) * (prog_cnt + 1), + sizeof(struct bpf_prog_array_item) * + (prog_cnt + 1), flags); return &empty_prog_array.hdr; @@ -1556,43 +1557,45 @@ void bpf_prog_array_free(struct bpf_prog_array __rcu *progs) kfree_rcu(progs, rcu); } -int bpf_prog_array_length(struct bpf_prog_array __rcu *progs) +int bpf_prog_array_length(struct bpf_prog_array __rcu *array) { - struct bpf_prog **prog; + struct bpf_prog_array_item *item; u32 cnt = 0; rcu_read_lock(); - prog = rcu_dereference(progs)->progs; - for (; *prog; prog++) - if (*prog != &dummy_bpf_prog.prog) + item = rcu_dereference(array)->items; + for (; item->prog; item++) + if (item->prog != &dummy_bpf_prog.prog) cnt++; rcu_read_unlock(); return cnt; } -static bool bpf_prog_array_copy_core(struct bpf_prog **prog, + +static bool bpf_prog_array_copy_core(struct bpf_prog_array __rcu *array, u32 *prog_ids, u32 request_cnt) { + struct bpf_prog_array_item *item; int i = 0; - for (; *prog; prog++) { - if (*prog == &dummy_bpf_prog.prog) + item = rcu_dereference(array)->items; + for (; item->prog; item++) { + if (item->prog == &dummy_bpf_prog.prog) continue; - prog_ids[i] = (*prog)->aux->id; + prog_ids[i] = item->prog->aux->id; if (++i == request_cnt) { - prog++; + item++; break; } } - return !!(*prog); + return !!(item->prog); } -int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, +int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *array, __u32 __user *prog_ids, u32 cnt) { - struct bpf_prog **prog; unsigned long err = 0; bool nospc; u32 *ids; @@ -1611,8 +1614,7 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, if (!ids) return -ENOMEM; rcu_read_lock(); - prog = rcu_dereference(progs)->progs; - nospc = bpf_prog_array_copy_core(prog, ids, cnt); + nospc = bpf_prog_array_copy_core(array, ids, cnt); rcu_read_unlock(); err = copy_to_user(prog_ids, ids, cnt * sizeof(u32)); kfree(ids); @@ -1623,14 +1625,14 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, return 0; } -void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *progs, +void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *array, struct bpf_prog *old_prog) { - struct bpf_prog **prog = progs->progs; + struct bpf_prog_array_item *item = array->items; - for (; *prog; prog++) - if (*prog == old_prog) { - WRITE_ONCE(*prog, &dummy_bpf_prog.prog); + for (; item->prog; item++) + if (item->prog == old_prog) { + WRITE_ONCE(item->prog, &dummy_bpf_prog.prog); break; } } @@ -1641,7 +1643,7 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, struct bpf_prog_array **new_array) { int new_prog_cnt, carry_prog_cnt = 0; - struct bpf_prog **existing_prog; + struct bpf_prog_array_item *existing; struct bpf_prog_array *array; bool found_exclude = false; int new_prog_idx = 0; @@ -1650,15 +1652,15 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, * the new array. */ if (old_array) { - existing_prog = old_array->progs; - for (; *existing_prog; existing_prog++) { - if (*existing_prog == exclude_prog) { + existing = old_array->items; + for (; existing->prog; existing++) { + if (existing->prog == exclude_prog) { found_exclude = true; continue; } - if (*existing_prog != &dummy_bpf_prog.prog) + if (existing->prog != &dummy_bpf_prog.prog) carry_prog_cnt++; - if (*existing_prog == include_prog) + if (existing->prog == include_prog) return -EEXIST; } } @@ -1684,15 +1686,17 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, /* Fill in the new prog array */ if (carry_prog_cnt) { - existing_prog = old_array->progs; - for (; *existing_prog; existing_prog++) - if (*existing_prog != exclude_prog && - *existing_prog != &dummy_bpf_prog.prog) - array->progs[new_prog_idx++] = *existing_prog; + existing = old_array->items; + for (; existing->prog; existing++) + if (existing->prog != exclude_prog && + existing->prog != &dummy_bpf_prog.prog) { + array->items[new_prog_idx++].prog = + existing->prog; + } } if (include_prog) - array->progs[new_prog_idx++] = include_prog; - array->progs[new_prog_idx] = NULL; + array->items[new_prog_idx++].prog = include_prog; + array->items[new_prog_idx].prog = NULL; *new_array = array; return 0; } @@ -1701,7 +1705,6 @@ int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array, u32 *prog_ids, u32 request_cnt, u32 *prog_cnt) { - struct bpf_prog **prog; u32 cnt = 0; if (array) @@ -1714,8 +1717,7 @@ int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array, return 0; /* this function is called under trace/bpf_trace.c: bpf_event_mutex */ - prog = rcu_dereference_check(array, 1)->progs; - return bpf_prog_array_copy_core(prog, prog_ids, request_cnt) ? -ENOSPC + return bpf_prog_array_copy_core(array, prog_ids, request_cnt) ? -ENOSPC : 0; } -- cgit v1.2.3-59-g8ed1b From 3e6a4b3e0289dc9540a2c1d8a20657f4707fbabb Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 2 Aug 2018 14:27:22 -0700 Subject: bpf/verifier: introduce BPF_PTR_TO_MAP_VALUE BPF_MAP_TYPE_CGROUP_STORAGE maps are special in a way that the access from the bpf program side is lookup-free. That means the result is guaranteed to be a valid pointer to the cgroup storage; no NULL-check is required. This patch introduces BPF_PTR_TO_MAP_VALUE return type, which is required to cause the verifier accept programs, which are not checking the map value pointer for being NULL. Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 1 + kernel/bpf/verifier.c | 8 ++++++-- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 16be67888c30..ca4ac2a39def 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -155,6 +155,7 @@ enum bpf_arg_type { enum bpf_return_type { RET_INTEGER, /* function returns integer */ RET_VOID, /* function doesn't return anything */ + RET_PTR_TO_MAP_VALUE, /* returns a pointer to map elem value */ RET_PTR_TO_MAP_VALUE_OR_NULL, /* returns a pointer to map elem value or NULL */ }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7e75434a9e54..1ede16c8bb40 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2545,8 +2545,12 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn mark_reg_unknown(env, regs, BPF_REG_0); } else if (fn->ret_type == RET_VOID) { regs[BPF_REG_0].type = NOT_INIT; - } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) { - regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; + } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL || + fn->ret_type == RET_PTR_TO_MAP_VALUE) { + if (fn->ret_type == RET_PTR_TO_MAP_VALUE) + regs[BPF_REG_0].type = PTR_TO_MAP_VALUE; + else + regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; /* There is no offset yet applied, variable or fixed */ mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].off = 0; -- cgit v1.2.3-59-g8ed1b From cd3394317653837e2eb5c5d0904a8996102af9fc Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 2 Aug 2018 14:27:24 -0700 Subject: bpf: introduce the bpf_get_local_storage() helper function The bpf_get_local_storage() helper function is used to get a pointer to the bpf local storage from a bpf program. It takes a pointer to a storage map and flags as arguments. Right now it accepts only cgroup storage maps, and flags argument has to be 0. Further it can be extended to support other types of local storage: e.g. thread local storage etc. Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 2 ++ include/uapi/linux/bpf.h | 21 ++++++++++++++++++++- kernel/bpf/cgroup.c | 2 ++ kernel/bpf/core.c | 1 + kernel/bpf/helpers.c | 20 ++++++++++++++++++++ kernel/bpf/verifier.c | 18 ++++++++++++++++++ net/core/filter.c | 23 ++++++++++++++++++++++- 7 files changed, 85 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ca4ac2a39def..cd8790d2c6ed 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -788,6 +788,8 @@ extern const struct bpf_func_proto bpf_sock_map_update_proto; extern const struct bpf_func_proto bpf_sock_hash_update_proto; extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto; +extern const struct bpf_func_proto bpf_get_local_storage_proto; + /* Shared helpers among cBPF and eBPF. */ void bpf_user_rnd_init_once(void); u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b10118ee5afe..dd5758dc35d3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2095,6 +2095,24 @@ union bpf_attr { * Return * A 64-bit integer containing the current cgroup id based * on the cgroup within which the current task is running. + * + * void* get_local_storage(void *map, u64 flags) + * Description + * Get the pointer to the local storage area. + * The type and the size of the local storage is defined + * by the *map* argument. + * The *flags* meaning is specific for each map type, + * and has to be 0 for cgroup local storage. + * + * Depending on the bpf program type, a local storage area + * can be shared between multiple instances of the bpf program, + * running simultaneously. + * + * A user should care about the synchronization by himself. + * For example, by using the BPF_STX_XADD instruction to alter + * the shared data. + * Return + * Pointer to the local storage area. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2177,7 +2195,8 @@ union bpf_attr { FN(rc_repeat), \ FN(rc_keydown), \ FN(skb_cgroup_id), \ - FN(get_current_cgroup_id), + FN(get_current_cgroup_id), \ + FN(get_local_storage), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index ddfa6cc13e57..0a4fe5a7dc91 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -684,6 +684,8 @@ cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_map_delete_elem_proto; case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; + case BPF_FUNC_get_local_storage: + return &bpf_get_local_storage_proto; case BPF_FUNC_trace_printk: if (capable(CAP_SYS_ADMIN)) return bpf_get_trace_printk_proto(); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 9abcf25ebf9f..4d09e610777f 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1795,6 +1795,7 @@ const struct bpf_func_proto bpf_get_current_comm_proto __weak; const struct bpf_func_proto bpf_sock_map_update_proto __weak; const struct bpf_func_proto bpf_sock_hash_update_proto __weak; const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak; +const struct bpf_func_proto bpf_get_local_storage_proto __weak; const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void) { diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 73065e2d23c2..1991466b8327 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -193,4 +193,24 @@ const struct bpf_func_proto bpf_get_current_cgroup_id_proto = { .gpl_only = false, .ret_type = RET_INTEGER, }; + +DECLARE_PER_CPU(void*, bpf_cgroup_storage); + +BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags) +{ + /* map and flags arguments are not used now, + * but provide an ability to extend the API + * for other types of local storages. + * verifier checks that their values are correct. + */ + return (unsigned long) this_cpu_read(bpf_cgroup_storage); +} + +const struct bpf_func_proto bpf_get_local_storage_proto = { + .func = bpf_get_local_storage, + .gpl_only = false, + .ret_type = RET_PTR_TO_MAP_VALUE, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_ANYTHING, +}; #endif diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1ede16c8bb40..587468a9c37d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2127,6 +2127,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_current_task_under_cgroup) goto error; break; + case BPF_MAP_TYPE_CGROUP_STORAGE: + if (func_id != BPF_FUNC_get_local_storage) + goto error; + break; /* devmap returns a pointer to a live net_device ifindex that we cannot * allow to be modified from bpf side. So do not allow lookup elements * for now. @@ -2209,6 +2213,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (map->map_type != BPF_MAP_TYPE_SOCKHASH) goto error; break; + case BPF_FUNC_get_local_storage: + if (map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE) + goto error; + break; default: break; } @@ -2533,6 +2541,16 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn } regs = cur_regs(env); + + /* check that flags argument in get_local_storage(map, flags) is 0, + * this is required because get_local_storage() can't return an error. + */ + if (func_id == BPF_FUNC_get_local_storage && + !register_is_null(®s[BPF_REG_2])) { + verbose(env, "get_local_storage() doesn't support non-zero flags\n"); + return -EINVAL; + } + /* reset caller saved regs */ for (i = 0; i < CALLER_SAVED_REGS; i++) { mark_reg_not_init(env, regs, caller_saved[i]); diff --git a/net/core/filter.c b/net/core/filter.c index 9bb9a4488e25..9f73aae2f089 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4820,6 +4820,8 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) */ case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; + case BPF_FUNC_get_local_storage: + return &bpf_get_local_storage_proto; default: return bpf_base_func_proto(func_id); } @@ -4844,6 +4846,8 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_sock_addr_proto; + case BPF_FUNC_get_local_storage: + return &bpf_get_local_storage_proto; default: return bpf_base_func_proto(func_id); } @@ -4866,6 +4870,17 @@ sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } } +static const struct bpf_func_proto * +cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_get_local_storage: + return &bpf_get_local_storage_proto; + default: + return sk_filter_func_proto(func_id, prog); + } +} + static const struct bpf_func_proto * tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -4988,6 +5003,8 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sock_hash_update_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_sock_ops_proto; + case BPF_FUNC_get_local_storage: + return &bpf_get_local_storage_proto; default: return bpf_base_func_proto(func_id); } @@ -5007,6 +5024,8 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_msg_cork_bytes_proto; case BPF_FUNC_msg_pull_data: return &bpf_msg_pull_data_proto; + case BPF_FUNC_get_local_storage: + return &bpf_get_local_storage_proto; default: return bpf_base_func_proto(func_id); } @@ -5034,6 +5053,8 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_redirect_map_proto; case BPF_FUNC_sk_redirect_hash: return &bpf_sk_redirect_hash_proto; + case BPF_FUNC_get_local_storage: + return &bpf_get_local_storage_proto; default: return bpf_base_func_proto(func_id); } @@ -6838,7 +6859,7 @@ const struct bpf_prog_ops xdp_prog_ops = { }; const struct bpf_verifier_ops cg_skb_verifier_ops = { - .get_func_proto = sk_filter_func_proto, + .get_func_proto = cg_skb_func_proto, .is_valid_access = sk_filter_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, }; -- cgit v1.2.3-59-g8ed1b From 7cca1ed0bb248b8d5768d17f5afe297a832d66c0 Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Tue, 31 Jul 2018 20:25:00 +0200 Subject: netfilter: nf_osf: move nf_osf_fingers to non-uapi header file All warnings (new ones prefixed by >>): >> ./usr/include/linux/netfilter/nf_osf.h:73: userspace cannot reference function or variable defined in the kernel Fixes: f9324952088f ("netfilter: nfnetlink_osf: extract nfnetlink_subsystem code from xt_osf.c") Reported-by: kbuild test robot Signed-off-by: Fernando Fernandez Mancera Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nf_osf.h | 2 ++ include/uapi/linux/netfilter/nf_osf.h | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/nf_osf.h b/include/linux/netfilter/nf_osf.h index aee460fcbd31..3e455d6f94d5 100644 --- a/include/linux/netfilter/nf_osf.h +++ b/include/linux/netfilter/nf_osf.h @@ -25,6 +25,8 @@ enum osf_fmatch_states { FMATCH_OPT_WRONG, }; +extern struct list_head nf_osf_fingers[2]; + struct nf_osf_finger { struct rcu_head rcu_head; struct list_head finger_entry; diff --git a/include/uapi/linux/netfilter/nf_osf.h b/include/uapi/linux/netfilter/nf_osf.h index cc2487ff74f6..3b93fbb9fc24 100644 --- a/include/uapi/linux/netfilter/nf_osf.h +++ b/include/uapi/linux/netfilter/nf_osf.h @@ -70,8 +70,6 @@ struct nf_osf_nlmsg { struct tcphdr tcp; }; -extern struct list_head nf_osf_fingers[2]; - /* Defines for IANA option kinds */ enum iana_options { OSFOPT_EOL = 0, /* End of options */ -- cgit v1.2.3-59-g8ed1b From ddba40be59c9be4059288464f8e6f38fbba27495 Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Tue, 31 Jul 2018 20:25:01 +0200 Subject: netfilter: nfnetlink_osf: rename nf_osf header file to nfnetlink_osf The first client of the nf_osf.h userspace header is nft_osf, coming in this batch, rename it to nfnetlink_osf.h as there are no userspace clients for this yet, hence this looks consistent with other nfnetlink subsystem. Suggested-by: Jan Engelhardt Signed-off-by: Fernando Fernandez Mancera Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nf_osf.h | 44 ----------- include/linux/netfilter/nfnetlink_osf.h | 44 +++++++++++ include/uapi/linux/netfilter/nf_osf.h | 106 --------------------------- include/uapi/linux/netfilter/nfnetlink_osf.h | 106 +++++++++++++++++++++++++++ include/uapi/linux/netfilter/xt_osf.h | 2 +- net/netfilter/nfnetlink_osf.c | 2 +- net/netfilter/nft_osf.c | 2 +- 7 files changed, 153 insertions(+), 153 deletions(-) delete mode 100644 include/linux/netfilter/nf_osf.h create mode 100644 include/linux/netfilter/nfnetlink_osf.h delete mode 100644 include/uapi/linux/netfilter/nf_osf.h create mode 100644 include/uapi/linux/netfilter/nfnetlink_osf.h (limited to 'include/linux') diff --git a/include/linux/netfilter/nf_osf.h b/include/linux/netfilter/nf_osf.h deleted file mode 100644 index 3e455d6f94d5..000000000000 --- a/include/linux/netfilter/nf_osf.h +++ /dev/null @@ -1,44 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _NFOSF_H -#define _NFOSF_H - -#include - -/* Initial window size option state machine: multiple of mss, mtu or - * plain numeric value. Can also be made as plain numeric value which - * is not a multiple of specified value. - */ -enum nf_osf_window_size_options { - OSF_WSS_PLAIN = 0, - OSF_WSS_MSS, - OSF_WSS_MTU, - OSF_WSS_MODULO, - OSF_WSS_MAX, -}; - -enum osf_fmatch_states { - /* Packet does not match the fingerprint */ - FMATCH_WRONG = 0, - /* Packet matches the fingerprint */ - FMATCH_OK, - /* Options do not match the fingerprint, but header does */ - FMATCH_OPT_WRONG, -}; - -extern struct list_head nf_osf_fingers[2]; - -struct nf_osf_finger { - struct rcu_head rcu_head; - struct list_head finger_entry; - struct nf_osf_user_finger finger; -}; - -bool nf_osf_match(const struct sk_buff *skb, u_int8_t family, - int hooknum, struct net_device *in, struct net_device *out, - const struct nf_osf_info *info, struct net *net, - const struct list_head *nf_osf_fingers); - -const char *nf_osf_find(const struct sk_buff *skb, - const struct list_head *nf_osf_fingers); - -#endif /* _NFOSF_H */ diff --git a/include/linux/netfilter/nfnetlink_osf.h b/include/linux/netfilter/nfnetlink_osf.h new file mode 100644 index 000000000000..a7311bc03d3a --- /dev/null +++ b/include/linux/netfilter/nfnetlink_osf.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _NFOSF_H +#define _NFOSF_H + +#include + +/* Initial window size option state machine: multiple of mss, mtu or + * plain numeric value. Can also be made as plain numeric value which + * is not a multiple of specified value. + */ +enum nf_osf_window_size_options { + OSF_WSS_PLAIN = 0, + OSF_WSS_MSS, + OSF_WSS_MTU, + OSF_WSS_MODULO, + OSF_WSS_MAX, +}; + +enum osf_fmatch_states { + /* Packet does not match the fingerprint */ + FMATCH_WRONG = 0, + /* Packet matches the fingerprint */ + FMATCH_OK, + /* Options do not match the fingerprint, but header does */ + FMATCH_OPT_WRONG, +}; + +extern struct list_head nf_osf_fingers[2]; + +struct nf_osf_finger { + struct rcu_head rcu_head; + struct list_head finger_entry; + struct nf_osf_user_finger finger; +}; + +bool nf_osf_match(const struct sk_buff *skb, u_int8_t family, + int hooknum, struct net_device *in, struct net_device *out, + const struct nf_osf_info *info, struct net *net, + const struct list_head *nf_osf_fingers); + +const char *nf_osf_find(const struct sk_buff *skb, + const struct list_head *nf_osf_fingers); + +#endif /* _NFOSF_H */ diff --git a/include/uapi/linux/netfilter/nf_osf.h b/include/uapi/linux/netfilter/nf_osf.h deleted file mode 100644 index 3b93fbb9fc24..000000000000 --- a/include/uapi/linux/netfilter/nf_osf.h +++ /dev/null @@ -1,106 +0,0 @@ -#ifndef _NF_OSF_H -#define _NF_OSF_H - -#include - -#define MAXGENRELEN 32 - -#define NF_OSF_GENRE (1 << 0) -#define NF_OSF_TTL (1 << 1) -#define NF_OSF_LOG (1 << 2) -#define NF_OSF_INVERT (1 << 3) - -#define NF_OSF_LOGLEVEL_ALL 0 /* log all matched fingerprints */ -#define NF_OSF_LOGLEVEL_FIRST 1 /* log only the first matced fingerprint */ -#define NF_OSF_LOGLEVEL_ALL_KNOWN 2 /* do not log unknown packets */ - -#define NF_OSF_TTL_TRUE 0 /* True ip and fingerprint TTL comparison */ - -/* Check if ip TTL is less than fingerprint one */ -#define NF_OSF_TTL_LESS 1 - -/* Do not compare ip and fingerprint TTL at all */ -#define NF_OSF_TTL_NOCHECK 2 - -#define NF_OSF_FLAGMASK (NF_OSF_GENRE | NF_OSF_TTL | \ - NF_OSF_LOG | NF_OSF_INVERT) -/* Wildcard MSS (kind of). - * It is used to implement a state machine for the different wildcard values - * of the MSS and window sizes. - */ -struct nf_osf_wc { - __u32 wc; - __u32 val; -}; - -/* This struct represents IANA options - * http://www.iana.org/assignments/tcp-parameters - */ -struct nf_osf_opt { - __u16 kind, length; - struct nf_osf_wc wc; -}; - -struct nf_osf_info { - char genre[MAXGENRELEN]; - __u32 len; - __u32 flags; - __u32 loglevel; - __u32 ttl; -}; - -struct nf_osf_user_finger { - struct nf_osf_wc wss; - - __u8 ttl, df; - __u16 ss, mss; - __u16 opt_num; - - char genre[MAXGENRELEN]; - char version[MAXGENRELEN]; - char subtype[MAXGENRELEN]; - - /* MAX_IPOPTLEN is maximum if all options are NOPs or EOLs */ - struct nf_osf_opt opt[MAX_IPOPTLEN]; -}; - -struct nf_osf_nlmsg { - struct nf_osf_user_finger f; - struct iphdr ip; - struct tcphdr tcp; -}; - -/* Defines for IANA option kinds */ -enum iana_options { - OSFOPT_EOL = 0, /* End of options */ - OSFOPT_NOP, /* NOP */ - OSFOPT_MSS, /* Maximum segment size */ - OSFOPT_WSO, /* Window scale option */ - OSFOPT_SACKP, /* SACK permitted */ - OSFOPT_SACK, /* SACK */ - OSFOPT_ECHO, - OSFOPT_ECHOREPLY, - OSFOPT_TS, /* Timestamp option */ - OSFOPT_POCP, /* Partial Order Connection Permitted */ - OSFOPT_POSP, /* Partial Order Service Profile */ - - /* Others are not used in the current OSF */ - OSFOPT_EMPTY = 255, -}; - -enum nf_osf_attr_type { - OSF_ATTR_UNSPEC, - OSF_ATTR_FINGER, - OSF_ATTR_MAX, -}; - -/* - * Add/remove fingerprint from the kernel. - */ -enum nf_osf_msg_types { - OSF_MSG_ADD, - OSF_MSG_REMOVE, - OSF_MSG_MAX, -}; - -#endif /* _NF_OSF_H */ diff --git a/include/uapi/linux/netfilter/nfnetlink_osf.h b/include/uapi/linux/netfilter/nfnetlink_osf.h new file mode 100644 index 000000000000..3b93fbb9fc24 --- /dev/null +++ b/include/uapi/linux/netfilter/nfnetlink_osf.h @@ -0,0 +1,106 @@ +#ifndef _NF_OSF_H +#define _NF_OSF_H + +#include + +#define MAXGENRELEN 32 + +#define NF_OSF_GENRE (1 << 0) +#define NF_OSF_TTL (1 << 1) +#define NF_OSF_LOG (1 << 2) +#define NF_OSF_INVERT (1 << 3) + +#define NF_OSF_LOGLEVEL_ALL 0 /* log all matched fingerprints */ +#define NF_OSF_LOGLEVEL_FIRST 1 /* log only the first matced fingerprint */ +#define NF_OSF_LOGLEVEL_ALL_KNOWN 2 /* do not log unknown packets */ + +#define NF_OSF_TTL_TRUE 0 /* True ip and fingerprint TTL comparison */ + +/* Check if ip TTL is less than fingerprint one */ +#define NF_OSF_TTL_LESS 1 + +/* Do not compare ip and fingerprint TTL at all */ +#define NF_OSF_TTL_NOCHECK 2 + +#define NF_OSF_FLAGMASK (NF_OSF_GENRE | NF_OSF_TTL | \ + NF_OSF_LOG | NF_OSF_INVERT) +/* Wildcard MSS (kind of). + * It is used to implement a state machine for the different wildcard values + * of the MSS and window sizes. + */ +struct nf_osf_wc { + __u32 wc; + __u32 val; +}; + +/* This struct represents IANA options + * http://www.iana.org/assignments/tcp-parameters + */ +struct nf_osf_opt { + __u16 kind, length; + struct nf_osf_wc wc; +}; + +struct nf_osf_info { + char genre[MAXGENRELEN]; + __u32 len; + __u32 flags; + __u32 loglevel; + __u32 ttl; +}; + +struct nf_osf_user_finger { + struct nf_osf_wc wss; + + __u8 ttl, df; + __u16 ss, mss; + __u16 opt_num; + + char genre[MAXGENRELEN]; + char version[MAXGENRELEN]; + char subtype[MAXGENRELEN]; + + /* MAX_IPOPTLEN is maximum if all options are NOPs or EOLs */ + struct nf_osf_opt opt[MAX_IPOPTLEN]; +}; + +struct nf_osf_nlmsg { + struct nf_osf_user_finger f; + struct iphdr ip; + struct tcphdr tcp; +}; + +/* Defines for IANA option kinds */ +enum iana_options { + OSFOPT_EOL = 0, /* End of options */ + OSFOPT_NOP, /* NOP */ + OSFOPT_MSS, /* Maximum segment size */ + OSFOPT_WSO, /* Window scale option */ + OSFOPT_SACKP, /* SACK permitted */ + OSFOPT_SACK, /* SACK */ + OSFOPT_ECHO, + OSFOPT_ECHOREPLY, + OSFOPT_TS, /* Timestamp option */ + OSFOPT_POCP, /* Partial Order Connection Permitted */ + OSFOPT_POSP, /* Partial Order Service Profile */ + + /* Others are not used in the current OSF */ + OSFOPT_EMPTY = 255, +}; + +enum nf_osf_attr_type { + OSF_ATTR_UNSPEC, + OSF_ATTR_FINGER, + OSF_ATTR_MAX, +}; + +/* + * Add/remove fingerprint from the kernel. + */ +enum nf_osf_msg_types { + OSF_MSG_ADD, + OSF_MSG_REMOVE, + OSF_MSG_MAX, +}; + +#endif /* _NF_OSF_H */ diff --git a/include/uapi/linux/netfilter/xt_osf.h b/include/uapi/linux/netfilter/xt_osf.h index a90e90c27cef..c56c59605c2b 100644 --- a/include/uapi/linux/netfilter/xt_osf.h +++ b/include/uapi/linux/netfilter/xt_osf.h @@ -23,7 +23,7 @@ #include #include #include -#include +#include #define XT_OSF_GENRE NF_OSF_GENRE #define XT_OSF_INVERT NF_OSF_INVERT diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c index ba0fa11869ce..f9dba62c450f 100644 --- a/net/netfilter/nfnetlink_osf.c +++ b/net/netfilter/nfnetlink_osf.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include /* * Indexed by dont-fragment bit. diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c index bdacc4cffba4..9b2f3de7be4f 100644 --- a/net/netfilter/nft_osf.c +++ b/net/netfilter/nft_osf.c @@ -2,7 +2,7 @@ #include #include -#include +#include #define OSF_GENRE_SIZE 32 -- cgit v1.2.3-59-g8ed1b From 94276fa8a2a4c08ccb2e9d55e88b95dc972ccea3 Mon Sep 17 00:00:00 2001 From: Máté Eckl Date: Fri, 3 Aug 2018 13:36:13 +0200 Subject: netfilter: bridge: Expose nf_tables bridge hook priorities through uapi MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Netfilter exposes standard hook priorities in case of ipv4, ipv6 and arp but not in case of bridge. This patch exposes the hook priority values of the bridge family (which are different from the formerly mentioned) via uapi so that they can be used by user-space applications just like the others. Signed-off-by: Máté Eckl Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_bridge.h | 11 ----------- include/uapi/linux/netfilter_bridge.h | 11 +++++++++++ net/bridge/br_netfilter_hooks.c | 1 + net/bridge/netfilter/ebtable_filter.c | 1 + net/bridge/netfilter/ebtable_nat.c | 1 + 5 files changed, 14 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter_bridge.h b/include/linux/netfilter_bridge.h index b671fdfd212b..fa0686500970 100644 --- a/include/linux/netfilter_bridge.h +++ b/include/linux/netfilter_bridge.h @@ -5,17 +5,6 @@ #include #include -enum nf_br_hook_priorities { - NF_BR_PRI_FIRST = INT_MIN, - NF_BR_PRI_NAT_DST_BRIDGED = -300, - NF_BR_PRI_FILTER_BRIDGED = -200, - NF_BR_PRI_BRNF = 0, - NF_BR_PRI_NAT_DST_OTHER = 100, - NF_BR_PRI_FILTER_OTHER = 200, - NF_BR_PRI_NAT_SRC = 300, - NF_BR_PRI_LAST = INT_MAX, -}; - #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb); diff --git a/include/uapi/linux/netfilter_bridge.h b/include/uapi/linux/netfilter_bridge.h index 12fb77633f83..156ccd089df1 100644 --- a/include/uapi/linux/netfilter_bridge.h +++ b/include/uapi/linux/netfilter_bridge.h @@ -26,4 +26,15 @@ #define NF_BR_BROUTING 5 #define NF_BR_NUMHOOKS 6 +enum nf_br_hook_priorities { + NF_BR_PRI_FIRST = INT_MIN, + NF_BR_PRI_NAT_DST_BRIDGED = -300, + NF_BR_PRI_FILTER_BRIDGED = -200, + NF_BR_PRI_BRNF = 0, + NF_BR_PRI_NAT_DST_OTHER = 100, + NF_BR_PRI_FILTER_OTHER = 200, + NF_BR_PRI_NAT_SRC = 300, + NF_BR_PRI_LAST = INT_MAX, +}; + #endif /* _UAPI__LINUX_BRIDGE_NETFILTER_H */ diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 9b16eaf33819..6e0dc6bcd32a 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include diff --git a/net/bridge/netfilter/ebtable_filter.c b/net/bridge/netfilter/ebtable_filter.c index c41da5fac84f..550324c516ee 100644 --- a/net/bridge/netfilter/ebtable_filter.c +++ b/net/bridge/netfilter/ebtable_filter.c @@ -9,6 +9,7 @@ */ #include +#include #include #define FILTER_VALID_HOOKS ((1 << NF_BR_LOCAL_IN) | (1 << NF_BR_FORWARD) | \ diff --git a/net/bridge/netfilter/ebtable_nat.c b/net/bridge/netfilter/ebtable_nat.c index 08df7406ecb3..c0fb3ca518af 100644 --- a/net/bridge/netfilter/ebtable_nat.c +++ b/net/bridge/netfilter/ebtable_nat.c @@ -9,6 +9,7 @@ */ #include +#include #include #define NAT_VALID_HOOKS ((1 << NF_BR_PRE_ROUTING) | (1 << NF_BR_LOCAL_OUT) | \ -- cgit v1.2.3-59-g8ed1b From 91305f2812624c0cf7ccbb44133b66d3b24676e4 Mon Sep 17 00:00:00 2001 From: Yangbo Lu Date: Wed, 1 Aug 2018 18:05:54 +0800 Subject: ptp_qoriq: support automatic configuration for ptp timer This patch is to support automatic configuration for ptp timer. If required ptp dts properties are not provided, driver could try to calculate a set of default configurations to initialize the ptp timer. This makes the driver work for many boards which don't have the required ptp dts properties in current kernel. Also the users could set dts properties by themselves according to their requirement. Signed-off-by: Yangbo Lu Signed-off-by: David S. Miller --- drivers/ptp/ptp_qoriq.c | 111 ++++++++++++++++++++++++++++++++++++++++-- include/linux/fsl/ptp_qoriq.h | 6 ++- 2 files changed, 113 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/ptp/ptp_qoriq.c b/drivers/ptp/ptp_qoriq.c index a14c317b5a38..095c18532dc7 100644 --- a/drivers/ptp/ptp_qoriq.c +++ b/drivers/ptp/ptp_qoriq.c @@ -29,6 +29,7 @@ #include #include #include +#include #include @@ -317,6 +318,105 @@ static const struct ptp_clock_info ptp_qoriq_caps = { .enable = ptp_qoriq_enable, }; +/** + * qoriq_ptp_nominal_freq - calculate nominal frequency according to + * reference clock frequency + * + * @clk_src: reference clock frequency + * + * The nominal frequency is the desired clock frequency. + * It should be less than the reference clock frequency. + * It should be a factor of 1000MHz. + * + * Return the nominal frequency + */ +static u32 qoriq_ptp_nominal_freq(u32 clk_src) +{ + u32 remainder = 0; + + clk_src /= 1000000; + remainder = clk_src % 100; + if (remainder) { + clk_src -= remainder; + clk_src += 100; + } + + do { + clk_src -= 100; + + } while (1000 % clk_src); + + return clk_src * 1000000; +} + +/** + * qoriq_ptp_auto_config - calculate a set of default configurations + * + * @qoriq_ptp: pointer to qoriq_ptp + * @node: pointer to device_node + * + * If below dts properties are not provided, this function will be + * called to calculate a set of default configurations for them. + * "fsl,tclk-period" + * "fsl,tmr-prsc" + * "fsl,tmr-add" + * "fsl,tmr-fiper1" + * "fsl,tmr-fiper2" + * "fsl,max-adj" + * + * Return 0 if success + */ +static int qoriq_ptp_auto_config(struct qoriq_ptp *qoriq_ptp, + struct device_node *node) +{ + struct clk *clk; + u64 freq_comp; + u64 max_adj; + u32 nominal_freq; + u32 clk_src = 0; + + qoriq_ptp->cksel = DEFAULT_CKSEL; + + clk = of_clk_get(node, 0); + if (!IS_ERR(clk)) { + clk_src = clk_get_rate(clk); + clk_put(clk); + } + + if (clk_src <= 100000000UL) { + pr_err("error reference clock value, or lower than 100MHz\n"); + return -EINVAL; + } + + nominal_freq = qoriq_ptp_nominal_freq(clk_src); + if (!nominal_freq) + return -EINVAL; + + qoriq_ptp->tclk_period = 1000000000UL / nominal_freq; + qoriq_ptp->tmr_prsc = DEFAULT_TMR_PRSC; + + /* Calculate initial frequency compensation value for TMR_ADD register. + * freq_comp = ceil(2^32 / freq_ratio) + * freq_ratio = reference_clock_freq / nominal_freq + */ + freq_comp = ((u64)1 << 32) * nominal_freq; + if (do_div(freq_comp, clk_src)) + freq_comp++; + + qoriq_ptp->tmr_add = freq_comp; + qoriq_ptp->tmr_fiper1 = DEFAULT_FIPER1_PERIOD - qoriq_ptp->tclk_period; + qoriq_ptp->tmr_fiper2 = DEFAULT_FIPER2_PERIOD - qoriq_ptp->tclk_period; + + /* max_adj = 1000000000 * (freq_ratio - 1.0) - 1 + * freq_ratio = reference_clock_freq / nominal_freq + */ + max_adj = 1000000000ULL * (clk_src - nominal_freq); + max_adj = max_adj / nominal_freq - 1; + qoriq_ptp->caps.max_adj = max_adj; + + return 0; +} + static int qoriq_ptp_probe(struct platform_device *dev) { struct device_node *node = dev->dev.of_node; @@ -332,7 +432,7 @@ static int qoriq_ptp_probe(struct platform_device *dev) if (!qoriq_ptp) goto no_memory; - err = -ENODEV; + err = -EINVAL; qoriq_ptp->caps = ptp_qoriq_caps; @@ -351,10 +451,14 @@ static int qoriq_ptp_probe(struct platform_device *dev) "fsl,tmr-fiper2", &qoriq_ptp->tmr_fiper2) || of_property_read_u32(node, "fsl,max-adj", &qoriq_ptp->caps.max_adj)) { - pr_err("device tree node missing required elements\n"); - goto no_node; + pr_warn("device tree node missing required elements, try automatic configuration\n"); + + if (qoriq_ptp_auto_config(qoriq_ptp, node)) + goto no_config; } + err = -ENODEV; + qoriq_ptp->irq = platform_get_irq(dev, 0); if (qoriq_ptp->irq < 0) { @@ -436,6 +540,7 @@ no_ioremap: release_resource(qoriq_ptp->rsrc); no_resource: free_irq(qoriq_ptp->irq, qoriq_ptp); +no_config: no_node: kfree(qoriq_ptp); no_memory: diff --git a/include/linux/fsl/ptp_qoriq.h b/include/linux/fsl/ptp_qoriq.h index dc3dac40f069..c1f003aadcce 100644 --- a/include/linux/fsl/ptp_qoriq.h +++ b/include/linux/fsl/ptp_qoriq.h @@ -127,9 +127,13 @@ struct qoriq_ptp_registers { #define DRIVER "ptp_qoriq" -#define DEFAULT_CKSEL 1 #define N_EXT_TS 2 +#define DEFAULT_CKSEL 1 +#define DEFAULT_TMR_PRSC 2 +#define DEFAULT_FIPER1_PERIOD 1000000000 +#define DEFAULT_FIPER2_PERIOD 100000 + struct qoriq_ptp { void __iomem *base; struct qoriq_ptp_registers regs; -- cgit v1.2.3-59-g8ed1b From 385114dec8a49b5e5945e77ba7de6356106713f4 Mon Sep 17 00:00:00 2001 From: Peter Oskolkov Date: Thu, 2 Aug 2018 23:34:38 +0000 Subject: net: modify skb_rbtree_purge to return the truesize of all purged skbs. Tested: see the next patch is the series. Suggested-by: Eric Dumazet Signed-off-by: Peter Oskolkov Signed-off-by: Eric Dumazet Cc: Florian Westphal Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 +- net/core/skbuff.c | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index fd3cb1b247df..47848367c816 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2585,7 +2585,7 @@ static inline void __skb_queue_purge(struct sk_buff_head *list) kfree_skb(skb); } -void skb_rbtree_purge(struct rb_root *root); +unsigned int skb_rbtree_purge(struct rb_root *root); void *netdev_alloc_frag(unsigned int fragsz); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 51b0a9126e12..8d574a88125d 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2858,23 +2858,27 @@ EXPORT_SYMBOL(skb_queue_purge); /** * skb_rbtree_purge - empty a skb rbtree * @root: root of the rbtree to empty + * Return value: the sum of truesizes of all purged skbs. * * Delete all buffers on an &sk_buff rbtree. Each buffer is removed from * the list and one reference dropped. This function does not take * any lock. Synchronization should be handled by the caller (e.g., TCP * out-of-order queue is protected by the socket lock). */ -void skb_rbtree_purge(struct rb_root *root) +unsigned int skb_rbtree_purge(struct rb_root *root) { struct rb_node *p = rb_first(root); + unsigned int sum = 0; while (p) { struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode); p = rb_next(p); rb_erase(&skb->rbnode, root); + sum += skb->truesize; kfree_skb(skb); } + return sum; } /** -- cgit v1.2.3-59-g8ed1b From fa0f527358bd900ef92f925878ed6bfbd51305cc Mon Sep 17 00:00:00 2001 From: Peter Oskolkov Date: Thu, 2 Aug 2018 23:34:39 +0000 Subject: ip: use rb trees for IP frag queue. Similar to TCP OOO RX queue, it makes sense to use rb trees to store IP fragments, so that OOO fragments are inserted faster. Tested: - a follow-up patch contains a rather comprehensive ip defrag self-test (functional) - ran neper `udp_stream -c -H -F 100 -l 300 -T 20`: netstat --statistics Ip: 282078937 total packets received 0 forwarded 0 incoming packets discarded 946760 incoming packets delivered 18743456 requests sent out 101 fragments dropped after timeout 282077129 reassemblies required 944952 packets reassembled ok 262734239 packet reassembles failed (The numbers/stats above are somewhat better re: reassemblies vs a kernel without this patchset. More comprehensive performance testing TBD). Reported-by: Jann Horn Reported-by: Juha-Matti Tilli Suggested-by: Eric Dumazet Signed-off-by: Peter Oskolkov Signed-off-by: Eric Dumazet Cc: Florian Westphal Signed-off-by: David S. Miller --- include/linux/skbuff.h | 9 +- include/net/inet_frag.h | 3 +- net/ipv4/inet_fragment.c | 16 +-- net/ipv4/ip_fragment.c | 182 ++++++++++++++++++-------------- net/ipv6/netfilter/nf_conntrack_reasm.c | 1 + net/ipv6/reassembly.c | 1 + 6 files changed, 121 insertions(+), 91 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 47848367c816..7ebdf158a795 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -676,13 +676,16 @@ struct sk_buff { * UDP receive path is one user. */ unsigned long dev_scratch; - int ip_defrag_offset; }; }; - struct rb_node rbnode; /* used in netem & tcp stack */ + struct rb_node rbnode; /* used in netem, ip4 defrag, and tcp stack */ struct list_head list; }; - struct sock *sk; + + union { + struct sock *sk; + int ip_defrag_offset; + }; union { ktime_t tstamp; diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index f4272a29dc44..b86d14528188 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -75,7 +75,8 @@ struct inet_frag_queue { struct timer_list timer; spinlock_t lock; refcount_t refcnt; - struct sk_buff *fragments; + struct sk_buff *fragments; /* Used in IPv6. */ + struct rb_root rb_fragments; /* Used in IPv4. */ struct sk_buff *fragments_tail; ktime_t stamp; int len; diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index ccd140e4082d..6d258a5669e7 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -137,12 +137,16 @@ void inet_frag_destroy(struct inet_frag_queue *q) fp = q->fragments; nf = q->net; f = nf->f; - while (fp) { - struct sk_buff *xp = fp->next; - - sum_truesize += fp->truesize; - kfree_skb(fp); - fp = xp; + if (fp) { + do { + struct sk_buff *xp = fp->next; + + sum_truesize += fp->truesize; + kfree_skb(fp); + fp = xp; + } while (fp); + } else { + sum_truesize = skb_rbtree_purge(&q->rb_fragments); } sum = sum_truesize + f->qsize; diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 960bf5eab59f..0e8f8de77e71 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -136,7 +136,7 @@ static void ip_expire(struct timer_list *t) { struct inet_frag_queue *frag = from_timer(frag, t, timer); const struct iphdr *iph; - struct sk_buff *head; + struct sk_buff *head = NULL; struct net *net; struct ipq *qp; int err; @@ -152,14 +152,31 @@ static void ip_expire(struct timer_list *t) ipq_kill(qp); __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); - - head = qp->q.fragments; - __IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT); - if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !head) + if (!qp->q.flags & INET_FRAG_FIRST_IN) goto out; + /* sk_buff::dev and sk_buff::rbnode are unionized. So we + * pull the head out of the tree in order to be able to + * deal with head->dev. + */ + if (qp->q.fragments) { + head = qp->q.fragments; + qp->q.fragments = head->next; + } else { + head = skb_rb_first(&qp->q.rb_fragments); + if (!head) + goto out; + rb_erase(&head->rbnode, &qp->q.rb_fragments); + memset(&head->rbnode, 0, sizeof(head->rbnode)); + barrier(); + } + if (head == qp->q.fragments_tail) + qp->q.fragments_tail = NULL; + + sub_frag_mem_limit(qp->q.net, head->truesize); + head->dev = dev_get_by_index_rcu(net, qp->iif); if (!head->dev) goto out; @@ -179,16 +196,16 @@ static void ip_expire(struct timer_list *t) (skb_rtable(head)->rt_type != RTN_LOCAL)) goto out; - skb_get(head); spin_unlock(&qp->q.lock); icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); - kfree_skb(head); goto out_rcu_unlock; out: spin_unlock(&qp->q.lock); out_rcu_unlock: rcu_read_unlock(); + if (head) + kfree_skb(head); ipq_put(qp); } @@ -231,7 +248,7 @@ static int ip_frag_too_far(struct ipq *qp) end = atomic_inc_return(&peer->rid); qp->rid = end; - rc = qp->q.fragments && (end - start) > max; + rc = qp->q.fragments_tail && (end - start) > max; if (rc) { struct net *net; @@ -245,7 +262,6 @@ static int ip_frag_too_far(struct ipq *qp) static int ip_frag_reinit(struct ipq *qp) { - struct sk_buff *fp; unsigned int sum_truesize = 0; if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) { @@ -253,20 +269,14 @@ static int ip_frag_reinit(struct ipq *qp) return -ETIMEDOUT; } - fp = qp->q.fragments; - do { - struct sk_buff *xp = fp->next; - - sum_truesize += fp->truesize; - kfree_skb(fp); - fp = xp; - } while (fp); + sum_truesize = skb_rbtree_purge(&qp->q.rb_fragments); sub_frag_mem_limit(qp->q.net, sum_truesize); qp->q.flags = 0; qp->q.len = 0; qp->q.meat = 0; qp->q.fragments = NULL; + qp->q.rb_fragments = RB_ROOT; qp->q.fragments_tail = NULL; qp->iif = 0; qp->ecn = 0; @@ -278,7 +288,8 @@ static int ip_frag_reinit(struct ipq *qp) static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) { struct net *net = container_of(qp->q.net, struct net, ipv4.frags); - struct sk_buff *prev, *next; + struct rb_node **rbn, *parent; + struct sk_buff *skb1; struct net_device *dev; unsigned int fragsize; int flags, offset; @@ -341,58 +352,58 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) if (err) goto err; - /* Find out which fragments are in front and at the back of us - * in the chain of fragments so far. We must know where to put - * this fragment, right? - */ - prev = qp->q.fragments_tail; - if (!prev || prev->ip_defrag_offset < offset) { - next = NULL; - goto found; - } - prev = NULL; - for (next = qp->q.fragments; next != NULL; next = next->next) { - if (next->ip_defrag_offset >= offset) - break; /* bingo! */ - prev = next; - } + /* Note : skb->rbnode and skb->dev share the same location. */ + dev = skb->dev; + /* Makes sure compiler wont do silly aliasing games */ + barrier(); -found: /* RFC5722, Section 4, amended by Errata ID : 3089 * When reassembling an IPv6 datagram, if * one or more its constituent fragments is determined to be an * overlapping fragment, the entire datagram (and any constituent * fragments) MUST be silently discarded. * - * We do the same here for IPv4. + * We do the same here for IPv4 (and increment an snmp counter). */ - /* Is there an overlap with the previous fragment? */ - if (prev && - (prev->ip_defrag_offset + prev->len) > offset) - goto discard_qp; - - /* Is there an overlap with the next fragment? */ - if (next && next->ip_defrag_offset < end) - goto discard_qp; + /* Find out where to put this fragment. */ + skb1 = qp->q.fragments_tail; + if (!skb1) { + /* This is the first fragment we've received. */ + rb_link_node(&skb->rbnode, NULL, &qp->q.rb_fragments.rb_node); + qp->q.fragments_tail = skb; + } else if ((skb1->ip_defrag_offset + skb1->len) < end) { + /* This is the common/special case: skb goes to the end. */ + /* Detect and discard overlaps. */ + if (offset < (skb1->ip_defrag_offset + skb1->len)) + goto discard_qp; + /* Insert after skb1. */ + rb_link_node(&skb->rbnode, &skb1->rbnode, &skb1->rbnode.rb_right); + qp->q.fragments_tail = skb; + } else { + /* Binary search. Note that skb can become the first fragment, but + * not the last (covered above). */ + rbn = &qp->q.rb_fragments.rb_node; + do { + parent = *rbn; + skb1 = rb_to_skb(parent); + if (end <= skb1->ip_defrag_offset) + rbn = &parent->rb_left; + else if (offset >= skb1->ip_defrag_offset + skb1->len) + rbn = &parent->rb_right; + else /* Found an overlap with skb1. */ + goto discard_qp; + } while (*rbn); + /* Here we have parent properly set, and rbn pointing to + * one of its NULL left/right children. Insert skb. */ + rb_link_node(&skb->rbnode, parent, rbn); + } + rb_insert_color(&skb->rbnode, &qp->q.rb_fragments); - /* Note : skb->ip_defrag_offset and skb->dev share the same location */ - dev = skb->dev; if (dev) qp->iif = dev->ifindex; - /* Makes sure compiler wont do silly aliasing games */ - barrier(); skb->ip_defrag_offset = offset; - /* Insert this fragment in the chain of fragments. */ - skb->next = next; - if (!next) - qp->q.fragments_tail = skb; - if (prev) - prev->next = skb; - else - qp->q.fragments = skb; - qp->q.stamp = skb->tstamp; qp->q.meat += skb->len; qp->ecn |= ecn; @@ -414,7 +425,7 @@ found: unsigned long orefdst = skb->_skb_refdst; skb->_skb_refdst = 0UL; - err = ip_frag_reasm(qp, prev, dev); + err = ip_frag_reasm(qp, skb, dev); skb->_skb_refdst = orefdst; return err; } @@ -431,15 +442,15 @@ err: return err; } - /* Build a new IP datagram from all its fragments. */ - -static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, +static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, struct net_device *dev) { struct net *net = container_of(qp->q.net, struct net, ipv4.frags); struct iphdr *iph; - struct sk_buff *fp, *head = qp->q.fragments; + struct sk_buff *fp, *head = skb_rb_first(&qp->q.rb_fragments); + struct sk_buff **nextp; /* To build frag_list. */ + struct rb_node *rbn; int len; int ihlen; int err; @@ -453,25 +464,20 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, goto out_fail; } /* Make the one we just received the head. */ - if (prev) { - head = prev->next; - fp = skb_clone(head, GFP_ATOMIC); + if (head != skb) { + fp = skb_clone(skb, GFP_ATOMIC); if (!fp) goto out_nomem; - - fp->next = head->next; - if (!fp->next) + rb_replace_node(&skb->rbnode, &fp->rbnode, &qp->q.rb_fragments); + if (qp->q.fragments_tail == skb) qp->q.fragments_tail = fp; - prev->next = fp; - - skb_morph(head, qp->q.fragments); - head->next = qp->q.fragments->next; - - consume_skb(qp->q.fragments); - qp->q.fragments = head; + skb_morph(skb, head); + rb_replace_node(&head->rbnode, &skb->rbnode, + &qp->q.rb_fragments); + consume_skb(head); + head = skb; } - WARN_ON(!head); WARN_ON(head->ip_defrag_offset != 0); /* Allocate a new buffer for the datagram. */ @@ -496,24 +502,35 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, clone = alloc_skb(0, GFP_ATOMIC); if (!clone) goto out_nomem; - clone->next = head->next; - head->next = clone; skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; skb_frag_list_init(head); for (i = 0; i < skb_shinfo(head)->nr_frags; i++) plen += skb_frag_size(&skb_shinfo(head)->frags[i]); clone->len = clone->data_len = head->data_len - plen; - head->data_len -= clone->len; - head->len -= clone->len; + skb->truesize += clone->truesize; clone->csum = 0; clone->ip_summed = head->ip_summed; add_frag_mem_limit(qp->q.net, clone->truesize); + skb_shinfo(head)->frag_list = clone; + nextp = &clone->next; + } else { + nextp = &skb_shinfo(head)->frag_list; } - skb_shinfo(head)->frag_list = head->next; skb_push(head, head->data - skb_network_header(head)); - for (fp=head->next; fp; fp = fp->next) { + /* Traverse the tree in order, to build frag_list. */ + rbn = rb_next(&head->rbnode); + rb_erase(&head->rbnode, &qp->q.rb_fragments); + while (rbn) { + struct rb_node *rbnext = rb_next(rbn); + fp = rb_to_skb(rbn); + rb_erase(rbn, &qp->q.rb_fragments); + rbn = rbnext; + *nextp = fp; + nextp = &fp->next; + fp->prev = NULL; + memset(&fp->rbnode, 0, sizeof(fp->rbnode)); head->data_len += fp->len; head->len += fp->len; if (head->ip_summed != fp->ip_summed) @@ -524,7 +541,9 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, } sub_frag_mem_limit(qp->q.net, head->truesize); + *nextp = NULL; head->next = NULL; + head->prev = NULL; head->dev = dev; head->tstamp = qp->q.stamp; IPCB(head)->frag_max_size = max(qp->max_df_size, qp->q.max_size); @@ -552,6 +571,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, __IP_INC_STATS(net, IPSTATS_MIB_REASMOKS); qp->q.fragments = NULL; + qp->q.rb_fragments = RB_ROOT; qp->q.fragments_tail = NULL; return 0; diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 0610bdab721c..38d69ef516d5 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -463,6 +463,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *prev, struct net_devic head->csum); fq->q.fragments = NULL; + fq->q.rb_fragments = RB_ROOT; fq->q.fragments_tail = NULL; return true; diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 6edd2ac8ae4b..b4e558ab39fa 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -405,6 +405,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMOKS); rcu_read_unlock(); fq->q.fragments = NULL; + fq->q.rb_fragments = RB_ROOT; fq->q.fragments_tail = NULL; return 1; -- cgit v1.2.3-59-g8ed1b From 6fdecfe32f903d9f5f35ee4464fd32ede470dcad Mon Sep 17 00:00:00 2001 From: Arun Parameswaran Date: Tue, 7 Aug 2018 10:02:44 -0700 Subject: net: phy: Add support for Broadcom Omega internal Combo GPHY Add support for the Broadcom Omega SoC internal Combo Ethernet GPHY to the bcm7xxx phy driver. Signed-off-by: Arun Parameswaran Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/bcm7xxx.c | 2 ++ include/linux/brcmphy.h | 1 + 2 files changed, 3 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/phy/bcm7xxx.c b/drivers/net/phy/bcm7xxx.c index 01d2ff2f6241..b2b6307d64a4 100644 --- a/drivers/net/phy/bcm7xxx.c +++ b/drivers/net/phy/bcm7xxx.c @@ -229,6 +229,7 @@ static int bcm7xxx_28nm_config_init(struct phy_device *phydev) phy_read(phydev, MII_BMSR); switch (rev) { + case 0xa0: case 0xb0: ret = bcm7xxx_28nm_b0_afe_config_init(phydev); break; @@ -659,6 +660,7 @@ static struct phy_driver bcm7xxx_driver[] = { BCM7XXX_28NM_GPHY(PHY_ID_BCM7439, "Broadcom BCM7439"), BCM7XXX_28NM_GPHY(PHY_ID_BCM7439_2, "Broadcom BCM7439 (2)"), BCM7XXX_28NM_GPHY(PHY_ID_BCM7445, "Broadcom BCM7445"), + BCM7XXX_28NM_GPHY(PHY_ID_BCM_OMEGA, "Broadcom Omega Combo GPHY"), BCM7XXX_40NM_EPHY(PHY_ID_BCM7346, "Broadcom BCM7346"), BCM7XXX_40NM_EPHY(PHY_ID_BCM7362, "Broadcom BCM7362"), BCM7XXX_40NM_EPHY(PHY_ID_BCM7425, "Broadcom BCM7425"), diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index daa9234a9baf..949e9af8d9d6 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -45,6 +45,7 @@ #define PHY_ID_BCM7445 0x600d8510 #define PHY_ID_BCM_CYGNUS 0xae025200 +#define PHY_ID_BCM_OMEGA 0xae025100 #define PHY_BCM_OUI_MASK 0xfffffc00 #define PHY_BCM_OUI_1 0x00206000 -- cgit v1.2.3-59-g8ed1b From 212dfd909ea8b630e5d6fa4d25aeec9c4b4b14a5 Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Tue, 7 Aug 2018 22:06:52 +0200 Subject: netfilter: nfnetlink_osf: add missing enum in nfnetlink_osf uapi header xt_osf_window_size_options was originally part of include/uapi/linux/netfilter/xt_osf.h, restore it. Fixes: bfb15f2a95cb ("netfilter: extract Passive OS fingerprint infrastructure from xt_osf") Signed-off-by: Fernando Fernandez Mancera Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nfnetlink_osf.h | 12 ------------ include/uapi/linux/netfilter/nfnetlink_osf.h | 12 ++++++++++++ include/uapi/linux/netfilter/xt_osf.h | 1 + 3 files changed, 13 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/nfnetlink_osf.h b/include/linux/netfilter/nfnetlink_osf.h index a7311bc03d3a..ecf7dab81e9e 100644 --- a/include/linux/netfilter/nfnetlink_osf.h +++ b/include/linux/netfilter/nfnetlink_osf.h @@ -4,18 +4,6 @@ #include -/* Initial window size option state machine: multiple of mss, mtu or - * plain numeric value. Can also be made as plain numeric value which - * is not a multiple of specified value. - */ -enum nf_osf_window_size_options { - OSF_WSS_PLAIN = 0, - OSF_WSS_MSS, - OSF_WSS_MTU, - OSF_WSS_MODULO, - OSF_WSS_MAX, -}; - enum osf_fmatch_states { /* Packet does not match the fingerprint */ FMATCH_WRONG = 0, diff --git a/include/uapi/linux/netfilter/nfnetlink_osf.h b/include/uapi/linux/netfilter/nfnetlink_osf.h index 3b93fbb9fc24..76a3527df5dd 100644 --- a/include/uapi/linux/netfilter/nfnetlink_osf.h +++ b/include/uapi/linux/netfilter/nfnetlink_osf.h @@ -88,6 +88,18 @@ enum iana_options { OSFOPT_EMPTY = 255, }; +/* Initial window size option state machine: multiple of mss, mtu or + * plain numeric value. Can also be made as plain numeric value which + * is not a multiple of specified value. + */ +enum nf_osf_window_size_options { + OSF_WSS_PLAIN = 0, + OSF_WSS_MSS, + OSF_WSS_MTU, + OSF_WSS_MODULO, + OSF_WSS_MAX, +}; + enum nf_osf_attr_type { OSF_ATTR_UNSPEC, OSF_ATTR_FINGER, diff --git a/include/uapi/linux/netfilter/xt_osf.h b/include/uapi/linux/netfilter/xt_osf.h index c56c59605c2b..24102b5286ec 100644 --- a/include/uapi/linux/netfilter/xt_osf.h +++ b/include/uapi/linux/netfilter/xt_osf.h @@ -46,6 +46,7 @@ #define xt_osf_finger nf_osf_finger #define xt_osf_nlmsg nf_osf_nlmsg +#define xt_osf_window_size_options nf_osf_window_size_options #define xt_osf_attr_type nf_osf_attr_type #define xt_osf_msg_types nf_osf_msg_types -- cgit v1.2.3-59-g8ed1b From 342ac8448f1fb213908656ae5581d0f37a5954e8 Mon Sep 17 00:00:00 2001 From: Denis Drozdov Date: Wed, 8 Aug 2018 16:23:48 -0700 Subject: net/mlx5: Use max_num_eqs for calculation of required MSIX vectors New firmware has defined new HCA capability field called "max_num_eqs", that is the number of available EQs after subtracting reserved FW EQs. Before this capability the FW reported the EQ number in "log_max_eqs", the reported value also contained FW reserved EQs, but the driver might be failing to load on 320 cpus systems due to the fact that FW reserved EQs were not available to the driver. Now the driver has to obtain max_num_eqs value from new FW to get real number of EQs available. Signed-off-by: Denis Drozdov Reviewed-by: Alex Vesker Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 4 +++- include/linux/mlx5/mlx5_ifc.h | 5 ++++- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 03b9c6733eed..cf3e4a659052 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -323,7 +323,9 @@ static int mlx5_alloc_irq_vectors(struct mlx5_core_dev *dev) { struct mlx5_priv *priv = &dev->priv; struct mlx5_eq_table *table = &priv->eq_table; - int num_eqs = 1 << MLX5_CAP_GEN(dev, log_max_eq); + int num_eqs = MLX5_CAP_GEN(dev, max_num_eqs) ? + MLX5_CAP_GEN(dev, max_num_eqs) : + 1 << MLX5_CAP_GEN(dev, log_max_eq); int nvec; int err; diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 60c2308fe062..63e1ce4c4826 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1133,7 +1133,10 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 general_obj_types[0x40]; - u8 reserved_at_440[0x40]; + u8 reserved_at_440[0x20]; + + u8 reserved_at_460[0x10]; + u8 max_num_eqs[0x10]; u8 reserved_at_480[0x3]; u8 log_max_l2_table[0x5]; -- cgit v1.2.3-59-g8ed1b From cc9c82a8668cf98748bfbaa83c5c092d5115c25b Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Wed, 8 Aug 2018 16:23:49 -0700 Subject: net/mlx5: Rename modify/query_vport state related enums Modify and query vport state commands share the same admin_state and op_mod values, rename the enums to fit them both. In addition, remove the esw prefix from the admin state enum as this also applied for vnic. Signed-off-by: Eran Ben Elisha Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 12 ++++++------ drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 8 ++++---- drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 10 +++++----- include/linux/mlx5/device.h | 6 +++--- include/linux/mlx5/mlx5_ifc.h | 4 ++-- 6 files changed, 21 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index a2fb21ca5767..2731ba2d8049 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -228,7 +228,7 @@ static void mlx5e_update_carrier(struct mlx5e_priv *priv) u8 port_state; port_state = mlx5_query_vport_state(mdev, - MLX5_QUERY_VPORT_STATE_IN_OP_MOD_VNIC_VPORT, + MLX5_VPORT_STATE_OP_MOD_VNIC_VPORT, 0); if (port_state == VPORT_STATE_UP) { @@ -3916,9 +3916,9 @@ static int mlx5e_set_vf_rate(struct net_device *dev, int vf, int min_tx_rate, static int mlx5_vport_link2ifla(u8 esw_link) { switch (esw_link) { - case MLX5_ESW_VPORT_ADMIN_STATE_DOWN: + case MLX5_VPORT_ADMIN_STATE_DOWN: return IFLA_VF_LINK_STATE_DISABLE; - case MLX5_ESW_VPORT_ADMIN_STATE_UP: + case MLX5_VPORT_ADMIN_STATE_UP: return IFLA_VF_LINK_STATE_ENABLE; } return IFLA_VF_LINK_STATE_AUTO; @@ -3928,11 +3928,11 @@ static int mlx5_ifla_link2vport(u8 ifla_link) { switch (ifla_link) { case IFLA_VF_LINK_STATE_DISABLE: - return MLX5_ESW_VPORT_ADMIN_STATE_DOWN; + return MLX5_VPORT_ADMIN_STATE_DOWN; case IFLA_VF_LINK_STATE_ENABLE: - return MLX5_ESW_VPORT_ADMIN_STATE_UP; + return MLX5_VPORT_ADMIN_STATE_UP; } - return MLX5_ESW_VPORT_ADMIN_STATE_AUTO; + return MLX5_VPORT_ADMIN_STATE_AUTO; } static int mlx5e_set_vf_link_state(struct net_device *dev, int vf, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index 8e3c5b4b90ab..c9cc9747d21d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -698,8 +698,8 @@ static int mlx5e_rep_open(struct net_device *dev) goto unlock; if (!mlx5_modify_vport_admin_state(priv->mdev, - MLX5_QUERY_VPORT_STATE_IN_OP_MOD_ESW_VPORT, - rep->vport, MLX5_ESW_VPORT_ADMIN_STATE_UP)) + MLX5_VPORT_STATE_OP_MOD_ESW_VPORT, + rep->vport, MLX5_VPORT_ADMIN_STATE_UP)) netif_carrier_on(dev); unlock: @@ -716,8 +716,8 @@ static int mlx5e_rep_close(struct net_device *dev) mutex_lock(&priv->state_lock); mlx5_modify_vport_admin_state(priv->mdev, - MLX5_QUERY_VPORT_STATE_IN_OP_MOD_ESW_VPORT, - rep->vport, MLX5_ESW_VPORT_ADMIN_STATE_DOWN); + MLX5_VPORT_STATE_OP_MOD_ESW_VPORT, + rep->vport, MLX5_VPORT_ADMIN_STATE_DOWN); ret = mlx5e_close_locked(dev); mutex_unlock(&priv->state_lock); return ret; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c b/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c index 4d316cc9b008..35ded91203f5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c @@ -74,7 +74,7 @@ static int mlx5e_test_link_state(struct mlx5e_priv *priv) if (!netif_carrier_ok(priv->netdev)) return 1; - port_state = mlx5_query_vport_state(priv->mdev, MLX5_QUERY_VPORT_STATE_IN_OP_MOD_VNIC_VPORT, 0); + port_state = mlx5_query_vport_state(priv->mdev, MLX5_VPORT_STATE_OP_MOD_VNIC_VPORT, 0); return port_state == VPORT_STATE_UP ? 0 : 1; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c index 40dba9e8af92..cd0760a8d45a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c @@ -1469,7 +1469,7 @@ static void esw_apply_vport_conf(struct mlx5_eswitch *esw, return; mlx5_modify_vport_admin_state(esw->dev, - MLX5_QUERY_VPORT_STATE_IN_OP_MOD_ESW_VPORT, + MLX5_VPORT_STATE_OP_MOD_ESW_VPORT, vport_num, vport->info.link_state); mlx5_modify_nic_vport_mac_address(esw->dev, vport_num, vport->info.mac); @@ -1582,9 +1582,9 @@ static void esw_disable_vport(struct mlx5_eswitch *esw, int vport_num) esw_vport_disable_qos(esw, vport_num); if (vport_num && esw->mode == SRIOV_LEGACY) { mlx5_modify_vport_admin_state(esw->dev, - MLX5_QUERY_VPORT_STATE_IN_OP_MOD_ESW_VPORT, + MLX5_VPORT_STATE_OP_MOD_ESW_VPORT, vport_num, - MLX5_ESW_VPORT_ADMIN_STATE_DOWN); + MLX5_VPORT_ADMIN_STATE_DOWN); esw_vport_disable_egress_acl(esw, vport); esw_vport_disable_ingress_acl(esw, vport); esw_vport_destroy_drop_counters(vport); @@ -1736,7 +1736,7 @@ int mlx5_eswitch_init(struct mlx5_core_dev *dev) struct mlx5_vport *vport = &esw->vports[vport_num]; vport->vport = vport_num; - vport->info.link_state = MLX5_ESW_VPORT_ADMIN_STATE_AUTO; + vport->info.link_state = MLX5_VPORT_ADMIN_STATE_AUTO; vport->dev = dev; INIT_WORK(&vport->vport_change_handler, esw_vport_change_handler); @@ -1860,7 +1860,7 @@ int mlx5_eswitch_set_vport_state(struct mlx5_eswitch *esw, evport = &esw->vports[vport]; err = mlx5_modify_vport_admin_state(esw->dev, - MLX5_QUERY_VPORT_STATE_IN_OP_MOD_ESW_VPORT, + MLX5_VPORT_STATE_OP_MOD_ESW_VPORT, vport, link_state); if (err) { mlx5_core_warn(esw->dev, diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index d489494b0a84..11fa4e66afc5 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -946,9 +946,9 @@ enum { }; enum { - MLX5_ESW_VPORT_ADMIN_STATE_DOWN = 0x0, - MLX5_ESW_VPORT_ADMIN_STATE_UP = 0x1, - MLX5_ESW_VPORT_ADMIN_STATE_AUTO = 0x2, + MLX5_VPORT_ADMIN_STATE_DOWN = 0x0, + MLX5_VPORT_ADMIN_STATE_UP = 0x1, + MLX5_VPORT_ADMIN_STATE_AUTO = 0x2, }; enum { diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 63e1ce4c4826..6ead9c1a5396 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -3764,8 +3764,8 @@ struct mlx5_ifc_query_vport_state_out_bits { }; enum { - MLX5_QUERY_VPORT_STATE_IN_OP_MOD_VNIC_VPORT = 0x0, - MLX5_QUERY_VPORT_STATE_IN_OP_MOD_ESW_VPORT = 0x1, + MLX5_VPORT_STATE_OP_MOD_VNIC_VPORT = 0x0, + MLX5_VPORT_STATE_OP_MOD_ESW_VPORT = 0x1, }; struct mlx5_ifc_query_vport_state_in_bits { -- cgit v1.2.3-59-g8ed1b From 29d8ebd44de8999e292dbb4de76744d4a41039ec Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Wed, 8 Aug 2018 16:23:51 -0700 Subject: net/mlx5: Remove unused mlx5_query_vport_admin_state mlx5_query_vport_admin_state() is not used anywhere. Remove it. Signed-off-by: Eli Cohen Reviewed-by: Or Gerlitz Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/vport.c | 10 ---------- include/linux/mlx5/vport.h | 2 -- 2 files changed, 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c index 7eecd5b07bb1..fa6fa171b94c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c @@ -64,16 +64,6 @@ u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport) } EXPORT_SYMBOL_GPL(mlx5_query_vport_state); -u8 mlx5_query_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport) -{ - u32 out[MLX5_ST_SZ_DW(query_vport_state_out)] = {0}; - - _mlx5_query_vport_state(mdev, opmod, vport, out, sizeof(out)); - - return MLX5_GET(query_vport_state_out, out, admin_state); -} -EXPORT_SYMBOL_GPL(mlx5_query_vport_admin_state); - int mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport, u8 state) { diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index 9208cb8809ac..7e7c6dfcfb09 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -43,8 +43,6 @@ enum { }; u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport); -u8 mlx5_query_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod, - u16 vport); int mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport, u8 state); int mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev, -- cgit v1.2.3-59-g8ed1b From 209b43759d65b2cc99ce7757249aacc82b03c4e2 Mon Sep 17 00:00:00 2001 From: Michael Büsch Date: Tue, 31 Jul 2018 22:15:09 +0200 Subject: ssb: Remove SSB_WARN_ON, SSB_BUG_ON and SSB_DEBUG Use the standard WARN_ON instead. If a small kernel is desired, WARN_ON can be disabled globally. Also remove SSB_DEBUG. Besides WARN_ON it only adds a tiny debug check. Include this check unconditionally. Signed-off-by: Michael Buesch Signed-off-by: Kalle Valo --- arch/mips/configs/bcm47xx_defconfig | 1 - arch/powerpc/configs/wii_defconfig | 1 - drivers/ssb/Kconfig | 9 --------- drivers/ssb/driver_chipcommon.c | 8 ++++---- drivers/ssb/driver_chipcommon_pmu.c | 10 +++++----- drivers/ssb/driver_gpio.c | 4 ++-- drivers/ssb/driver_pcicore.c | 6 +++--- drivers/ssb/embedded.c | 10 +++++----- drivers/ssb/host_soc.c | 12 ++++++------ drivers/ssb/main.c | 38 +++++++++++++++++-------------------- drivers/ssb/pci.c | 19 ++++++------------- drivers/ssb/pcmcia.c | 14 +++++++------- drivers/ssb/scan.c | 4 ++-- drivers/ssb/sdio.c | 12 ++++++------ drivers/ssb/ssb_private.h | 9 --------- include/linux/ssb/ssb.h | 2 -- 16 files changed, 63 insertions(+), 96 deletions(-) (limited to 'include/linux') diff --git a/arch/mips/configs/bcm47xx_defconfig b/arch/mips/configs/bcm47xx_defconfig index fad8e964f14c..ba800a892384 100644 --- a/arch/mips/configs/bcm47xx_defconfig +++ b/arch/mips/configs/bcm47xx_defconfig @@ -66,7 +66,6 @@ CONFIG_HW_RANDOM=y CONFIG_GPIO_SYSFS=y CONFIG_WATCHDOG=y CONFIG_BCM47XX_WDT=y -CONFIG_SSB_DEBUG=y CONFIG_SSB_DRIVER_GIGE=y CONFIG_BCMA_DRIVER_GMAC_CMN=y CONFIG_USB=y diff --git a/arch/powerpc/configs/wii_defconfig b/arch/powerpc/configs/wii_defconfig index 10940533da71..f5c366b02828 100644 --- a/arch/powerpc/configs/wii_defconfig +++ b/arch/powerpc/configs/wii_defconfig @@ -78,7 +78,6 @@ CONFIG_GPIO_HLWD=y CONFIG_POWER_RESET=y CONFIG_POWER_RESET_GPIO=y # CONFIG_HWMON is not set -CONFIG_SSB_DEBUG=y CONFIG_FB=y # CONFIG_VGA_CONSOLE is not set CONFIG_FRAMEBUFFER_CONSOLE=y diff --git a/drivers/ssb/Kconfig b/drivers/ssb/Kconfig index 6c438c819eb9..df30e1323252 100644 --- a/drivers/ssb/Kconfig +++ b/drivers/ssb/Kconfig @@ -89,15 +89,6 @@ config SSB_HOST_SOC If unsure, say N -config SSB_DEBUG - bool "SSB debugging" - depends on SSB - help - This turns on additional runtime checks and debugging - messages. Turn this on for SSB troubleshooting. - - If unsure, say N - config SSB_SERIAL bool depends on SSB diff --git a/drivers/ssb/driver_chipcommon.c b/drivers/ssb/driver_chipcommon.c index 48050c6fd847..99a4656d113d 100644 --- a/drivers/ssb/driver_chipcommon.c +++ b/drivers/ssb/driver_chipcommon.c @@ -56,7 +56,7 @@ void ssb_chipco_set_clockmode(struct ssb_chipcommon *cc, if (cc->capabilities & SSB_CHIPCO_CAP_PMU) return; /* PMU controls clockmode, separated function needed */ - SSB_WARN_ON(ccdev->id.revision >= 20); + WARN_ON(ccdev->id.revision >= 20); /* chipcommon cores prior to rev6 don't support dynamic clock control */ if (ccdev->id.revision < 6) @@ -111,7 +111,7 @@ void ssb_chipco_set_clockmode(struct ssb_chipcommon *cc, } break; default: - SSB_WARN_ON(1); + WARN_ON(1); } } @@ -164,7 +164,7 @@ static int chipco_pctl_clockfreqlimit(struct ssb_chipcommon *cc, int get_max) divisor = 32; break; default: - SSB_WARN_ON(1); + WARN_ON(1); } } else if (cc->dev->id.revision < 10) { switch (clocksrc) { @@ -277,7 +277,7 @@ static void calc_fast_powerup_delay(struct ssb_chipcommon *cc) minfreq = chipco_pctl_clockfreqlimit(cc, 0); pll_on_delay = chipco_read32(cc, SSB_CHIPCO_PLLONDELAY); tmp = (((pll_on_delay + 2) * 1000000) + (minfreq - 1)) / minfreq; - SSB_WARN_ON(tmp & ~0xFFFF); + WARN_ON(tmp & ~0xFFFF); cc->fast_pwrup_delay = tmp; } diff --git a/drivers/ssb/driver_chipcommon_pmu.c b/drivers/ssb/driver_chipcommon_pmu.c index e28682a53cdf..0f60e90ded26 100644 --- a/drivers/ssb/driver_chipcommon_pmu.c +++ b/drivers/ssb/driver_chipcommon_pmu.c @@ -128,7 +128,7 @@ static void ssb_pmu0_pllinit_r0(struct ssb_chipcommon *cc, ~(1 << SSB_PMURES_5354_BB_PLL_PU)); break; default: - SSB_WARN_ON(1); + WARN_ON(1); } for (i = 1500; i; i--) { tmp = chipco_read32(cc, SSB_CHIPCO_CLKCTLST); @@ -265,7 +265,7 @@ static void ssb_pmu1_pllinit_r0(struct ssb_chipcommon *cc, buffer_strength = 0x222222; break; default: - SSB_WARN_ON(1); + WARN_ON(1); } for (i = 1500; i; i--) { tmp = chipco_read32(cc, SSB_CHIPCO_CLKCTLST); @@ -501,7 +501,7 @@ static void ssb_pmu_resources_init(struct ssb_chipcommon *cc) ~(depend_tab[i].depend)); break; default: - SSB_WARN_ON(1); + WARN_ON(1); } } } @@ -568,12 +568,12 @@ void ssb_pmu_set_ldo_voltage(struct ssb_chipcommon *cc, mask = 0x3F; break; default: - SSB_WARN_ON(1); + WARN_ON(1); return; } break; case 0x4312: - if (SSB_WARN_ON(id != LDO_PAREF)) + if (WARN_ON(id != LDO_PAREF)) return; addr = 0; shift = 21; diff --git a/drivers/ssb/driver_gpio.c b/drivers/ssb/driver_gpio.c index 6ce4abf7d473..e809dae4c470 100644 --- a/drivers/ssb/driver_gpio.c +++ b/drivers/ssb/driver_gpio.c @@ -461,7 +461,7 @@ int ssb_gpio_init(struct ssb_bus *bus) else if (ssb_extif_available(&bus->extif)) return ssb_gpio_extif_init(bus); else - SSB_WARN_ON(1); + WARN_ON(1); return -1; } @@ -473,7 +473,7 @@ int ssb_gpio_unregister(struct ssb_bus *bus) gpiochip_remove(&bus->gpio); return 0; } else { - SSB_WARN_ON(1); + WARN_ON(1); } return -1; diff --git a/drivers/ssb/driver_pcicore.c b/drivers/ssb/driver_pcicore.c index ae80b3171523..6a5622e0ded5 100644 --- a/drivers/ssb/driver_pcicore.c +++ b/drivers/ssb/driver_pcicore.c @@ -115,7 +115,7 @@ static int ssb_extpci_read_config(struct ssb_pcicore *pc, u32 addr, val; void __iomem *mmio; - SSB_WARN_ON(!pc->hostmode); + WARN_ON(!pc->hostmode); if (unlikely(len != 1 && len != 2 && len != 4)) goto out; addr = get_cfgspace_addr(pc, bus, dev, func, off); @@ -161,7 +161,7 @@ static int ssb_extpci_write_config(struct ssb_pcicore *pc, u32 addr, val = 0; void __iomem *mmio; - SSB_WARN_ON(!pc->hostmode); + WARN_ON(!pc->hostmode); if (unlikely(len != 1 && len != 2 && len != 4)) goto out; addr = get_cfgspace_addr(pc, bus, dev, func, off); @@ -702,7 +702,7 @@ int ssb_pcicore_dev_irqvecs_enable(struct ssb_pcicore *pc, /* Calculate the "coremask" for the device. */ coremask = (1 << dev->core_index); - SSB_WARN_ON(bus->bustype != SSB_BUSTYPE_PCI); + WARN_ON(bus->bustype != SSB_BUSTYPE_PCI); err = pci_read_config_dword(bus->host_pci, SSB_PCI_IRQMASK, &tmp); if (err) goto out; diff --git a/drivers/ssb/embedded.c b/drivers/ssb/embedded.c index c39ddf8988c1..8254ed25e063 100644 --- a/drivers/ssb/embedded.c +++ b/drivers/ssb/embedded.c @@ -77,7 +77,7 @@ u32 ssb_gpio_in(struct ssb_bus *bus, u32 mask) else if (ssb_extif_available(&bus->extif)) res = ssb_extif_gpio_in(&bus->extif, mask); else - SSB_WARN_ON(1); + WARN_ON(1); spin_unlock_irqrestore(&bus->gpio_lock, flags); return res; @@ -95,7 +95,7 @@ u32 ssb_gpio_out(struct ssb_bus *bus, u32 mask, u32 value) else if (ssb_extif_available(&bus->extif)) res = ssb_extif_gpio_out(&bus->extif, mask, value); else - SSB_WARN_ON(1); + WARN_ON(1); spin_unlock_irqrestore(&bus->gpio_lock, flags); return res; @@ -113,7 +113,7 @@ u32 ssb_gpio_outen(struct ssb_bus *bus, u32 mask, u32 value) else if (ssb_extif_available(&bus->extif)) res = ssb_extif_gpio_outen(&bus->extif, mask, value); else - SSB_WARN_ON(1); + WARN_ON(1); spin_unlock_irqrestore(&bus->gpio_lock, flags); return res; @@ -145,7 +145,7 @@ u32 ssb_gpio_intmask(struct ssb_bus *bus, u32 mask, u32 value) else if (ssb_extif_available(&bus->extif)) res = ssb_extif_gpio_intmask(&bus->extif, mask, value); else - SSB_WARN_ON(1); + WARN_ON(1); spin_unlock_irqrestore(&bus->gpio_lock, flags); return res; @@ -163,7 +163,7 @@ u32 ssb_gpio_polarity(struct ssb_bus *bus, u32 mask, u32 value) else if (ssb_extif_available(&bus->extif)) res = ssb_extif_gpio_polarity(&bus->extif, mask, value); else - SSB_WARN_ON(1); + WARN_ON(1); spin_unlock_irqrestore(&bus->gpio_lock, flags); return res; diff --git a/drivers/ssb/host_soc.c b/drivers/ssb/host_soc.c index eadaedf466f6..3b438480515c 100644 --- a/drivers/ssb/host_soc.c +++ b/drivers/ssb/host_soc.c @@ -61,7 +61,7 @@ static void ssb_host_soc_block_read(struct ssb_device *dev, void *buffer, case sizeof(u16): { __le16 *buf = buffer; - SSB_WARN_ON(count & 1); + WARN_ON(count & 1); while (count) { *buf = (__force __le16)__raw_readw(addr); buf++; @@ -72,7 +72,7 @@ static void ssb_host_soc_block_read(struct ssb_device *dev, void *buffer, case sizeof(u32): { __le32 *buf = buffer; - SSB_WARN_ON(count & 3); + WARN_ON(count & 3); while (count) { *buf = (__force __le32)__raw_readl(addr); buf++; @@ -81,7 +81,7 @@ static void ssb_host_soc_block_read(struct ssb_device *dev, void *buffer, break; } default: - SSB_WARN_ON(1); + WARN_ON(1); } } #endif /* CONFIG_SSB_BLOCKIO */ @@ -134,7 +134,7 @@ static void ssb_host_soc_block_write(struct ssb_device *dev, const void *buffer, case sizeof(u16): { const __le16 *buf = buffer; - SSB_WARN_ON(count & 1); + WARN_ON(count & 1); while (count) { __raw_writew((__force u16)(*buf), addr); buf++; @@ -145,7 +145,7 @@ static void ssb_host_soc_block_write(struct ssb_device *dev, const void *buffer, case sizeof(u32): { const __le32 *buf = buffer; - SSB_WARN_ON(count & 3); + WARN_ON(count & 3); while (count) { __raw_writel((__force u32)(*buf), addr); buf++; @@ -154,7 +154,7 @@ static void ssb_host_soc_block_write(struct ssb_device *dev, const void *buffer, break; } default: - SSB_WARN_ON(1); + WARN_ON(1); } } #endif /* CONFIG_SSB_BLOCKIO */ diff --git a/drivers/ssb/main.c b/drivers/ssb/main.c index 9da56d2fdbd3..0a26984acb2c 100644 --- a/drivers/ssb/main.c +++ b/drivers/ssb/main.c @@ -209,7 +209,7 @@ int ssb_devices_freeze(struct ssb_bus *bus, struct ssb_freeze_context *ctx) memset(ctx, 0, sizeof(*ctx)); ctx->bus = bus; - SSB_WARN_ON(bus->nr_devices > ARRAY_SIZE(ctx->device_frozen)); + WARN_ON(bus->nr_devices > ARRAY_SIZE(ctx->device_frozen)); for (i = 0; i < bus->nr_devices; i++) { sdev = ssb_device_get(&bus->devices[i]); @@ -220,7 +220,7 @@ int ssb_devices_freeze(struct ssb_bus *bus, struct ssb_freeze_context *ctx) continue; } sdrv = drv_to_ssb_drv(sdev->dev->driver); - if (SSB_WARN_ON(!sdrv->remove)) + if (WARN_ON(!sdrv->remove)) continue; sdrv->remove(sdev); ctx->device_frozen[i] = 1; @@ -248,10 +248,10 @@ int ssb_devices_thaw(struct ssb_freeze_context *ctx) continue; sdev = &bus->devices[i]; - if (SSB_WARN_ON(!sdev->dev || !sdev->dev->driver)) + if (WARN_ON(!sdev->dev || !sdev->dev->driver)) continue; sdrv = drv_to_ssb_drv(sdev->dev->driver); - if (SSB_WARN_ON(!sdrv || !sdrv->probe)) + if (WARN_ON(!sdrv || !sdrv->probe)) continue; err = sdrv->probe(sdev, &sdev->id); @@ -861,13 +861,13 @@ u32 ssb_calc_clock_rate(u32 plltype, u32 n, u32 m) case SSB_PLLTYPE_2: /* 48Mhz, 4 dividers */ n1 += SSB_CHIPCO_CLK_T2_BIAS; n2 += SSB_CHIPCO_CLK_T2_BIAS; - SSB_WARN_ON(!((n1 >= 2) && (n1 <= 7))); - SSB_WARN_ON(!((n2 >= 5) && (n2 <= 23))); + WARN_ON(!((n1 >= 2) && (n1 <= 7))); + WARN_ON(!((n2 >= 5) && (n2 <= 23))); break; case SSB_PLLTYPE_5: /* 25Mhz, 4 dividers */ return 100000000; default: - SSB_WARN_ON(1); + WARN_ON(1); } switch (plltype) { @@ -916,9 +916,9 @@ u32 ssb_calc_clock_rate(u32 plltype, u32 n, u32 m) m1 += SSB_CHIPCO_CLK_T2_BIAS; m2 += SSB_CHIPCO_CLK_T2M2_BIAS; m3 += SSB_CHIPCO_CLK_T2_BIAS; - SSB_WARN_ON(!((m1 >= 2) && (m1 <= 7))); - SSB_WARN_ON(!((m2 >= 3) && (m2 <= 10))); - SSB_WARN_ON(!((m3 >= 2) && (m3 <= 7))); + WARN_ON(!((m1 >= 2) && (m1 <= 7))); + WARN_ON(!((m2 >= 3) && (m2 <= 10))); + WARN_ON(!((m3 >= 2) && (m3 <= 7))); if (!(mc & SSB_CHIPCO_CLK_T2MC_M1BYP)) clock /= m1; @@ -928,7 +928,7 @@ u32 ssb_calc_clock_rate(u32 plltype, u32 n, u32 m) clock /= m3; return clock; default: - SSB_WARN_ON(1); + WARN_ON(1); } return 0; } @@ -1169,9 +1169,7 @@ int ssb_bus_may_powerdown(struct ssb_bus *bus) if (err) goto error; out: -#ifdef CONFIG_SSB_DEBUG bus->powered_up = 0; -#endif return err; error: pr_err("Bus powerdown failed\n"); @@ -1188,9 +1186,7 @@ int ssb_bus_powerup(struct ssb_bus *bus, bool dynamic_pctl) if (err) goto error; -#ifdef CONFIG_SSB_DEBUG bus->powered_up = 1; -#endif mode = dynamic_pctl ? SSB_CLKMODE_DYNAMIC : SSB_CLKMODE_FAST; ssb_chipco_set_clockmode(&bus->chipco, mode); @@ -1242,15 +1238,15 @@ u32 ssb_admatch_base(u32 adm) base = (adm & SSB_ADM_BASE0); break; case SSB_ADM_TYPE1: - SSB_WARN_ON(adm & SSB_ADM_NEG); /* unsupported */ + WARN_ON(adm & SSB_ADM_NEG); /* unsupported */ base = (adm & SSB_ADM_BASE1); break; case SSB_ADM_TYPE2: - SSB_WARN_ON(adm & SSB_ADM_NEG); /* unsupported */ + WARN_ON(adm & SSB_ADM_NEG); /* unsupported */ base = (adm & SSB_ADM_BASE2); break; default: - SSB_WARN_ON(1); + WARN_ON(1); } return base; @@ -1266,15 +1262,15 @@ u32 ssb_admatch_size(u32 adm) size = ((adm & SSB_ADM_SZ0) >> SSB_ADM_SZ0_SHIFT); break; case SSB_ADM_TYPE1: - SSB_WARN_ON(adm & SSB_ADM_NEG); /* unsupported */ + WARN_ON(adm & SSB_ADM_NEG); /* unsupported */ size = ((adm & SSB_ADM_SZ1) >> SSB_ADM_SZ1_SHIFT); break; case SSB_ADM_TYPE2: - SSB_WARN_ON(adm & SSB_ADM_NEG); /* unsupported */ + WARN_ON(adm & SSB_ADM_NEG); /* unsupported */ size = ((adm & SSB_ADM_SZ2) >> SSB_ADM_SZ2_SHIFT); break; default: - SSB_WARN_ON(1); + WARN_ON(1); } size = (1 << (size + 1)); diff --git a/drivers/ssb/pci.c b/drivers/ssb/pci.c index ad4308529eba..84807a9b4b13 100644 --- a/drivers/ssb/pci.c +++ b/drivers/ssb/pci.c @@ -946,7 +946,6 @@ out: return err; } -#ifdef CONFIG_SSB_DEBUG static int ssb_pci_assert_buspower(struct ssb_bus *bus) { if (likely(bus->powered_up)) @@ -960,12 +959,6 @@ static int ssb_pci_assert_buspower(struct ssb_bus *bus) return -ENODEV; } -#else /* DEBUG */ -static inline int ssb_pci_assert_buspower(struct ssb_bus *bus) -{ - return 0; -} -#endif /* DEBUG */ static u8 ssb_pci_read8(struct ssb_device *dev, u16 offset) { @@ -1024,15 +1017,15 @@ static void ssb_pci_block_read(struct ssb_device *dev, void *buffer, ioread8_rep(addr, buffer, count); break; case sizeof(u16): - SSB_WARN_ON(count & 1); + WARN_ON(count & 1); ioread16_rep(addr, buffer, count >> 1); break; case sizeof(u32): - SSB_WARN_ON(count & 3); + WARN_ON(count & 3); ioread32_rep(addr, buffer, count >> 2); break; default: - SSB_WARN_ON(1); + WARN_ON(1); } return; @@ -1098,15 +1091,15 @@ static void ssb_pci_block_write(struct ssb_device *dev, const void *buffer, iowrite8_rep(addr, buffer, count); break; case sizeof(u16): - SSB_WARN_ON(count & 1); + WARN_ON(count & 1); iowrite16_rep(addr, buffer, count >> 1); break; case sizeof(u32): - SSB_WARN_ON(count & 3); + WARN_ON(count & 3); iowrite32_rep(addr, buffer, count >> 2); break; default: - SSB_WARN_ON(1); + WARN_ON(1); } } #endif /* CONFIG_SSB_BLOCKIO */ diff --git a/drivers/ssb/pcmcia.c b/drivers/ssb/pcmcia.c index 20f63cc88e70..567013f8a8be 100644 --- a/drivers/ssb/pcmcia.c +++ b/drivers/ssb/pcmcia.c @@ -169,7 +169,7 @@ int ssb_pcmcia_switch_segment(struct ssb_bus *bus, u8 seg) int err; u8 val; - SSB_WARN_ON((seg != 0) && (seg != 1)); + WARN_ON((seg != 0) && (seg != 1)); while (1) { err = ssb_pcmcia_cfg_write(bus, SSB_PCMCIA_MEMSEG, seg); if (err) @@ -299,7 +299,7 @@ static void ssb_pcmcia_block_read(struct ssb_device *dev, void *buffer, case sizeof(u16): { __le16 *buf = buffer; - SSB_WARN_ON(count & 1); + WARN_ON(count & 1); while (count) { *buf = (__force __le16)__raw_readw(addr); buf++; @@ -310,7 +310,7 @@ static void ssb_pcmcia_block_read(struct ssb_device *dev, void *buffer, case sizeof(u32): { __le16 *buf = buffer; - SSB_WARN_ON(count & 3); + WARN_ON(count & 3); while (count) { *buf = (__force __le16)__raw_readw(addr); buf++; @@ -321,7 +321,7 @@ static void ssb_pcmcia_block_read(struct ssb_device *dev, void *buffer, break; } default: - SSB_WARN_ON(1); + WARN_ON(1); } unlock: spin_unlock_irqrestore(&bus->bar_lock, flags); @@ -399,7 +399,7 @@ static void ssb_pcmcia_block_write(struct ssb_device *dev, const void *buffer, case sizeof(u16): { const __le16 *buf = buffer; - SSB_WARN_ON(count & 1); + WARN_ON(count & 1); while (count) { __raw_writew((__force u16)(*buf), addr); buf++; @@ -410,7 +410,7 @@ static void ssb_pcmcia_block_write(struct ssb_device *dev, const void *buffer, case sizeof(u32): { const __le16 *buf = buffer; - SSB_WARN_ON(count & 3); + WARN_ON(count & 3); while (count) { __raw_writew((__force u16)(*buf), addr); buf++; @@ -421,7 +421,7 @@ static void ssb_pcmcia_block_write(struct ssb_device *dev, const void *buffer, break; } default: - SSB_WARN_ON(1); + WARN_ON(1); } unlock: mmiowb(); diff --git a/drivers/ssb/scan.c b/drivers/ssb/scan.c index 71fbb4b3eb6a..6ceee98ed6ff 100644 --- a/drivers/ssb/scan.c +++ b/drivers/ssb/scan.c @@ -210,7 +210,7 @@ void ssb_iounmap(struct ssb_bus *bus) #ifdef CONFIG_SSB_PCIHOST pci_iounmap(bus->host_pci, bus->mmio); #else - SSB_BUG_ON(1); /* Can't reach this code. */ + WARN_ON(1); /* Can't reach this code. */ #endif break; case SSB_BUSTYPE_SDIO: @@ -236,7 +236,7 @@ static void __iomem *ssb_ioremap(struct ssb_bus *bus, #ifdef CONFIG_SSB_PCIHOST mmio = pci_iomap(bus->host_pci, 0, ~0UL); #else - SSB_BUG_ON(1); /* Can't reach this code. */ + WARN_ON(1); /* Can't reach this code. */ #endif break; case SSB_BUSTYPE_SDIO: diff --git a/drivers/ssb/sdio.c b/drivers/ssb/sdio.c index 1aedc5fbb62f..7fe0afb42234 100644 --- a/drivers/ssb/sdio.c +++ b/drivers/ssb/sdio.c @@ -316,18 +316,18 @@ static void ssb_sdio_block_read(struct ssb_device *dev, void *buffer, break; } case sizeof(u16): { - SSB_WARN_ON(count & 1); + WARN_ON(count & 1); error = sdio_readsb(bus->host_sdio, buffer, offset, count); break; } case sizeof(u32): { - SSB_WARN_ON(count & 3); + WARN_ON(count & 3); offset |= SBSDIO_SB_ACCESS_2_4B_FLAG; /* 32 bit data access */ error = sdio_readsb(bus->host_sdio, buffer, offset, count); break; } default: - SSB_WARN_ON(1); + WARN_ON(1); } if (!error) goto out; @@ -423,18 +423,18 @@ static void ssb_sdio_block_write(struct ssb_device *dev, const void *buffer, (void *)buffer, count); break; case sizeof(u16): - SSB_WARN_ON(count & 1); + WARN_ON(count & 1); error = sdio_writesb(bus->host_sdio, offset, (void *)buffer, count); break; case sizeof(u32): - SSB_WARN_ON(count & 3); + WARN_ON(count & 3); offset |= SBSDIO_SB_ACCESS_2_4B_FLAG; /* 32 bit data access */ error = sdio_writesb(bus->host_sdio, offset, (void *)buffer, count); break; default: - SSB_WARN_ON(1); + WARN_ON(1); } if (!error) goto out; diff --git a/drivers/ssb/ssb_private.h b/drivers/ssb/ssb_private.h index 885e278c4487..5f31bdfbe77f 100644 --- a/drivers/ssb/ssb_private.h +++ b/drivers/ssb/ssb_private.h @@ -9,15 +9,6 @@ #include #include -#ifdef CONFIG_SSB_DEBUG -# define SSB_WARN_ON(x) WARN_ON(x) -# define SSB_BUG_ON(x) BUG_ON(x) -#else -static inline int __ssb_do_nothing(int x) { return x; } -# define SSB_WARN_ON(x) __ssb_do_nothing(unlikely(!!(x))) -# define SSB_BUG_ON(x) __ssb_do_nothing(unlikely(!!(x))) -#endif - /* pci.c */ #ifdef CONFIG_SSB_PCIHOST diff --git a/include/linux/ssb/ssb.h b/include/linux/ssb/ssb.h index 3b43655cabe6..0d5a2691e7e9 100644 --- a/include/linux/ssb/ssb.h +++ b/include/linux/ssb/ssb.h @@ -499,11 +499,9 @@ struct ssb_bus { /* Internal-only stuff follows. Do not touch. */ struct list_head list; -#ifdef CONFIG_SSB_DEBUG /* Is the bus already powered up? */ bool powered_up; int power_warn_count; -#endif /* DEBUG */ }; enum ssb_quirks { -- cgit v1.2.3-59-g8ed1b From 624c0f0239f04cce78c86afa95eb2841c84fbab1 Mon Sep 17 00:00:00 2001 From: Russell King Date: Thu, 9 Aug 2018 15:38:38 +0200 Subject: phylink: add helper for configuring 2500BaseX modes Add a helper for MAC drivers to use in their validate callback to deal with 2500BaseX vs 1000BaseX modes, where the hardware supports both but it is not possible to automatically select between them. This helper defaults to 1000BaseX, as that is the 802.3 standard, and will allow users to select 2500BaseX either by forcing the speed if AN is disabled, or by changing the advertising mask if AN is enabled. Disabling AN is not recommended as it is only the speed that we're interested in controlling, not the duplex or pause mode parameters. Signed-off-by: Russell King Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/phylink.c | 30 ++++++++++++++++++++++++++++++ include/linux/phylink.h | 1 + 2 files changed, 31 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index af4dc4425be2..3ba5cf2a8a5f 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -1688,4 +1688,34 @@ static const struct sfp_upstream_ops sfp_phylink_ops = { .disconnect_phy = phylink_sfp_disconnect_phy, }; +/* Helpers for MAC drivers */ + +/** + * phylink_helper_basex_speed() - 1000BaseX/2500BaseX helper + * @state: a pointer to a &struct phylink_link_state + * + * Inspect the interface mode, advertising mask or forced speed and + * decide whether to run at 2.5Gbit or 1Gbit appropriately, switching + * the interface mode to suit. @state->interface is appropriately + * updated, and the advertising mask has the "other" baseX_Full flag + * cleared. + */ +void phylink_helper_basex_speed(struct phylink_link_state *state) +{ + if (phy_interface_mode_is_8023z(state->interface)) { + bool want_2500 = state->an_enabled ? + phylink_test(state->advertising, 2500baseX_Full) : + state->speed == SPEED_2500; + + if (want_2500) { + phylink_clear(state->advertising, 1000baseX_Full); + state->interface = PHY_INTERFACE_MODE_2500BASEX; + } else { + phylink_clear(state->advertising, 2500baseX_Full); + state->interface = PHY_INTERFACE_MODE_1000BASEX; + } + } +} +EXPORT_SYMBOL_GPL(phylink_helper_basex_speed); + MODULE_LICENSE("GPL"); diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 50eeae025f1e..021fc6595856 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -234,5 +234,6 @@ int phylink_mii_ioctl(struct phylink *, struct ifreq *, int); #define phylink_test(bm, mode) __phylink_do_bit(test_bit, bm, mode) void phylink_set_port_modes(unsigned long *bits); +void phylink_helper_basex_speed(struct phylink_link_state *state); #endif -- cgit v1.2.3-59-g8ed1b From 48ae5554a076c1bca31448d60263e4038def9f6f Mon Sep 17 00:00:00 2001 From: Jose Abreu Date: Wed, 8 Aug 2018 09:04:29 +0100 Subject: net: stmmac: Add XGMAC 2.10 HWIF entry Add a new entry to HWIF table for XGMAC 2.10. For now we fill it with empty callbacks which will be added in posterior patches. Signed-off-by: Jose Abreu Cc: David S. Miller Cc: Joao Pinto Cc: Giuseppe Cavallaro Cc: Alexandre Torgue Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/common.h | 14 +++++++------ drivers/net/ethernet/stmicro/stmmac/hwif.c | 31 ++++++++++++++++++++++++++-- include/linux/stmmac.h | 1 + 3 files changed, 38 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h index 78fd0f8b8e81..3fb81acbd274 100644 --- a/drivers/net/ethernet/stmicro/stmmac/common.h +++ b/drivers/net/ethernet/stmicro/stmmac/common.h @@ -36,12 +36,14 @@ #include "mmc.h" /* Synopsys Core versions */ -#define DWMAC_CORE_3_40 0x34 -#define DWMAC_CORE_3_50 0x35 -#define DWMAC_CORE_4_00 0x40 -#define DWMAC_CORE_4_10 0x41 -#define DWMAC_CORE_5_00 0x50 -#define DWMAC_CORE_5_10 0x51 +#define DWMAC_CORE_3_40 0x34 +#define DWMAC_CORE_3_50 0x35 +#define DWMAC_CORE_4_00 0x40 +#define DWMAC_CORE_4_10 0x41 +#define DWMAC_CORE_5_00 0x50 +#define DWMAC_CORE_5_10 0x51 +#define DWXGMAC_CORE_2_10 0x21 + #define STMMAC_CHAN0 0 /* Always supported and default for all chips */ /* These need to be power of two, and >= 4 */ diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.c b/drivers/net/ethernet/stmicro/stmmac/hwif.c index 1f50e83cafb2..24f5ff175aa4 100644 --- a/drivers/net/ethernet/stmicro/stmmac/hwif.c +++ b/drivers/net/ethernet/stmicro/stmmac/hwif.c @@ -72,6 +72,7 @@ static int stmmac_dwmac4_quirks(struct stmmac_priv *priv) static const struct stmmac_hwif_entry { bool gmac; bool gmac4; + bool xgmac; u32 min_id; const struct stmmac_regs_off regs; const void *desc; @@ -87,6 +88,7 @@ static const struct stmmac_hwif_entry { { .gmac = false, .gmac4 = false, + .xgmac = false, .min_id = 0, .regs = { .ptp_off = PTP_GMAC3_X_OFFSET, @@ -103,6 +105,7 @@ static const struct stmmac_hwif_entry { }, { .gmac = true, .gmac4 = false, + .xgmac = false, .min_id = 0, .regs = { .ptp_off = PTP_GMAC3_X_OFFSET, @@ -119,6 +122,7 @@ static const struct stmmac_hwif_entry { }, { .gmac = false, .gmac4 = true, + .xgmac = false, .min_id = 0, .regs = { .ptp_off = PTP_GMAC4_OFFSET, @@ -135,6 +139,7 @@ static const struct stmmac_hwif_entry { }, { .gmac = false, .gmac4 = true, + .xgmac = false, .min_id = DWMAC_CORE_4_00, .regs = { .ptp_off = PTP_GMAC4_OFFSET, @@ -151,6 +156,7 @@ static const struct stmmac_hwif_entry { }, { .gmac = false, .gmac4 = true, + .xgmac = false, .min_id = DWMAC_CORE_4_10, .regs = { .ptp_off = PTP_GMAC4_OFFSET, @@ -167,6 +173,7 @@ static const struct stmmac_hwif_entry { }, { .gmac = false, .gmac4 = true, + .xgmac = false, .min_id = DWMAC_CORE_5_10, .regs = { .ptp_off = PTP_GMAC4_OFFSET, @@ -180,11 +187,29 @@ static const struct stmmac_hwif_entry { .tc = &dwmac510_tc_ops, .setup = dwmac4_setup, .quirks = NULL, - } + }, { + .gmac = false, + .gmac4 = false, + .xgmac = true, + .min_id = DWXGMAC_CORE_2_10, + .regs = { + .ptp_off = 0, + .mmc_off = 0, + }, + .desc = NULL, + .dma = NULL, + .mac = NULL, + .hwtimestamp = NULL, + .mode = NULL, + .tc = NULL, + .setup = NULL, + .quirks = NULL, + }, }; int stmmac_hwif_init(struct stmmac_priv *priv) { + bool needs_xgmac = priv->plat->has_xgmac; bool needs_gmac4 = priv->plat->has_gmac4; bool needs_gmac = priv->plat->has_gmac; const struct stmmac_hwif_entry *entry; @@ -195,7 +220,7 @@ int stmmac_hwif_init(struct stmmac_priv *priv) if (needs_gmac) { id = stmmac_get_id(priv, GMAC_VERSION); - } else if (needs_gmac4) { + } else if (needs_gmac4 || needs_xgmac) { id = stmmac_get_id(priv, GMAC4_VERSION); } else { id = 0; @@ -229,6 +254,8 @@ int stmmac_hwif_init(struct stmmac_priv *priv) continue; if (needs_gmac4 ^ entry->gmac4) continue; + if (needs_xgmac ^ entry->xgmac) + continue; /* Use synopsys_id var because some setups can override this */ if (priv->synopsys_id < entry->min_id) continue; diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 32feac5bbd75..c43e9a01b892 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -190,5 +190,6 @@ struct plat_stmmacenet_data { bool tso_en; int mac_port_sel_speed; bool en_tx_lpi_clockgating; + int has_xgmac; }; #endif -- cgit v1.2.3-59-g8ed1b From 5e7baf0fcb2a3aef7329f3c7543d4695a46bd321 Mon Sep 17 00:00:00 2001 From: Manish Chopra Date: Thu, 9 Aug 2018 11:13:49 -0700 Subject: qed/qede: Multi CoS support. This patch adds support for tc mqprio offload, using this different traffic classes on the adapter can be utilized based on configured priority to tc map. For example - tc qdisc add dev eth0 root mqprio num_tc 4 map 0 1 2 3 This will cause SKBs with priority 0,1,2,3 to transmit over tc 0,1,2,3 hardware queues respectively. Signed-off-by: Manish Chopra Signed-off-by: Ariel Elior Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_l2.c | 9 +- drivers/net/ethernet/qlogic/qed/qed_main.c | 5 +- drivers/net/ethernet/qlogic/qede/qede.h | 13 +++ drivers/net/ethernet/qlogic/qede/qede_ethtool.c | 48 ++++++-- drivers/net/ethernet/qlogic/qede/qede_fp.c | 29 +++-- drivers/net/ethernet/qlogic/qede/qede_main.c | 139 +++++++++++++++++++----- include/linux/qed/qed_eth_if.h | 6 + 7 files changed, 200 insertions(+), 49 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.c b/drivers/net/ethernet/qlogic/qed/qed_l2.c index 5ede6408649d..82a1bd1f8a8c 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_l2.c +++ b/drivers/net/ethernet/qlogic/qed/qed_l2.c @@ -2188,16 +2188,17 @@ out: static int qed_fill_eth_dev_info(struct qed_dev *cdev, struct qed_dev_eth_info *info) { + struct qed_hwfn *p_hwfn = QED_LEADING_HWFN(cdev); int i; memset(info, 0, sizeof(*info)); - info->num_tc = 1; - if (IS_PF(cdev)) { int max_vf_vlan_filters = 0; int max_vf_mac_filters = 0; + info->num_tc = p_hwfn->hw_info.num_hw_tc; + if (cdev->int_params.out.int_mode == QED_INT_MODE_MSIX) { u16 num_queues = 0; @@ -2248,6 +2249,8 @@ static int qed_fill_eth_dev_info(struct qed_dev *cdev, } else { u16 total_cids = 0; + info->num_tc = 1; + /* Determine queues & XDP support */ for_each_hwfn(cdev, i) { struct qed_hwfn *p_hwfn = &cdev->hwfns[i]; @@ -2554,7 +2557,7 @@ static int qed_start_txq(struct qed_dev *cdev, rc = qed_eth_tx_queue_start(p_hwfn, p_hwfn->hw_info.opaque_fid, - p_params, 0, + p_params, p_params->tc, pbl_addr, pbl_size, ret_params); if (rc) { diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c index dbe81310c0b6..2094d86a7a08 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_main.c +++ b/drivers/net/ethernet/qlogic/qed/qed_main.c @@ -948,13 +948,14 @@ static void qed_update_pf_params(struct qed_dev *cdev, params->eth_pf_params.num_arfs_filters = 0; /* In case we might support RDMA, don't allow qede to be greedy - * with the L2 contexts. Allow for 64 queues [rx, tx, xdp] per hwfn. + * with the L2 contexts. Allow for 64 queues [rx, tx cos, xdp] + * per hwfn. */ if (QED_IS_RDMA_PERSONALITY(QED_LEADING_HWFN(cdev))) { u16 *num_cons; num_cons = ¶ms->eth_pf_params.num_cons; - *num_cons = min_t(u16, *num_cons, 192); + *num_cons = min_t(u16, *num_cons, QED_MAX_L2_CONS); } for (i = 0; i < cdev->num_hwfns; i++) { diff --git a/drivers/net/ethernet/qlogic/qede/qede.h b/drivers/net/ethernet/qlogic/qede/qede.h index d7ed0d3dbf71..e90c60a8fb01 100644 --- a/drivers/net/ethernet/qlogic/qede/qede.h +++ b/drivers/net/ethernet/qlogic/qede/qede.h @@ -386,6 +386,15 @@ struct qede_tx_queue { #define QEDE_TXQ_XDP_TO_IDX(edev, txq) ((txq)->index - \ QEDE_MAX_TSS_CNT(edev)) #define QEDE_TXQ_IDX_TO_XDP(edev, idx) ((idx) + QEDE_MAX_TSS_CNT(edev)) +#define QEDE_NDEV_TXQ_ID_TO_FP_ID(edev, idx) ((edev)->fp_num_rx + \ + ((idx) % QEDE_TSS_COUNT(edev))) +#define QEDE_NDEV_TXQ_ID_TO_TXQ_COS(edev, idx) ((idx) / QEDE_TSS_COUNT(edev)) +#define QEDE_TXQ_TO_NDEV_TXQ_ID(edev, txq) ((QEDE_TSS_COUNT(edev) * \ + (txq)->cos) + (txq)->index) +#define QEDE_NDEV_TXQ_ID_TO_TXQ(edev, idx) \ + (&((edev)->fp_array[QEDE_NDEV_TXQ_ID_TO_FP_ID(edev, idx)].txq \ + [QEDE_NDEV_TXQ_ID_TO_TXQ_COS(edev, idx)])) +#define QEDE_FP_TC0_TXQ(fp) (&((fp)->txq[0])) /* Regular Tx requires skb + metadata for release purpose, * while XDP requires the pages and the mapped address. @@ -399,6 +408,8 @@ struct qede_tx_queue { /* Slowpath; Should be kept in end [unless missing padding] */ void *handle; + u16 cos; + u16 ndev_txq_id; }; #define BD_UNMAP_ADDR(bd) HILO_U64(le32_to_cpu((bd)->addr.hi), \ @@ -541,5 +552,7 @@ void qede_update_rx_prod(struct qede_dev *edev, struct qede_rx_queue *rxq); #define QEDE_RX_HDR_SIZE 256 #define QEDE_MAX_JUMBO_PACKET_SIZE 9600 #define for_each_queue(i) for (i = 0; i < edev->num_queues; i++) +#define for_each_cos_in_txq(edev, var) \ + for ((var) = 0; (var) < (edev)->dev_info.num_tc; (var)++) #endif /* _QEDE_H_ */ diff --git a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c index b37857f3f950..2bd84d6d9b15 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c +++ b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c @@ -222,7 +222,7 @@ static void qede_get_strings_stats_txq(struct qede_dev *edev, QEDE_TXQ_XDP_TO_IDX(edev, txq), qede_tqstats_arr[i].string); else - sprintf(*buf, "%d: %s", txq->index, + sprintf(*buf, "%d_%d: %s", txq->index, txq->cos, qede_tqstats_arr[i].string); *buf += ETH_GSTRING_LEN; } @@ -262,8 +262,13 @@ static void qede_get_strings_stats(struct qede_dev *edev, u8 *buf) if (fp->type & QEDE_FASTPATH_XDP) qede_get_strings_stats_txq(edev, fp->xdp_tx, &buf); - if (fp->type & QEDE_FASTPATH_TX) - qede_get_strings_stats_txq(edev, fp->txq, &buf); + if (fp->type & QEDE_FASTPATH_TX) { + int cos; + + for_each_cos_in_txq(edev, cos) + qede_get_strings_stats_txq(edev, + &fp->txq[cos], &buf); + } } /* Account for non-queue statistics */ @@ -338,8 +343,12 @@ static void qede_get_ethtool_stats(struct net_device *dev, if (fp->type & QEDE_FASTPATH_XDP) qede_get_ethtool_stats_txq(fp->xdp_tx, &buf); - if (fp->type & QEDE_FASTPATH_TX) - qede_get_ethtool_stats_txq(fp->txq, &buf); + if (fp->type & QEDE_FASTPATH_TX) { + int cos; + + for_each_cos_in_txq(edev, cos) + qede_get_ethtool_stats_txq(&fp->txq[cos], &buf); + } } for (i = 0; i < QEDE_NUM_STATS; i++) { @@ -366,7 +375,8 @@ static int qede_get_sset_count(struct net_device *dev, int stringset) num_stats--; /* Account for the Regular Tx statistics */ - num_stats += QEDE_TSS_COUNT(edev) * QEDE_NUM_TQSTATS; + num_stats += QEDE_TSS_COUNT(edev) * QEDE_NUM_TQSTATS * + edev->dev_info.num_tc; /* Account for the Regular Rx statistics */ num_stats += QEDE_RSS_COUNT(edev) * QEDE_NUM_RQSTATS; @@ -741,9 +751,17 @@ static int qede_get_coalesce(struct net_device *dev, } for_each_queue(i) { + struct qede_tx_queue *txq; + fp = &edev->fp_array[i]; + + /* All TX queues of given fastpath uses same + * coalescing value, so no need to iterate over + * all TCs, TC0 txq should suffice. + */ if (fp->type & QEDE_FASTPATH_TX) { - tx_handle = fp->txq->handle; + txq = QEDE_FP_TC0_TXQ(fp); + tx_handle = txq->handle; break; } } @@ -801,9 +819,17 @@ static int qede_set_coalesce(struct net_device *dev, } if (edev->fp_array[i].type & QEDE_FASTPATH_TX) { + struct qede_tx_queue *txq; + + /* All TX queues of given fastpath uses same + * coalescing value, so no need to iterate over + * all TCs, TC0 txq should suffice. + */ + txq = QEDE_FP_TC0_TXQ(fp); + rc = edev->ops->common->set_coalesce(edev->cdev, 0, txc, - fp->txq->handle); + txq->handle); if (rc) { DP_INFO(edev, "Set TX coalesce error, rc = %d\n", rc); @@ -1385,8 +1411,10 @@ static int qede_selftest_transmit_traffic(struct qede_dev *edev, u16 val; for_each_queue(i) { - if (edev->fp_array[i].type & QEDE_FASTPATH_TX) { - txq = edev->fp_array[i].txq; + struct qede_fastpath *fp = &edev->fp_array[i]; + + if (fp->type & QEDE_FASTPATH_TX) { + txq = QEDE_FP_TC0_TXQ(fp); break; } } diff --git a/drivers/net/ethernet/qlogic/qede/qede_fp.c b/drivers/net/ethernet/qlogic/qede/qede_fp.c index 8c9e95ba9917..1a78027de071 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_fp.c +++ b/drivers/net/ethernet/qlogic/qede/qede_fp.c @@ -408,12 +408,12 @@ static void qede_xdp_tx_int(struct qede_dev *edev, struct qede_tx_queue *txq) static int qede_tx_int(struct qede_dev *edev, struct qede_tx_queue *txq) { + unsigned int pkts_compl = 0, bytes_compl = 0; struct netdev_queue *netdev_txq; u16 hw_bd_cons; - unsigned int pkts_compl = 0, bytes_compl = 0; int rc; - netdev_txq = netdev_get_tx_queue(edev->ndev, txq->index); + netdev_txq = netdev_get_tx_queue(edev->ndev, txq->ndev_txq_id); hw_bd_cons = le16_to_cpu(*txq->hw_cons_ptr); barrier(); @@ -1365,9 +1365,14 @@ static bool qede_poll_is_more_work(struct qede_fastpath *fp) if (qede_txq_has_work(fp->xdp_tx)) return true; - if (likely(fp->type & QEDE_FASTPATH_TX)) - if (qede_txq_has_work(fp->txq)) - return true; + if (likely(fp->type & QEDE_FASTPATH_TX)) { + int cos; + + for_each_cos_in_txq(fp->edev, cos) { + if (qede_txq_has_work(&fp->txq[cos])) + return true; + } + } return false; } @@ -1382,8 +1387,14 @@ int qede_poll(struct napi_struct *napi, int budget) struct qede_dev *edev = fp->edev; int rx_work_done = 0; - if (likely(fp->type & QEDE_FASTPATH_TX) && qede_txq_has_work(fp->txq)) - qede_tx_int(edev, fp->txq); + if (likely(fp->type & QEDE_FASTPATH_TX)) { + int cos; + + for_each_cos_in_txq(fp->edev, cos) { + if (qede_txq_has_work(&fp->txq[cos])) + qede_tx_int(edev, &fp->txq[cos]); + } + } if ((fp->type & QEDE_FASTPATH_XDP) && qede_txq_has_work(fp->xdp_tx)) qede_xdp_tx_int(edev, fp->xdp_tx); @@ -1444,8 +1455,8 @@ netdev_tx_t qede_start_xmit(struct sk_buff *skb, struct net_device *ndev) /* Get tx-queue context and netdev index */ txq_index = skb_get_queue_mapping(skb); - WARN_ON(txq_index >= QEDE_TSS_COUNT(edev)); - txq = edev->fp_array[edev->fp_num_rx + txq_index].txq; + WARN_ON(txq_index >= QEDE_TSS_COUNT(edev) * edev->dev_info.num_tc); + txq = QEDE_NDEV_TXQ_ID_TO_TXQ(edev, txq_index); netdev_txq = netdev_get_tx_queue(ndev, txq_index); WARN_ON(qed_chain_get_elem_left(&txq->tx_pbl) < (MAX_SKB_FRAGS + 1)); diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c index 6a796040a32c..d7299afa902c 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_main.c +++ b/drivers/net/ethernet/qlogic/qede/qede_main.c @@ -536,6 +536,43 @@ static int qede_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) return 0; } +int qede_setup_tc(struct net_device *ndev, u8 num_tc) +{ + struct qede_dev *edev = netdev_priv(ndev); + int cos, count, offset; + + if (num_tc > edev->dev_info.num_tc) + return -EINVAL; + + netdev_reset_tc(ndev); + netdev_set_num_tc(ndev, num_tc); + + for_each_cos_in_txq(edev, cos) { + count = QEDE_TSS_COUNT(edev); + offset = cos * QEDE_TSS_COUNT(edev); + netdev_set_tc_queue(ndev, cos, count, offset); + } + + return 0; +} + +static int +qede_setup_tc_offload(struct net_device *dev, enum tc_setup_type type, + void *type_data) +{ + struct tc_mqprio_qopt *mqprio; + + switch (type) { + case TC_SETUP_QDISC_MQPRIO: + mqprio = type_data; + + mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS; + return qede_setup_tc(dev, mqprio->num_tc); + default: + return -EOPNOTSUPP; + } +} + static const struct net_device_ops qede_netdev_ops = { .ndo_open = qede_open, .ndo_stop = qede_close, @@ -568,6 +605,7 @@ static const struct net_device_ops qede_netdev_ops = { #ifdef CONFIG_RFS_ACCEL .ndo_rx_flow_steer = qede_rx_flow_steer, #endif + .ndo_setup_tc = qede_setup_tc_offload, }; static const struct net_device_ops qede_netdev_vf_ops = { @@ -621,7 +659,8 @@ static struct qede_dev *qede_alloc_etherdev(struct qed_dev *cdev, struct qede_dev *edev; ndev = alloc_etherdev_mqs(sizeof(*edev), - info->num_queues, info->num_queues); + info->num_queues * info->num_tc, + info->num_queues); if (!ndev) { pr_err("etherdev allocation failed\n"); return NULL; @@ -830,7 +869,8 @@ static int qede_alloc_fp_array(struct qede_dev *edev) } if (fp->type & QEDE_FASTPATH_TX) { - fp->txq = kzalloc(sizeof(*fp->txq), GFP_KERNEL); + fp->txq = kcalloc(edev->dev_info.num_tc, + sizeof(*fp->txq), GFP_KERNEL); if (!fp->txq) goto err; } @@ -879,10 +919,15 @@ static void qede_sp_task(struct work_struct *work) static void qede_update_pf_params(struct qed_dev *cdev) { struct qed_pf_params pf_params; + u16 num_cons; /* 64 rx + 64 tx + 64 XDP */ memset(&pf_params, 0, sizeof(struct qed_pf_params)); - pf_params.eth_pf_params.num_cons = (MAX_SB_PER_PF_MIMD - 1) * 3; + + /* 1 rx + 1 xdp + max tx cos */ + num_cons = QED_MIN_L2_CONS; + + pf_params.eth_pf_params.num_cons = (MAX_SB_PER_PF_MIMD - 1) * num_cons; /* Same for VFs - make sure they'll have sufficient connections * to support XDP Tx queues. @@ -1363,8 +1408,12 @@ static void qede_free_mem_fp(struct qede_dev *edev, struct qede_fastpath *fp) if (fp->type & QEDE_FASTPATH_XDP) qede_free_mem_txq(edev, fp->xdp_tx); - if (fp->type & QEDE_FASTPATH_TX) - qede_free_mem_txq(edev, fp->txq); + if (fp->type & QEDE_FASTPATH_TX) { + int cos; + + for_each_cos_in_txq(edev, cos) + qede_free_mem_txq(edev, &fp->txq[cos]); + } } /* This function allocates all memory needed for a single fp (i.e. an entity @@ -1391,9 +1440,13 @@ static int qede_alloc_mem_fp(struct qede_dev *edev, struct qede_fastpath *fp) } if (fp->type & QEDE_FASTPATH_TX) { - rc = qede_alloc_mem_txq(edev, fp->txq); - if (rc) - goto out; + int cos; + + for_each_cos_in_txq(edev, cos) { + rc = qede_alloc_mem_txq(edev, &fp->txq[cos]); + if (rc) + goto out; + } } out: @@ -1466,10 +1519,23 @@ static void qede_init_fp(struct qede_dev *edev) } if (fp->type & QEDE_FASTPATH_TX) { - fp->txq->index = txq_index++; - if (edev->dev_info.is_legacy) - fp->txq->is_legacy = 1; - fp->txq->dev = &edev->pdev->dev; + int cos; + + for_each_cos_in_txq(edev, cos) { + struct qede_tx_queue *txq = &fp->txq[cos]; + u16 ndev_tx_id; + + txq->cos = cos; + txq->index = txq_index; + ndev_tx_id = QEDE_TXQ_TO_NDEV_TXQ_ID(edev, txq); + txq->ndev_txq_id = ndev_tx_id; + + if (edev->dev_info.is_legacy) + txq->is_legacy = 1; + txq->dev = &edev->pdev->dev; + } + + txq_index++; } snprintf(fp->name, sizeof(fp->name), "%s-fp-%d", @@ -1483,7 +1549,9 @@ static int qede_set_real_num_queues(struct qede_dev *edev) { int rc = 0; - rc = netif_set_real_num_tx_queues(edev->ndev, QEDE_TSS_COUNT(edev)); + rc = netif_set_real_num_tx_queues(edev->ndev, + QEDE_TSS_COUNT(edev) * + edev->dev_info.num_tc); if (rc) { DP_NOTICE(edev, "Failed to set real number of Tx queues\n"); return rc; @@ -1685,9 +1753,13 @@ static int qede_stop_queues(struct qede_dev *edev) fp = &edev->fp_array[i]; if (fp->type & QEDE_FASTPATH_TX) { - rc = qede_drain_txq(edev, fp->txq, true); - if (rc) - return rc; + int cos; + + for_each_cos_in_txq(edev, cos) { + rc = qede_drain_txq(edev, &fp->txq[cos], true); + if (rc) + return rc; + } } if (fp->type & QEDE_FASTPATH_XDP) { @@ -1703,9 +1775,13 @@ static int qede_stop_queues(struct qede_dev *edev) /* Stop the Tx Queue(s) */ if (fp->type & QEDE_FASTPATH_TX) { - rc = qede_stop_txq(edev, fp->txq, i); - if (rc) - return rc; + int cos; + + for_each_cos_in_txq(edev, cos) { + rc = qede_stop_txq(edev, &fp->txq[cos], i); + if (rc) + return rc; + } } /* Stop the Rx Queue */ @@ -1758,6 +1834,7 @@ static int qede_start_txq(struct qede_dev *edev, params.p_sb = fp->sb_info; params.sb_idx = sb_idx; + params.tc = txq->cos; rc = edev->ops->q_tx_start(edev->cdev, rss_id, ¶ms, phys_table, page_cnt, &ret_params); @@ -1877,9 +1954,14 @@ static int qede_start_queues(struct qede_dev *edev, bool clear_stats) } if (fp->type & QEDE_FASTPATH_TX) { - rc = qede_start_txq(edev, fp, fp->txq, i, TX_PI(0)); - if (rc) - goto out; + int cos; + + for_each_cos_in_txq(edev, cos) { + rc = qede_start_txq(edev, fp, &fp->txq[cos], i, + TX_PI(cos)); + if (rc) + goto out; + } } } @@ -1973,6 +2055,7 @@ static int qede_load(struct qede_dev *edev, enum qede_load_mode mode, bool is_locked) { struct qed_link_params link_params; + u8 num_tc; int rc; DP_INFO(edev, "Starting qede load\n"); @@ -2019,6 +2102,10 @@ static int qede_load(struct qede_dev *edev, enum qede_load_mode mode, goto err4; DP_INFO(edev, "Start VPORT, RXQ and TXQ succeeded\n"); + num_tc = netdev_get_num_tc(edev->ndev); + num_tc = num_tc ? num_tc : edev->dev_info.num_tc; + qede_setup_tc(edev->ndev, num_tc); + /* Program un-configured VLANs */ qede_configure_vlan_filters(edev); @@ -2143,7 +2230,7 @@ static bool qede_is_txq_full(struct qede_dev *edev, struct qede_tx_queue *txq) { struct netdev_queue *netdev_txq; - netdev_txq = netdev_get_tx_queue(edev->ndev, txq->index); + netdev_txq = netdev_get_tx_queue(edev->ndev, txq->ndev_txq_id); if (netif_xmit_stopped(netdev_txq)) return true; @@ -2208,9 +2295,11 @@ static void qede_get_eth_tlv_data(void *dev, void *data) for_each_queue(i) { fp = &edev->fp_array[i]; if (fp->type & QEDE_FASTPATH_TX) { - if (fp->txq->sw_tx_cons != fp->txq->sw_tx_prod) + struct qede_tx_queue *txq = QEDE_FP_TC0_TXQ(fp); + + if (txq->sw_tx_cons != txq->sw_tx_prod) etlv->txqs_empty = false; - if (qede_is_txq_full(edev, fp->txq)) + if (qede_is_txq_full(edev, txq)) etlv->num_txqs_full++; } if (fp->type & QEDE_FASTPATH_RX) { diff --git a/include/linux/qed/qed_eth_if.h b/include/linux/qed/qed_eth_if.h index 2978fa4add42..a1310482c4ed 100644 --- a/include/linux/qed/qed_eth_if.h +++ b/include/linux/qed/qed_eth_if.h @@ -39,6 +39,10 @@ #include #include +/* 64 max queues * (1 rx + 4 tx-cos + 1 xdp) */ +#define QED_MIN_L2_CONS (2 + NUM_PHYS_TCS_4PORT_K2) +#define QED_MAX_L2_CONS (64 * (QED_MIN_L2_CONS)) + struct qed_queue_start_common_params { /* Should always be relative to entity sending this. */ u8 vport_id; @@ -49,6 +53,8 @@ struct qed_queue_start_common_params { struct qed_sb_info *p_sb; u8 sb_idx; + + u8 tc; }; struct qed_rxq_start_ret_params { -- cgit v1.2.3-59-g8ed1b From 15693fd37fc3a49da356164ed15b571c175efc34 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 8 Aug 2018 19:40:31 +0800 Subject: net: skbuff.h: fix using plain integer as NULL warning Fixes the following sparse warning: ./include/linux/skbuff.h:2365:58: warning: Using plain integer as NULL pointer Signed-off-by: YueHaibing Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 7ebdf158a795..7e237a63a70c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2362,7 +2362,7 @@ static inline void skb_probe_transport_header(struct sk_buff *skb, if (skb_transport_header_was_set(skb)) return; - if (skb_flow_dissect_flow_keys_basic(skb, &keys, 0, 0, 0, 0, 0)) + if (skb_flow_dissect_flow_keys_basic(skb, &keys, NULL, 0, 0, 0, 0)) skb_set_transport_header(skb, keys.control.thoff); else skb_set_transport_header(skb, offset_hint); -- cgit v1.2.3-59-g8ed1b From b0768a86585d4d951a30ff565f19598dbbd67897 Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Fri, 3 Aug 2018 16:58:09 +0900 Subject: net: Export skb_headers_offset_update This is needed for veth XDP which does skb_copy_expand()-like operation. v2: - Drop skb_copy_header part because it has already been exported now. Signed-off-by: Toshiaki Makita Signed-off-by: Daniel Borkmann --- include/linux/skbuff.h | 1 + net/core/skbuff.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 7ebdf158a795..e93b157f526c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1038,6 +1038,7 @@ static inline struct sk_buff *alloc_skb_fclone(unsigned int size, } struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src); +void skb_headers_offset_update(struct sk_buff *skb, int off); int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask); struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t priority); void skb_copy_header(struct sk_buff *new, const struct sk_buff *old); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 8d574a88125d..c996c09d095f 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1291,7 +1291,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) } EXPORT_SYMBOL(skb_clone); -static void skb_headers_offset_update(struct sk_buff *skb, int off) +void skb_headers_offset_update(struct sk_buff *skb, int off) { /* Only adjust this if it actually is csum_start rather than csum */ if (skb->ip_summed == CHECKSUM_PARTIAL) @@ -1305,6 +1305,7 @@ static void skb_headers_offset_update(struct sk_buff *skb, int off) skb->inner_network_header += off; skb->inner_mac_header += off; } +EXPORT_SYMBOL(skb_headers_offset_update); void skb_copy_header(struct sk_buff *new, const struct sk_buff *old) { -- cgit v1.2.3-59-g8ed1b From 0b19cc0a8694d4295383f88bc3441765875a57bc Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Fri, 3 Aug 2018 16:58:15 +0900 Subject: bpf: Make redirect_info accessible from modules We are going to add kern_flags field in redirect_info for kernel internal use. In order to avoid function call to access the flags, make redirect_info accessible from modules. Also as it is now non-static, add prefix bpf_ to redirect_info. v6: - Fix sparse warning around EXPORT_SYMBOL. Signed-off-by: Toshiaki Makita Signed-off-by: Daniel Borkmann --- include/linux/filter.h | 10 ++++++++++ net/core/filter.c | 29 +++++++++++------------------ 2 files changed, 21 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index c73dd7396886..4717af8b95e6 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -537,6 +537,16 @@ struct sk_msg_buff { struct list_head list; }; +struct bpf_redirect_info { + u32 ifindex; + u32 flags; + struct bpf_map *map; + struct bpf_map *map_to_flush; + unsigned long map_owner; +}; + +DECLARE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info); + /* Compute the linear packet data range [data, data_end) which * will be accessed by various program types (cls_bpf, act_bpf, * lwt, ...). Subsystems allowing direct data access must (!) diff --git a/net/core/filter.c b/net/core/filter.c index 587bbfbd7db3..2de7dd9f2a57 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2082,19 +2082,12 @@ static const struct bpf_func_proto bpf_clone_redirect_proto = { .arg3_type = ARG_ANYTHING, }; -struct redirect_info { - u32 ifindex; - u32 flags; - struct bpf_map *map; - struct bpf_map *map_to_flush; - unsigned long map_owner; -}; - -static DEFINE_PER_CPU(struct redirect_info, redirect_info); +DEFINE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info); +EXPORT_PER_CPU_SYMBOL_GPL(bpf_redirect_info); BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); if (unlikely(flags & ~(BPF_F_INGRESS))) return TC_ACT_SHOT; @@ -2107,7 +2100,7 @@ BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) int skb_do_redirect(struct sk_buff *skb) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); struct net_device *dev; dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->ifindex); @@ -3200,7 +3193,7 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, void xdp_do_flush_map(void) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); struct bpf_map *map = ri->map_to_flush; ri->map_to_flush = NULL; @@ -3245,7 +3238,7 @@ static inline bool xdp_map_invalid(const struct bpf_prog *xdp_prog, static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); unsigned long map_owner = ri->map_owner; struct bpf_map *map = ri->map; u32 index = ri->ifindex; @@ -3285,7 +3278,7 @@ err: int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); struct net_device *fwd; u32 index = ri->ifindex; int err; @@ -3317,7 +3310,7 @@ static int xdp_do_generic_redirect_map(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); unsigned long map_owner = ri->map_owner; struct bpf_map *map = ri->map; u32 index = ri->ifindex; @@ -3368,7 +3361,7 @@ err: int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); u32 index = ri->ifindex; struct net_device *fwd; int err = 0; @@ -3399,7 +3392,7 @@ EXPORT_SYMBOL_GPL(xdp_do_generic_redirect); BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); if (unlikely(flags)) return XDP_ABORTED; @@ -3423,7 +3416,7 @@ static const struct bpf_func_proto bpf_xdp_redirect_proto = { BPF_CALL_4(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, u64, flags, unsigned long, map_owner) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); if (unlikely(flags)) return XDP_ABORTED; -- cgit v1.2.3-59-g8ed1b From 2539650fadbf63a431e76535a9de7bff6ea5e409 Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Fri, 3 Aug 2018 16:58:16 +0900 Subject: xdp: Helpers for disabling napi_direct of xdp_return_frame We need some mechanism to disable napi_direct on calling xdp_return_frame_rx_napi() from some context. When veth gets support of XDP_REDIRECT, it will redirects packets which are redirected from other devices. On redirection veth will reuse xdp_mem_info of the redirection source device to make return_frame work. But in this case .ndo_xdp_xmit() called from veth redirection uses xdp_mem_info which is not guarded by NAPI, because the .ndo_xdp_xmit() is not called directly from the rxq which owns the xdp_mem_info. This approach introduces a flag in bpf_redirect_info to indicate that napi_direct should be disabled even when _rx_napi variant is used as well as helper functions to use it. A NAPI handler who wants to use this flag needs to call xdp_set_return_frame_no_direct() before processing packets, and call xdp_clear_return_frame_no_direct() after xdp_do_flush_map() before exiting NAPI. v4: - Use bpf_redirect_info for storing the flag instead of xdp_mem_info to avoid per-frame copy cost. Signed-off-by: Toshiaki Makita Signed-off-by: Daniel Borkmann --- include/linux/filter.h | 25 +++++++++++++++++++++++++ net/core/xdp.c | 6 ++++-- 2 files changed, 29 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 4717af8b95e6..2b072dab32c0 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -543,10 +543,14 @@ struct bpf_redirect_info { struct bpf_map *map; struct bpf_map *map_to_flush; unsigned long map_owner; + u32 kern_flags; }; DECLARE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info); +/* flags for bpf_redirect_info kern_flags */ +#define BPF_RI_F_RF_NO_DIRECT BIT(0) /* no napi_direct on return_frame */ + /* Compute the linear packet data range [data, data_end) which * will be accessed by various program types (cls_bpf, act_bpf, * lwt, ...). Subsystems allowing direct data access must (!) @@ -775,6 +779,27 @@ static inline bool bpf_dump_raw_ok(void) struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len); +static inline bool xdp_return_frame_no_direct(void) +{ + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + + return ri->kern_flags & BPF_RI_F_RF_NO_DIRECT; +} + +static inline void xdp_set_return_frame_no_direct(void) +{ + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + + ri->kern_flags |= BPF_RI_F_RF_NO_DIRECT; +} + +static inline void xdp_clear_return_frame_no_direct(void) +{ + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + + ri->kern_flags &= ~BPF_RI_F_RF_NO_DIRECT; +} + static inline int xdp_ok_fwd_dev(const struct net_device *fwd, unsigned int pktlen) { diff --git a/net/core/xdp.c b/net/core/xdp.c index 57285383ed00..3dd99e1c04f5 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -330,10 +330,12 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */ xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); page = virt_to_head_page(data); - if (xa) + if (xa) { + napi_direct &= !xdp_return_frame_no_direct(); page_pool_put_page(xa->page_pool, page, napi_direct); - else + } else { put_page(page); + } rcu_read_unlock(); break; case MEM_TYPE_PAGE_SHARED: -- cgit v1.2.3-59-g8ed1b From c9fbb2d25295a566b97d62e6904741e8e1702d83 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Fri, 10 Aug 2018 10:47:43 +0200 Subject: net: Provide stub for __netif_set_xps_queue if there is no CONFIG_XPS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Building virtio_net driver without CONFIG_XPS fails with: drivers/net/virtio_net.c: In function ‘virtnet_set_affinity’: drivers/net/virtio_net.c:1910:3: error: implicit declaration of function ‘__netif_set_xps_queue’ [-Werror=implicit-function-declaration] __netif_set_xps_queue(vi->dev, mask, i, false); ^ Fixes: 4d99f6602cb5 ("net: allow to call netif_reset_xps_queues() under cpus_read_lock") Signed-off-by: Krzysztof Kozlowski Signed-off-by: David S. Miller --- include/linux/netdevice.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 282e2e95ad5b..ca5ab98053c8 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3412,6 +3412,13 @@ static inline int netif_set_xps_queue(struct net_device *dev, { return 0; } + +static inline int __netif_set_xps_queue(struct net_device *dev, + const unsigned long *mask, + u16 index, bool is_rxqs_map) +{ + return 0; +} #endif /** -- cgit v1.2.3-59-g8ed1b From 5dc4c4b7d4e8115e7cde96a030f98cb3ab2e458c Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 8 Aug 2018 01:01:24 -0700 Subject: bpf: Introduce BPF_MAP_TYPE_REUSEPORT_SOCKARRAY This patch introduces a new map type BPF_MAP_TYPE_REUSEPORT_SOCKARRAY. To unleash the full potential of a bpf prog, it is essential for the userspace to be capable of directly setting up a bpf map which can then be consumed by the bpf prog to make decision. In this case, decide which SO_REUSEPORT sk to serve the incoming request. By adding BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, the userspace has total control and visibility on where a SO_REUSEPORT sk should be located in a bpf map. The later patch will introduce BPF_PROG_TYPE_SK_REUSEPORT such that the bpf prog can directly select a sk from the bpf map. That will raise the programmability of the bpf prog attached to a reuseport group (a group of sk serving the same IP:PORT). For example, in UDP, the bpf prog can peek into the payload (e.g. through the "data" pointer introduced in the later patch) to learn the application level's connection information and then decide which sk to pick from a bpf map. The userspace can tightly couple the sk's location in a bpf map with the application logic in generating the UDP payload's connection information. This connection info contact/API stays within the userspace. Also, when used with map-in-map, the userspace can switch the old-server-process's inner map to a new-server-process's inner map in one call "bpf_map_update_elem(outer_map, &index, &new_reuseport_array)". The bpf prog will then direct incoming requests to the new process instead of the old process. The old process can finish draining the pending requests (e.g. by "accept()") before closing the old-fds. [Note that deleting a fd from a bpf map does not necessary mean the fd is closed] During map_update_elem(), Only SO_REUSEPORT sk (i.e. which has already been added to a reuse->socks[]) can be used. That means a SO_REUSEPORT sk that is "bind()" for UDP or "bind()+listen()" for TCP. These conditions are ensured in "reuseport_array_update_check()". A SO_REUSEPORT sk can only be added once to a map (i.e. the same sk cannot be added twice even to the same map). SO_REUSEPORT already allows another sk to be created for the same IP:PORT. There is no need to re-create a similar usage in the BPF side. When a SO_REUSEPORT is deleted from the "reuse->socks[]" (e.g. "close()"), it will notify the bpf map to remove it from the map also. It is done through "bpf_sk_reuseport_detach()" and it will only be called if >=1 of the "reuse->sock[]" has ever been added to a bpf map. The map_update()/map_delete() has to be in-sync with the "reuse->socks[]". Hence, the same "reuseport_lock" used by "reuse->socks[]" has to be used here also. Care has been taken to ensure the lock is only acquired when the adding sk passes some strict tests. and freeing the map does not require the reuseport_lock. The reuseport_array will also support lookup from the syscall side. It will return a sock_gen_cookie(). The sock_gen_cookie() is on-demand (i.e. a sk's cookie is not generated until the very first map_lookup_elem()). The lookup cookie is 64bits but it goes against the logical userspace expectation on 32bits sizeof(fd) (and as other fd based bpf maps do also). It may catch user in surprise if we enforce value_size=8 while userspace still pass a 32bits fd during update. Supporting different value_size between lookup and update seems unintuitive also. We also need to consider what if other existing fd based maps want to return 64bits value from syscall's lookup in the future. Hence, reuseport_array supports both value_size 4 and 8, and assuming user will usually use value_size=4. The syscall's lookup will return ENOSPC on value_size=4. It will will only return 64bits value from sock_gen_cookie() when user consciously choose value_size=8 (as a signal that lookup is desired) which then requires a 64bits value in both lookup and update. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 28 ++++ include/linux/bpf_types.h | 3 + include/uapi/linux/bpf.h | 1 + kernel/bpf/Makefile | 3 + kernel/bpf/arraymap.c | 2 +- kernel/bpf/reuseport_array.c | 363 +++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 6 + net/core/sock_reuseport.c | 8 + 8 files changed, 413 insertions(+), 1 deletion(-) create mode 100644 kernel/bpf/reuseport_array.c (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index cd8790d2c6ed..db11662faea6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -524,6 +524,7 @@ static inline int bpf_map_attr_numa_node(const union bpf_attr *attr) } struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type); +int array_map_alloc_check(union bpf_attr *attr); #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) @@ -769,6 +770,33 @@ static inline void __xsk_map_flush(struct bpf_map *map) } #endif +#if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) +void bpf_sk_reuseport_detach(struct sock *sk); +int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key, + void *value); +int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags); +#else +static inline void bpf_sk_reuseport_detach(struct sock *sk) +{ +} + +#ifdef CONFIG_BPF_SYSCALL +static inline int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, + void *key, void *value) +{ + return -EOPNOTSUPP; +} + +static inline int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, + void *key, void *value, + u64 map_flags) +{ + return -EOPNOTSUPP; +} +#endif /* CONFIG_BPF_SYSCALL */ +#endif /* defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) */ + /* verifier prototypes for helper functions called from eBPF programs */ extern const struct bpf_func_proto bpf_map_lookup_elem_proto; extern const struct bpf_func_proto bpf_map_update_elem_proto; diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index add08be53b6f..14fd6c02d258 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -60,4 +60,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops) #if defined(CONFIG_XDP_SOCKETS) BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops) #endif +#ifdef CONFIG_INET +BPF_MAP_TYPE(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, reuseport_array_ops) +#endif #endif diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index dd5758dc35d3..40f584bc7da0 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -126,6 +126,7 @@ enum bpf_map_type { BPF_MAP_TYPE_XSKMAP, BPF_MAP_TYPE_SOCKHASH, BPF_MAP_TYPE_CGROUP_STORAGE, + BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, }; enum bpf_prog_type { diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index e8906cbad81f..0488b8258321 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -23,3 +23,6 @@ ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o endif obj-$(CONFIG_CGROUP_BPF) += cgroup.o +ifeq ($(CONFIG_INET),y) +obj-$(CONFIG_BPF_SYSCALL) += reuseport_array.o +endif diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 2aa55d030c77..f6ca3e712831 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -54,7 +54,7 @@ static int bpf_array_alloc_percpu(struct bpf_array *array) } /* Called from syscall */ -static int array_map_alloc_check(union bpf_attr *attr) +int array_map_alloc_check(union bpf_attr *attr) { bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; int numa_node = bpf_map_attr_numa_node(attr); diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c new file mode 100644 index 000000000000..18e225de80ff --- /dev/null +++ b/kernel/bpf/reuseport_array.c @@ -0,0 +1,363 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2018 Facebook + */ +#include +#include +#include +#include + +struct reuseport_array { + struct bpf_map map; + struct sock __rcu *ptrs[]; +}; + +static struct reuseport_array *reuseport_array(struct bpf_map *map) +{ + return (struct reuseport_array *)map; +} + +/* The caller must hold the reuseport_lock */ +void bpf_sk_reuseport_detach(struct sock *sk) +{ + struct sock __rcu **socks; + + write_lock_bh(&sk->sk_callback_lock); + socks = sk->sk_user_data; + if (socks) { + WRITE_ONCE(sk->sk_user_data, NULL); + /* + * Do not move this NULL assignment outside of + * sk->sk_callback_lock because there is + * a race with reuseport_array_free() + * which does not hold the reuseport_lock. + */ + RCU_INIT_POINTER(*socks, NULL); + } + write_unlock_bh(&sk->sk_callback_lock); +} + +static int reuseport_array_alloc_check(union bpf_attr *attr) +{ + if (attr->value_size != sizeof(u32) && + attr->value_size != sizeof(u64)) + return -EINVAL; + + return array_map_alloc_check(attr); +} + +static void *reuseport_array_lookup_elem(struct bpf_map *map, void *key) +{ + struct reuseport_array *array = reuseport_array(map); + u32 index = *(u32 *)key; + + if (unlikely(index >= array->map.max_entries)) + return NULL; + + return rcu_dereference(array->ptrs[index]); +} + +/* Called from syscall only */ +static int reuseport_array_delete_elem(struct bpf_map *map, void *key) +{ + struct reuseport_array *array = reuseport_array(map); + u32 index = *(u32 *)key; + struct sock *sk; + int err; + + if (index >= map->max_entries) + return -E2BIG; + + if (!rcu_access_pointer(array->ptrs[index])) + return -ENOENT; + + spin_lock_bh(&reuseport_lock); + + sk = rcu_dereference_protected(array->ptrs[index], + lockdep_is_held(&reuseport_lock)); + if (sk) { + write_lock_bh(&sk->sk_callback_lock); + WRITE_ONCE(sk->sk_user_data, NULL); + RCU_INIT_POINTER(array->ptrs[index], NULL); + write_unlock_bh(&sk->sk_callback_lock); + err = 0; + } else { + err = -ENOENT; + } + + spin_unlock_bh(&reuseport_lock); + + return err; +} + +static void reuseport_array_free(struct bpf_map *map) +{ + struct reuseport_array *array = reuseport_array(map); + struct sock *sk; + u32 i; + + synchronize_rcu(); + + /* + * ops->map_*_elem() will not be able to access this + * array now. Hence, this function only races with + * bpf_sk_reuseport_detach() which was triggerred by + * close() or disconnect(). + * + * This function and bpf_sk_reuseport_detach() are + * both removing sk from "array". Who removes it + * first does not matter. + * + * The only concern here is bpf_sk_reuseport_detach() + * may access "array" which is being freed here. + * bpf_sk_reuseport_detach() access this "array" + * through sk->sk_user_data _and_ with sk->sk_callback_lock + * held which is enough because this "array" is not freed + * until all sk->sk_user_data has stopped referencing this "array". + * + * Hence, due to the above, taking "reuseport_lock" is not + * needed here. + */ + + /* + * Since reuseport_lock is not taken, sk is accessed under + * rcu_read_lock() + */ + rcu_read_lock(); + for (i = 0; i < map->max_entries; i++) { + sk = rcu_dereference(array->ptrs[i]); + if (sk) { + write_lock_bh(&sk->sk_callback_lock); + /* + * No need for WRITE_ONCE(). At this point, + * no one is reading it without taking the + * sk->sk_callback_lock. + */ + sk->sk_user_data = NULL; + write_unlock_bh(&sk->sk_callback_lock); + RCU_INIT_POINTER(array->ptrs[i], NULL); + } + } + rcu_read_unlock(); + + /* + * Once reaching here, all sk->sk_user_data is not + * referenceing this "array". "array" can be freed now. + */ + bpf_map_area_free(array); +} + +static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr) +{ + int err, numa_node = bpf_map_attr_numa_node(attr); + struct reuseport_array *array; + u64 cost, array_size; + + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + array_size = sizeof(*array); + array_size += (u64)attr->max_entries * sizeof(struct sock *); + + /* make sure there is no u32 overflow later in round_up() */ + cost = array_size; + if (cost >= U32_MAX - PAGE_SIZE) + return ERR_PTR(-ENOMEM); + cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + + err = bpf_map_precharge_memlock(cost); + if (err) + return ERR_PTR(err); + + /* allocate all map elements and zero-initialize them */ + array = bpf_map_area_alloc(array_size, numa_node); + if (!array) + return ERR_PTR(-ENOMEM); + + /* copy mandatory map attributes */ + bpf_map_init_from_attr(&array->map, attr); + array->map.pages = cost; + + return &array->map; +} + +int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key, + void *value) +{ + struct sock *sk; + int err; + + if (map->value_size != sizeof(u64)) + return -ENOSPC; + + rcu_read_lock(); + sk = reuseport_array_lookup_elem(map, key); + if (sk) { + *(u64 *)value = sock_gen_cookie(sk); + err = 0; + } else { + err = -ENOENT; + } + rcu_read_unlock(); + + return err; +} + +static int +reuseport_array_update_check(const struct reuseport_array *array, + const struct sock *nsk, + const struct sock *osk, + const struct sock_reuseport *nsk_reuse, + u32 map_flags) +{ + if (osk && map_flags == BPF_NOEXIST) + return -EEXIST; + + if (!osk && map_flags == BPF_EXIST) + return -ENOENT; + + if (nsk->sk_protocol != IPPROTO_UDP && nsk->sk_protocol != IPPROTO_TCP) + return -ENOTSUPP; + + if (nsk->sk_family != AF_INET && nsk->sk_family != AF_INET6) + return -ENOTSUPP; + + if (nsk->sk_type != SOCK_STREAM && nsk->sk_type != SOCK_DGRAM) + return -ENOTSUPP; + + /* + * sk must be hashed (i.e. listening in the TCP case or binded + * in the UDP case) and + * it must also be a SO_REUSEPORT sk (i.e. reuse cannot be NULL). + * + * Also, sk will be used in bpf helper that is protected by + * rcu_read_lock(). + */ + if (!sock_flag(nsk, SOCK_RCU_FREE) || !sk_hashed(nsk) || !nsk_reuse) + return -EINVAL; + + /* READ_ONCE because the sk->sk_callback_lock may not be held here */ + if (READ_ONCE(nsk->sk_user_data)) + return -EBUSY; + + return 0; +} + +/* + * Called from syscall only. + * The "nsk" in the fd refcnt. + * The "osk" and "reuse" are protected by reuseport_lock. + */ +int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) +{ + struct reuseport_array *array = reuseport_array(map); + struct sock *free_osk = NULL, *osk, *nsk; + struct sock_reuseport *reuse; + u32 index = *(u32 *)key; + struct socket *socket; + int err, fd; + + if (map_flags > BPF_EXIST) + return -EINVAL; + + if (index >= map->max_entries) + return -E2BIG; + + if (map->value_size == sizeof(u64)) { + u64 fd64 = *(u64 *)value; + + if (fd64 > S32_MAX) + return -EINVAL; + fd = fd64; + } else { + fd = *(int *)value; + } + + socket = sockfd_lookup(fd, &err); + if (!socket) + return err; + + nsk = socket->sk; + if (!nsk) { + err = -EINVAL; + goto put_file; + } + + /* Quick checks before taking reuseport_lock */ + err = reuseport_array_update_check(array, nsk, + rcu_access_pointer(array->ptrs[index]), + rcu_access_pointer(nsk->sk_reuseport_cb), + map_flags); + if (err) + goto put_file; + + spin_lock_bh(&reuseport_lock); + /* + * Some of the checks only need reuseport_lock + * but it is done under sk_callback_lock also + * for simplicity reason. + */ + write_lock_bh(&nsk->sk_callback_lock); + + osk = rcu_dereference_protected(array->ptrs[index], + lockdep_is_held(&reuseport_lock)); + reuse = rcu_dereference_protected(nsk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)); + err = reuseport_array_update_check(array, nsk, osk, reuse, map_flags); + if (err) + goto put_file_unlock; + + /* Ensure reuse->reuseport_id is set */ + err = reuseport_get_id(reuse); + if (err < 0) + goto put_file_unlock; + + WRITE_ONCE(nsk->sk_user_data, &array->ptrs[index]); + rcu_assign_pointer(array->ptrs[index], nsk); + free_osk = osk; + err = 0; + +put_file_unlock: + write_unlock_bh(&nsk->sk_callback_lock); + + if (free_osk) { + write_lock_bh(&free_osk->sk_callback_lock); + WRITE_ONCE(free_osk->sk_user_data, NULL); + write_unlock_bh(&free_osk->sk_callback_lock); + } + + spin_unlock_bh(&reuseport_lock); +put_file: + fput(socket->file); + return err; +} + +/* Called from syscall */ +static int reuseport_array_get_next_key(struct bpf_map *map, void *key, + void *next_key) +{ + struct reuseport_array *array = reuseport_array(map); + u32 index = key ? *(u32 *)key : U32_MAX; + u32 *next = (u32 *)next_key; + + if (index >= array->map.max_entries) { + *next = 0; + return 0; + } + + if (index == array->map.max_entries - 1) + return -ENOENT; + + *next = index + 1; + return 0; +} + +const struct bpf_map_ops reuseport_array_ops = { + .map_alloc_check = reuseport_array_alloc_check, + .map_alloc = reuseport_array_alloc, + .map_free = reuseport_array_free, + .map_lookup_elem = reuseport_array_lookup_elem, + .map_get_next_key = reuseport_array_get_next_key, + .map_delete_elem = reuseport_array_delete_elem, +}; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5af4e9e2722d..57f4d076141b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -684,6 +684,8 @@ static int map_lookup_elem(union bpf_attr *attr) err = bpf_fd_array_map_lookup_elem(map, key, value); } else if (IS_FD_HASH(map)) { err = bpf_fd_htab_map_lookup_elem(map, key, value); + } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { + err = bpf_fd_reuseport_array_lookup_elem(map, key, value); } else { rcu_read_lock(); ptr = map->ops->map_lookup_elem(map, key); @@ -790,6 +792,10 @@ static int map_update_elem(union bpf_attr *attr) err = bpf_fd_htab_map_update_elem(map, f.file, key, value, attr->flags); rcu_read_unlock(); + } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { + /* rcu_read_lock() is not needed */ + err = bpf_fd_reuseport_array_update_elem(map, key, value, + attr->flags); } else { rcu_read_lock(); err = map->ops->map_update_elem(map, key, value, attr->flags); diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index cf2e4d305af9..8235f2439816 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -186,6 +186,14 @@ void reuseport_detach_sock(struct sock *sk) spin_lock_bh(&reuseport_lock); reuse = rcu_dereference_protected(sk->sk_reuseport_cb, lockdep_is_held(&reuseport_lock)); + + /* At least one of the sk in this reuseport group is added to + * a bpf map. Notify the bpf side. The bpf map logic will + * remove the sk if it is indeed added to a bpf map. + */ + if (reuse->reuseport_id) + bpf_sk_reuseport_detach(sk); + rcu_assign_pointer(sk->sk_reuseport_cb, NULL); for (i = 0; i < reuse->num_socks; i++) { -- cgit v1.2.3-59-g8ed1b From 2dbb9b9e6df67d444fbe425c7f6014858d337adf Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 8 Aug 2018 01:01:25 -0700 Subject: bpf: Introduce BPF_PROG_TYPE_SK_REUSEPORT This patch adds a BPF_PROG_TYPE_SK_REUSEPORT which can select a SO_REUSEPORT sk from a BPF_MAP_TYPE_REUSEPORT_ARRAY. Like other non SK_FILTER/CGROUP_SKB program, it requires CAP_SYS_ADMIN. BPF_PROG_TYPE_SK_REUSEPORT introduces "struct sk_reuseport_kern" to store the bpf context instead of using the skb->cb[48]. At the SO_REUSEPORT sk lookup time, it is in the middle of transiting from a lower layer (ipv4/ipv6) to a upper layer (udp/tcp). At this point, it is not always clear where the bpf context can be appended in the skb->cb[48] to avoid saving-and-restoring cb[]. Even putting aside the difference between ipv4-vs-ipv6 and udp-vs-tcp. It is not clear if the lower layer is only ipv4 and ipv6 in the future and will it not touch the cb[] again before transiting to the upper layer. For example, in udp_gro_receive(), it uses the 48 byte NAPI_GRO_CB instead of IP[6]CB and it may still modify the cb[] after calling the udp[46]_lib_lookup_skb(). Because of the above reason, if sk->cb is used for the bpf ctx, saving-and-restoring is needed and likely the whole 48 bytes cb[] has to be saved and restored. Instead of saving, setting and restoring the cb[], this patch opts to create a new "struct sk_reuseport_kern" and setting the needed values in there. The new BPF_PROG_TYPE_SK_REUSEPORT and "struct sk_reuseport_(kern|md)" will serve all ipv4/ipv6 + udp/tcp combinations. There is no protocol specific usage at this point and it is also inline with the current sock_reuseport.c implementation (i.e. no protocol specific requirement). In "struct sk_reuseport_md", this patch exposes data/data_end/len with semantic similar to other existing usages. Together with "bpf_skb_load_bytes()" and "bpf_skb_load_bytes_relative()", the bpf prog can peek anywhere in the skb. The "bind_inany" tells the bpf prog that the reuseport group is bind-ed to a local INANY address which cannot be learned from skb. The new "bind_inany" is added to "struct sock_reuseport" which will be used when running the new "BPF_PROG_TYPE_SK_REUSEPORT" bpf prog in order to avoid repeating the "bind INANY" test on "sk_v6_rcv_saddr/sk->sk_rcv_saddr" every time a bpf prog is run. It can only be properly initialized when a "sk->sk_reuseport" enabled sk is adding to a hashtable (i.e. during "reuseport_alloc()" and "reuseport_add_sock()"). The new "sk_select_reuseport()" is the main helper that the bpf prog will use to select a SO_REUSEPORT sk. It is the only function that can use the new BPF_MAP_TYPE_REUSEPORT_ARRAY. As mentioned in the earlier patch, the validity of a selected sk is checked in run time in "sk_select_reuseport()". Doing the check in verification time is difficult and inflexible (consider the map-in-map use case). The runtime check is to compare the selected sk's reuseport_id with the reuseport_id that we want. This helper will return -EXXX if the selected sk cannot serve the incoming request (e.g. reuseport_id not match). The bpf prog can decide if it wants to do SK_DROP as its discretion. When the bpf prog returns SK_PASS, the kernel will check if a valid sk has been selected (i.e. "reuse_kern->selected_sk != NULL"). If it does , it will use the selected sk. If not, the kernel will select one from "reuse->socks[]" (as before this patch). The SK_DROP and SK_PASS handling logic will be in the next patch. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf_types.h | 3 + include/linux/filter.h | 15 +++ include/net/addrconf.h | 1 + include/net/sock_reuseport.h | 6 +- include/uapi/linux/bpf.h | 36 +++++- kernel/bpf/verifier.c | 9 ++ net/core/filter.c | 269 +++++++++++++++++++++++++++++++++++++++- net/core/sock_reuseport.c | 20 ++- net/ipv4/inet_connection_sock.c | 9 ++ net/ipv4/inet_hashtables.c | 5 +- net/ipv4/udp.c | 5 +- 11 files changed, 365 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 14fd6c02d258..cd26c090e7c0 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -29,6 +29,9 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev) #ifdef CONFIG_BPF_LIRC_MODE2 BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2) #endif +#ifdef CONFIG_INET +BPF_PROG_TYPE(BPF_PROG_TYPE_SK_REUSEPORT, sk_reuseport) +#endif BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops) diff --git a/include/linux/filter.h b/include/linux/filter.h index 2b072dab32c0..70e9d57677fe 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -32,6 +32,7 @@ struct seccomp_data; struct bpf_prog_aux; struct xdp_rxq_info; struct xdp_buff; +struct sock_reuseport; /* ArgX, context and stack frame pointer register positions. Note, * Arg1, Arg2, Arg3, etc are used as argument mappings of function @@ -833,6 +834,20 @@ void bpf_warn_invalid_xdp_action(u32 act); struct sock *do_sk_redirect_map(struct sk_buff *skb); struct sock *do_msg_redirect_map(struct sk_msg_buff *md); +#ifdef CONFIG_INET +struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, + struct bpf_prog *prog, struct sk_buff *skb, + u32 hash); +#else +static inline struct sock * +bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, + struct bpf_prog *prog, struct sk_buff *skb, + u32 hash) +{ + return NULL; +} +#endif + #ifdef CONFIG_BPF_JIT extern int bpf_jit_enable; extern int bpf_jit_harden; diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 5f43f7a70fe6..6def0351bcc3 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -108,6 +108,7 @@ int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr, u32 banned_flags); bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, bool match_wildcard); +bool inet_rcv_saddr_any(const struct sock *sk); void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr); void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr); diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h index e1a7681856f7..73b569556be6 100644 --- a/include/net/sock_reuseport.h +++ b/include/net/sock_reuseport.h @@ -21,12 +21,14 @@ struct sock_reuseport { unsigned int synq_overflow_ts; /* ID stays the same even after the size of socks[] grows. */ unsigned int reuseport_id; + bool bind_inany; struct bpf_prog __rcu *prog; /* optional BPF sock selector */ struct sock *socks[0]; /* array of sock pointers */ }; -extern int reuseport_alloc(struct sock *sk); -extern int reuseport_add_sock(struct sock *sk, struct sock *sk2); +extern int reuseport_alloc(struct sock *sk, bool bind_inany); +extern int reuseport_add_sock(struct sock *sk, struct sock *sk2, + bool bind_inany); extern void reuseport_detach_sock(struct sock *sk); extern struct sock *reuseport_select_sock(struct sock *sk, u32 hash, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 40f584bc7da0..3102a2a23c31 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -151,6 +151,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_CGROUP_SOCK_ADDR, BPF_PROG_TYPE_LWT_SEG6LOCAL, BPF_PROG_TYPE_LIRC_MODE2, + BPF_PROG_TYPE_SK_REUSEPORT, }; enum bpf_attach_type { @@ -2114,6 +2115,14 @@ union bpf_attr { * the shared data. * Return * Pointer to the local storage area. + * + * int bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags) + * Description + * Select a SO_REUSEPORT sk from a BPF_MAP_TYPE_REUSEPORT_ARRAY map + * It checks the selected sk is matching the incoming + * request in the skb. + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2197,7 +2206,8 @@ union bpf_attr { FN(rc_keydown), \ FN(skb_cgroup_id), \ FN(get_current_cgroup_id), \ - FN(get_local_storage), + FN(get_local_storage), \ + FN(sk_select_reuseport), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -2414,6 +2424,30 @@ struct sk_msg_md { __u32 local_port; /* stored in host byte order */ }; +struct sk_reuseport_md { + /* + * Start of directly accessible data. It begins from + * the tcp/udp header. + */ + void *data; + void *data_end; /* End of directly accessible data */ + /* + * Total length of packet (starting from the tcp/udp header). + * Note that the directly accessible bytes (data_end - data) + * could be less than this "len". Those bytes could be + * indirectly read by a helper "bpf_skb_load_bytes()". + */ + __u32 len; + /* + * Eth protocol in the mac header (network byte order). e.g. + * ETH_P_IP(0x0800) and ETH_P_IPV6(0x86DD) + */ + __u32 eth_protocol; + __u32 ip_protocol; /* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */ + __u32 bind_inany; /* Is sock bound to an INANY address? */ + __u32 hash; /* A hash of the packet 4 tuples */ +}; + #define BPF_TAG_SIZE 8 struct bpf_prog_info { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 587468a9c37d..ca90679a7fe5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1310,6 +1310,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, case BPF_PROG_TYPE_LWT_IN: case BPF_PROG_TYPE_LWT_OUT: case BPF_PROG_TYPE_LWT_SEG6LOCAL: + case BPF_PROG_TYPE_SK_REUSEPORT: /* dst_input() and dst_output() can't write for now */ if (t == BPF_WRITE) return false; @@ -2166,6 +2167,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_msg_redirect_hash) goto error; break; + case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY: + if (func_id != BPF_FUNC_sk_select_reuseport) + goto error; + break; default: break; } @@ -2217,6 +2222,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE) goto error; break; + case BPF_FUNC_sk_select_reuseport: + if (map->map_type != BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) + goto error; + break; default: break; } diff --git a/net/core/filter.c b/net/core/filter.c index 2de7dd9f2a57..142595b4e0d1 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1462,7 +1462,7 @@ static int __reuseport_attach_prog(struct bpf_prog *prog, struct sock *sk) return -ENOMEM; if (sk_unhashed(sk) && sk->sk_reuseport) { - err = reuseport_alloc(sk); + err = reuseport_alloc(sk, false); if (err) return err; } else if (!rcu_access_pointer(sk->sk_reuseport_cb)) { @@ -7013,3 +7013,270 @@ out: release_sock(sk); return ret; } + +#ifdef CONFIG_INET +struct sk_reuseport_kern { + struct sk_buff *skb; + struct sock *sk; + struct sock *selected_sk; + void *data_end; + u32 hash; + u32 reuseport_id; + bool bind_inany; +}; + +static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern, + struct sock_reuseport *reuse, + struct sock *sk, struct sk_buff *skb, + u32 hash) +{ + reuse_kern->skb = skb; + reuse_kern->sk = sk; + reuse_kern->selected_sk = NULL; + reuse_kern->data_end = skb->data + skb_headlen(skb); + reuse_kern->hash = hash; + reuse_kern->reuseport_id = reuse->reuseport_id; + reuse_kern->bind_inany = reuse->bind_inany; +} + +struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, + struct bpf_prog *prog, struct sk_buff *skb, + u32 hash) +{ + struct sk_reuseport_kern reuse_kern; + enum sk_action action; + + bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, hash); + action = BPF_PROG_RUN(prog, &reuse_kern); + + if (action == SK_PASS) + return reuse_kern.selected_sk; + else + return ERR_PTR(-ECONNREFUSED); +} + +BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern, + struct bpf_map *, map, void *, key, u32, flags) +{ + struct sock_reuseport *reuse; + struct sock *selected_sk; + + selected_sk = map->ops->map_lookup_elem(map, key); + if (!selected_sk) + return -ENOENT; + + reuse = rcu_dereference(selected_sk->sk_reuseport_cb); + if (!reuse) + /* selected_sk is unhashed (e.g. by close()) after the + * above map_lookup_elem(). Treat selected_sk has already + * been removed from the map. + */ + return -ENOENT; + + if (unlikely(reuse->reuseport_id != reuse_kern->reuseport_id)) { + struct sock *sk; + + if (unlikely(!reuse_kern->reuseport_id)) + /* There is a small race between adding the + * sk to the map and setting the + * reuse_kern->reuseport_id. + * Treat it as the sk has not been added to + * the bpf map yet. + */ + return -ENOENT; + + sk = reuse_kern->sk; + if (sk->sk_protocol != selected_sk->sk_protocol) + return -EPROTOTYPE; + else if (sk->sk_family != selected_sk->sk_family) + return -EAFNOSUPPORT; + + /* Catch all. Likely bound to a different sockaddr. */ + return -EBADFD; + } + + reuse_kern->selected_sk = selected_sk; + + return 0; +} + +static const struct bpf_func_proto sk_select_reuseport_proto = { + .func = sk_select_reuseport, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_PTR_TO_MAP_KEY, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_4(sk_reuseport_load_bytes, + const struct sk_reuseport_kern *, reuse_kern, u32, offset, + void *, to, u32, len) +{ + return ____bpf_skb_load_bytes(reuse_kern->skb, offset, to, len); +} + +static const struct bpf_func_proto sk_reuseport_load_bytes_proto = { + .func = sk_reuseport_load_bytes, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, +}; + +BPF_CALL_5(sk_reuseport_load_bytes_relative, + const struct sk_reuseport_kern *, reuse_kern, u32, offset, + void *, to, u32, len, u32, start_header) +{ + return ____bpf_skb_load_bytes_relative(reuse_kern->skb, offset, to, + len, start_header); +} + +static const struct bpf_func_proto sk_reuseport_load_bytes_relative_proto = { + .func = sk_reuseport_load_bytes_relative, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, + .arg5_type = ARG_ANYTHING, +}; + +static const struct bpf_func_proto * +sk_reuseport_func_proto(enum bpf_func_id func_id, + const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_sk_select_reuseport: + return &sk_select_reuseport_proto; + case BPF_FUNC_skb_load_bytes: + return &sk_reuseport_load_bytes_proto; + case BPF_FUNC_skb_load_bytes_relative: + return &sk_reuseport_load_bytes_relative_proto; + default: + return bpf_base_func_proto(func_id); + } +} + +static bool +sk_reuseport_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + const u32 size_default = sizeof(__u32); + + if (off < 0 || off >= sizeof(struct sk_reuseport_md) || + off % size || type != BPF_READ) + return false; + + switch (off) { + case offsetof(struct sk_reuseport_md, data): + info->reg_type = PTR_TO_PACKET; + return size == sizeof(__u64); + + case offsetof(struct sk_reuseport_md, data_end): + info->reg_type = PTR_TO_PACKET_END; + return size == sizeof(__u64); + + case offsetof(struct sk_reuseport_md, hash): + return size == size_default; + + /* Fields that allow narrowing */ + case offsetof(struct sk_reuseport_md, eth_protocol): + if (size < FIELD_SIZEOF(struct sk_buff, protocol)) + return false; + case offsetof(struct sk_reuseport_md, ip_protocol): + case offsetof(struct sk_reuseport_md, bind_inany): + case offsetof(struct sk_reuseport_md, len): + bpf_ctx_record_field_size(info, size_default); + return bpf_ctx_narrow_access_ok(off, size, size_default); + + default: + return false; + } +} + +#define SK_REUSEPORT_LOAD_FIELD(F) ({ \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_reuseport_kern, F), \ + si->dst_reg, si->src_reg, \ + bpf_target_off(struct sk_reuseport_kern, F, \ + FIELD_SIZEOF(struct sk_reuseport_kern, F), \ + target_size)); \ + }) + +#define SK_REUSEPORT_LOAD_SKB_FIELD(SKB_FIELD) \ + SOCK_ADDR_LOAD_NESTED_FIELD(struct sk_reuseport_kern, \ + struct sk_buff, \ + skb, \ + SKB_FIELD) + +#define SK_REUSEPORT_LOAD_SK_FIELD_SIZE_OFF(SK_FIELD, BPF_SIZE, EXTRA_OFF) \ + SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(struct sk_reuseport_kern, \ + struct sock, \ + sk, \ + SK_FIELD, BPF_SIZE, EXTRA_OFF) + +static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, + u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + + switch (si->off) { + case offsetof(struct sk_reuseport_md, data): + SK_REUSEPORT_LOAD_SKB_FIELD(data); + break; + + case offsetof(struct sk_reuseport_md, len): + SK_REUSEPORT_LOAD_SKB_FIELD(len); + break; + + case offsetof(struct sk_reuseport_md, eth_protocol): + SK_REUSEPORT_LOAD_SKB_FIELD(protocol); + break; + + case offsetof(struct sk_reuseport_md, ip_protocol): + BUILD_BUG_ON(hweight_long(SK_FL_PROTO_MASK) != BITS_PER_BYTE); + SK_REUSEPORT_LOAD_SK_FIELD_SIZE_OFF(__sk_flags_offset, + BPF_W, 0); + *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK); + *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, + SK_FL_PROTO_SHIFT); + /* SK_FL_PROTO_MASK and SK_FL_PROTO_SHIFT are endian + * aware. No further narrowing or masking is needed. + */ + *target_size = 1; + break; + + case offsetof(struct sk_reuseport_md, data_end): + SK_REUSEPORT_LOAD_FIELD(data_end); + break; + + case offsetof(struct sk_reuseport_md, hash): + SK_REUSEPORT_LOAD_FIELD(hash); + break; + + case offsetof(struct sk_reuseport_md, bind_inany): + SK_REUSEPORT_LOAD_FIELD(bind_inany); + break; + } + + return insn - insn_buf; +} + +const struct bpf_verifier_ops sk_reuseport_verifier_ops = { + .get_func_proto = sk_reuseport_func_proto, + .is_valid_access = sk_reuseport_is_valid_access, + .convert_ctx_access = sk_reuseport_convert_ctx_access, +}; + +const struct bpf_prog_ops sk_reuseport_prog_ops = { +}; +#endif /* CONFIG_INET */ diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index 8235f2439816..d260167f5f77 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -51,7 +51,7 @@ static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks) return reuse; } -int reuseport_alloc(struct sock *sk) +int reuseport_alloc(struct sock *sk, bool bind_inany) { struct sock_reuseport *reuse; @@ -63,9 +63,17 @@ int reuseport_alloc(struct sock *sk) /* Allocation attempts can occur concurrently via the setsockopt path * and the bind/hash path. Nothing to do when we lose the race. */ - if (rcu_dereference_protected(sk->sk_reuseport_cb, - lockdep_is_held(&reuseport_lock))) + reuse = rcu_dereference_protected(sk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)); + if (reuse) { + /* Only set reuse->bind_inany if the bind_inany is true. + * Otherwise, it will overwrite the reuse->bind_inany + * which was set by the bind/hash path. + */ + if (bind_inany) + reuse->bind_inany = bind_inany; goto out; + } reuse = __reuseport_alloc(INIT_SOCKS); if (!reuse) { @@ -75,6 +83,7 @@ int reuseport_alloc(struct sock *sk) reuse->socks[0] = sk; reuse->num_socks = 1; + reuse->bind_inany = bind_inany; rcu_assign_pointer(sk->sk_reuseport_cb, reuse); out: @@ -101,6 +110,7 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse) more_reuse->num_socks = reuse->num_socks; more_reuse->prog = reuse->prog; more_reuse->reuseport_id = reuse->reuseport_id; + more_reuse->bind_inany = reuse->bind_inany; memcpy(more_reuse->socks, reuse->socks, reuse->num_socks * sizeof(struct sock *)); @@ -136,12 +146,12 @@ static void reuseport_free_rcu(struct rcu_head *head) * @sk2: Socket belonging to the existing reuseport group. * May return ENOMEM and not add socket to group under memory pressure. */ -int reuseport_add_sock(struct sock *sk, struct sock *sk2) +int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany) { struct sock_reuseport *old_reuse, *reuse; if (!rcu_access_pointer(sk2->sk_reuseport_cb)) { - int err = reuseport_alloc(sk2); + int err = reuseport_alloc(sk2, bind_inany); if (err) return err; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 33a88e045efd..dfd5009f96ef 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -107,6 +107,15 @@ bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, } EXPORT_SYMBOL(inet_rcv_saddr_equal); +bool inet_rcv_saddr_any(const struct sock *sk) +{ +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) + return ipv6_addr_any(&sk->sk_v6_rcv_saddr); +#endif + return !sk->sk_rcv_saddr; +} + void inet_get_local_port_range(struct net *net, int *low, int *high) { unsigned int seq; diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 3647167c8fa3..370e24463fb7 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -567,10 +567,11 @@ static int inet_reuseport_add_sock(struct sock *sk, inet_csk(sk2)->icsk_bind_hash == tb && sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) && inet_rcv_saddr_equal(sk, sk2, false)) - return reuseport_add_sock(sk, sk2); + return reuseport_add_sock(sk, sk2, + inet_rcv_saddr_any(sk)); } - return reuseport_alloc(sk); + return reuseport_alloc(sk, inet_rcv_saddr_any(sk)); } int __inet_hash(struct sock *sk, struct sock *osk) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 060e841dde40..038dd7909051 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -221,11 +221,12 @@ static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot) (sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) && inet_rcv_saddr_equal(sk, sk2, false)) { - return reuseport_add_sock(sk, sk2); + return reuseport_add_sock(sk, sk2, + inet_rcv_saddr_any(sk)); } } - return reuseport_alloc(sk); + return reuseport_alloc(sk, inet_rcv_saddr_any(sk)); } /** -- cgit v1.2.3-59-g8ed1b From 8217ca653ec601246832d562207bc24bdf652d2f Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 8 Aug 2018 01:01:26 -0700 Subject: bpf: Enable BPF_PROG_TYPE_SK_REUSEPORT bpf prog in reuseport selection This patch allows a BPF_PROG_TYPE_SK_REUSEPORT bpf prog to select a SO_REUSEPORT sk from a BPF_MAP_TYPE_REUSEPORT_ARRAY introduced in the earlier patch. "bpf_run_sk_reuseport()" will return -ECONNREFUSED when the BPF_PROG_TYPE_SK_REUSEPORT prog returns SK_DROP. The callers, in inet[6]_hashtable.c and ipv[46]/udp.c, are modified to handle this case and return NULL immediately instead of continuing the sk search from its hashtable. It re-uses the existing SO_ATTACH_REUSEPORT_EBPF setsockopt to attach BPF_PROG_TYPE_SK_REUSEPORT. The "sk_reuseport_attach_bpf()" will check if the attaching bpf prog is in the new SK_REUSEPORT or the existing SOCKET_FILTER type and then check different things accordingly. One level of "__reuseport_attach_prog()" call is removed. The "sk_unhashed() && ..." and "sk->sk_reuseport_cb" tests are pushed back to "reuseport_attach_prog()" in sock_reuseport.c. sock_reuseport.c seems to have more knowledge on those test requirements than filter.c. In "reuseport_attach_prog()", after new_prog is attached to reuse->prog, the old_prog (if any) is also directly freed instead of returning the old_prog to the caller and asking the caller to free. The sysctl_optmem_max check is moved back to the "sk_reuseport_attach_filter()" and "sk_reuseport_attach_bpf()". As of other bpf prog types, the new BPF_PROG_TYPE_SK_REUSEPORT is only bounded by the usual "bpf_prog_charge_memlock()" during load time instead of bounded by both bpf_prog_charge_memlock and sysctl_optmem_max. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/filter.h | 1 + include/net/sock_reuseport.h | 3 +- net/core/filter.c | 87 ++++++++++++++++++++++++++------------------ net/core/sock_reuseport.c | 36 +++++++++++++----- net/ipv4/inet_hashtables.c | 14 ++++--- net/ipv4/udp.c | 4 ++ net/ipv6/inet6_hashtables.c | 14 ++++--- net/ipv6/udp.c | 4 ++ 8 files changed, 106 insertions(+), 57 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 70e9d57677fe..5d565c50bcb2 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -753,6 +753,7 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk); int sk_attach_bpf(u32 ufd, struct sock *sk); int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk); int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk); +void sk_reuseport_prog_free(struct bpf_prog *prog); int sk_detach_filter(struct sock *sk); int sk_get_filter(struct sock *sk, struct sock_filter __user *filter, unsigned int len); diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h index 73b569556be6..8a5f70c7cdf2 100644 --- a/include/net/sock_reuseport.h +++ b/include/net/sock_reuseport.h @@ -34,8 +34,7 @@ extern struct sock *reuseport_select_sock(struct sock *sk, u32 hash, struct sk_buff *skb, int hdr_len); -extern struct bpf_prog *reuseport_attach_prog(struct sock *sk, - struct bpf_prog *prog); +extern int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog); int reuseport_get_id(struct sock_reuseport *reuse); #endif /* _SOCK_REUSEPORT_H */ diff --git a/net/core/filter.c b/net/core/filter.c index 142595b4e0d1..22906b31d43f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1453,30 +1453,6 @@ static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk) return 0; } -static int __reuseport_attach_prog(struct bpf_prog *prog, struct sock *sk) -{ - struct bpf_prog *old_prog; - int err; - - if (bpf_prog_size(prog->len) > sysctl_optmem_max) - return -ENOMEM; - - if (sk_unhashed(sk) && sk->sk_reuseport) { - err = reuseport_alloc(sk, false); - if (err) - return err; - } else if (!rcu_access_pointer(sk->sk_reuseport_cb)) { - /* The socket wasn't bound with SO_REUSEPORT */ - return -EINVAL; - } - - old_prog = reuseport_attach_prog(sk, prog); - if (old_prog) - bpf_prog_destroy(old_prog); - - return 0; -} - static struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk) { @@ -1550,13 +1526,15 @@ int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk) if (IS_ERR(prog)) return PTR_ERR(prog); - err = __reuseport_attach_prog(prog, sk); - if (err < 0) { + if (bpf_prog_size(prog->len) > sysctl_optmem_max) + err = -ENOMEM; + else + err = reuseport_attach_prog(sk, prog); + + if (err) __bpf_prog_release(prog); - return err; - } - return 0; + return err; } static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk) @@ -1586,19 +1564,58 @@ int sk_attach_bpf(u32 ufd, struct sock *sk) int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk) { - struct bpf_prog *prog = __get_bpf(ufd, sk); + struct bpf_prog *prog; int err; + if (sock_flag(sk, SOCK_FILTER_LOCKED)) + return -EPERM; + + prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER); + if (IS_ERR(prog) && PTR_ERR(prog) == -EINVAL) + prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SK_REUSEPORT); if (IS_ERR(prog)) return PTR_ERR(prog); - err = __reuseport_attach_prog(prog, sk); - if (err < 0) { - bpf_prog_put(prog); - return err; + if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) { + /* Like other non BPF_PROG_TYPE_SOCKET_FILTER + * bpf prog (e.g. sockmap). It depends on the + * limitation imposed by bpf_prog_load(). + * Hence, sysctl_optmem_max is not checked. + */ + if ((sk->sk_type != SOCK_STREAM && + sk->sk_type != SOCK_DGRAM) || + (sk->sk_protocol != IPPROTO_UDP && + sk->sk_protocol != IPPROTO_TCP) || + (sk->sk_family != AF_INET && + sk->sk_family != AF_INET6)) { + err = -ENOTSUPP; + goto err_prog_put; + } + } else { + /* BPF_PROG_TYPE_SOCKET_FILTER */ + if (bpf_prog_size(prog->len) > sysctl_optmem_max) { + err = -ENOMEM; + goto err_prog_put; + } } - return 0; + err = reuseport_attach_prog(sk, prog); +err_prog_put: + if (err) + bpf_prog_put(prog); + + return err; +} + +void sk_reuseport_prog_free(struct bpf_prog *prog) +{ + if (!prog) + return; + + if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) + bpf_prog_put(prog); + else + bpf_prog_destroy(prog); } struct bpf_scratchpad { diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index d260167f5f77..ba5cba56f574 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #define INIT_SOCKS 128 @@ -133,8 +134,7 @@ static void reuseport_free_rcu(struct rcu_head *head) struct sock_reuseport *reuse; reuse = container_of(head, struct sock_reuseport, rcu); - if (reuse->prog) - bpf_prog_destroy(reuse->prog); + sk_reuseport_prog_free(rcu_dereference_protected(reuse->prog, 1)); if (reuse->reuseport_id) ida_simple_remove(&reuseport_ida, reuse->reuseport_id); kfree(reuse); @@ -219,9 +219,9 @@ void reuseport_detach_sock(struct sock *sk) } EXPORT_SYMBOL(reuseport_detach_sock); -static struct sock *run_bpf(struct sock_reuseport *reuse, u16 socks, - struct bpf_prog *prog, struct sk_buff *skb, - int hdr_len) +static struct sock *run_bpf_filter(struct sock_reuseport *reuse, u16 socks, + struct bpf_prog *prog, struct sk_buff *skb, + int hdr_len) { struct sk_buff *nskb = NULL; u32 index; @@ -282,9 +282,15 @@ struct sock *reuseport_select_sock(struct sock *sk, /* paired with smp_wmb() in reuseport_add_sock() */ smp_rmb(); - if (prog && skb) - sk2 = run_bpf(reuse, socks, prog, skb, hdr_len); + if (!prog || !skb) + goto select_by_hash; + + if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) + sk2 = bpf_run_sk_reuseport(reuse, sk, prog, skb, hash); + else + sk2 = run_bpf_filter(reuse, socks, prog, skb, hdr_len); +select_by_hash: /* no bpf or invalid bpf result: fall back to hash usage */ if (!sk2) sk2 = reuse->socks[reciprocal_scale(hash, socks)]; @@ -296,12 +302,21 @@ out: } EXPORT_SYMBOL(reuseport_select_sock); -struct bpf_prog * -reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog) +int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog) { struct sock_reuseport *reuse; struct bpf_prog *old_prog; + if (sk_unhashed(sk) && sk->sk_reuseport) { + int err = reuseport_alloc(sk, false); + + if (err) + return err; + } else if (!rcu_access_pointer(sk->sk_reuseport_cb)) { + /* The socket wasn't bound with SO_REUSEPORT */ + return -EINVAL; + } + spin_lock_bh(&reuseport_lock); reuse = rcu_dereference_protected(sk->sk_reuseport_cb, lockdep_is_held(&reuseport_lock)); @@ -310,6 +325,7 @@ reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog) rcu_assign_pointer(reuse->prog, prog); spin_unlock_bh(&reuseport_lock); - return old_prog; + sk_reuseport_prog_free(old_prog); + return 0; } EXPORT_SYMBOL(reuseport_attach_prog); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 370e24463fb7..f5c9ef2586de 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -328,7 +328,7 @@ struct sock *__inet_lookup_listener(struct net *net, saddr, sport, daddr, hnum, dif, sdif); if (result) - return result; + goto done; /* Lookup lhash2 with INADDR_ANY */ @@ -337,9 +337,10 @@ struct sock *__inet_lookup_listener(struct net *net, if (ilb2->count > ilb->count) goto port_lookup; - return inet_lhash2_lookup(net, ilb2, skb, doff, - saddr, sport, daddr, hnum, - dif, sdif); + result = inet_lhash2_lookup(net, ilb2, skb, doff, + saddr, sport, daddr, hnum, + dif, sdif); + goto done; port_lookup: sk_for_each_rcu(sk, &ilb->head) { @@ -352,12 +353,15 @@ port_lookup: result = reuseport_select_sock(sk, phash, skb, doff); if (result) - return result; + goto done; } result = sk; hiscore = score; } } +done: + if (unlikely(IS_ERR(result))) + return NULL; return result; } EXPORT_SYMBOL_GPL(__inet_lookup_listener); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 038dd7909051..f4e35b2ff8b8 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -499,6 +499,8 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, daddr, hnum, dif, sdif, exact_dif, hslot2, skb); } + if (unlikely(IS_ERR(result))) + return NULL; return result; } begin: @@ -513,6 +515,8 @@ begin: saddr, sport); result = reuseport_select_sock(sk, hash, skb, sizeof(struct udphdr)); + if (unlikely(IS_ERR(result))) + return NULL; if (result) return result; } diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 595ad408dba0..3d7c7460a0c5 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -191,7 +191,7 @@ struct sock *inet6_lookup_listener(struct net *net, saddr, sport, daddr, hnum, dif, sdif); if (result) - return result; + goto done; /* Lookup lhash2 with in6addr_any */ @@ -200,9 +200,10 @@ struct sock *inet6_lookup_listener(struct net *net, if (ilb2->count > ilb->count) goto port_lookup; - return inet6_lhash2_lookup(net, ilb2, skb, doff, - saddr, sport, daddr, hnum, - dif, sdif); + result = inet6_lhash2_lookup(net, ilb2, skb, doff, + saddr, sport, daddr, hnum, + dif, sdif); + goto done; port_lookup: sk_for_each(sk, &ilb->head) { @@ -214,12 +215,15 @@ port_lookup: result = reuseport_select_sock(sk, phash, skb, doff); if (result) - return result; + goto done; } result = sk; hiscore = score; } } +done: + if (unlikely(IS_ERR(result))) + return NULL; return result; } EXPORT_SYMBOL_GPL(inet6_lookup_listener); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index f6b96956a8ed..83f4c77c79d8 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -235,6 +235,8 @@ struct sock *__udp6_lib_lookup(struct net *net, exact_dif, hslot2, skb); } + if (unlikely(IS_ERR(result))) + return NULL; return result; } begin: @@ -249,6 +251,8 @@ begin: saddr, sport); result = reuseport_select_sock(sk, hash, skb, sizeof(struct udphdr)); + if (unlikely(IS_ERR(result))) + return NULL; if (result) return result; } -- cgit v1.2.3-59-g8ed1b From 19e226e8cc5da02f17ed119f9137036c0f0f5d80 Mon Sep 17 00:00:00 2001 From: Caleb Raitto Date: Thu, 9 Aug 2018 18:18:28 -0700 Subject: virtio: Make vp_set_vq_affinity() take a mask. Make vp_set_vq_affinity() take a cpumask instead of taking a single CPU. If there are fewer queues than cores, queue affinity should be able to map to multiple cores. Link: https://patchwork.ozlabs.org/patch/948149/ Suggested-by: Willem de Bruijn Signed-off-by: Caleb Raitto Acked-by: Gonglei Signed-off-by: David S. Miller --- drivers/crypto/virtio/virtio_crypto_core.c | 4 ++-- drivers/net/virtio_net.c | 8 ++++---- drivers/virtio/virtio_pci_common.c | 7 +++---- drivers/virtio/virtio_pci_common.h | 2 +- include/linux/virtio_config.h | 7 ++++--- 5 files changed, 14 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/drivers/crypto/virtio/virtio_crypto_core.c b/drivers/crypto/virtio/virtio_crypto_core.c index 83326986c113..7c7198553699 100644 --- a/drivers/crypto/virtio/virtio_crypto_core.c +++ b/drivers/crypto/virtio/virtio_crypto_core.c @@ -146,7 +146,7 @@ static void virtcrypto_clean_affinity(struct virtio_crypto *vi, long hcpu) if (vi->affinity_hint_set) { for (i = 0; i < vi->max_data_queues; i++) - virtqueue_set_affinity(vi->data_vq[i].vq, -1); + virtqueue_set_affinity(vi->data_vq[i].vq, NULL); vi->affinity_hint_set = false; } @@ -173,7 +173,7 @@ static void virtcrypto_set_affinity(struct virtio_crypto *vcrypto) * */ for_each_online_cpu(cpu) { - virtqueue_set_affinity(vcrypto->data_vq[i].vq, cpu); + virtqueue_set_affinity(vcrypto->data_vq[i].vq, cpumask_of(cpu)); if (++i >= vcrypto->max_data_queues) break; } diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 39a7f4452587..43fabc0eb4d2 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -1878,8 +1878,8 @@ static void virtnet_clean_affinity(struct virtnet_info *vi, long hcpu) if (vi->affinity_hint_set) { for (i = 0; i < vi->max_queue_pairs; i++) { - virtqueue_set_affinity(vi->rq[i].vq, -1); - virtqueue_set_affinity(vi->sq[i].vq, -1); + virtqueue_set_affinity(vi->rq[i].vq, NULL); + virtqueue_set_affinity(vi->sq[i].vq, NULL); } vi->affinity_hint_set = false; @@ -1905,8 +1905,8 @@ static void virtnet_set_affinity(struct virtnet_info *vi) for_each_online_cpu(cpu) { const unsigned long *mask = cpumask_bits(cpumask_of(cpu)); - virtqueue_set_affinity(vi->rq[i].vq, cpu); - virtqueue_set_affinity(vi->sq[i].vq, cpu); + virtqueue_set_affinity(vi->rq[i].vq, cpumask_of(cpu)); + virtqueue_set_affinity(vi->sq[i].vq, cpumask_of(cpu)); __netif_set_xps_queue(vi->dev, mask, i, false); i++; } diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c index 705aebd74e56..465a6f5142cc 100644 --- a/drivers/virtio/virtio_pci_common.c +++ b/drivers/virtio/virtio_pci_common.c @@ -421,7 +421,7 @@ const char *vp_bus_name(struct virtio_device *vdev) * - OR over all affinities for shared MSI * - ignore the affinity request if we're using INTX */ -int vp_set_vq_affinity(struct virtqueue *vq, int cpu) +int vp_set_vq_affinity(struct virtqueue *vq, const struct cpumask *cpu_mask) { struct virtio_device *vdev = vq->vdev; struct virtio_pci_device *vp_dev = to_vp_device(vdev); @@ -435,11 +435,10 @@ int vp_set_vq_affinity(struct virtqueue *vq, int cpu) if (vp_dev->msix_enabled) { mask = vp_dev->msix_affinity_masks[info->msix_vector]; irq = pci_irq_vector(vp_dev->pci_dev, info->msix_vector); - if (cpu == -1) + if (!cpu_mask) irq_set_affinity_hint(irq, NULL); else { - cpumask_clear(mask); - cpumask_set_cpu(cpu, mask); + cpumask_copy(mask, cpu_mask); irq_set_affinity_hint(irq, mask); } } diff --git a/drivers/virtio/virtio_pci_common.h b/drivers/virtio/virtio_pci_common.h index 135ee3cf7175..02271002c2f3 100644 --- a/drivers/virtio/virtio_pci_common.h +++ b/drivers/virtio/virtio_pci_common.h @@ -141,7 +141,7 @@ const char *vp_bus_name(struct virtio_device *vdev); * - OR over all affinities for shared MSI * - ignore the affinity request if we're using INTX */ -int vp_set_vq_affinity(struct virtqueue *vq, int cpu); +int vp_set_vq_affinity(struct virtqueue *vq, const struct cpumask *cpu_mask); const struct cpumask *vp_get_vq_affinity(struct virtio_device *vdev, int index); diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index 5559a2d31c46..32baf8e26735 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -79,7 +79,8 @@ struct virtio_config_ops { u64 (*get_features)(struct virtio_device *vdev); int (*finalize_features)(struct virtio_device *vdev); const char *(*bus_name)(struct virtio_device *vdev); - int (*set_vq_affinity)(struct virtqueue *vq, int cpu); + int (*set_vq_affinity)(struct virtqueue *vq, + const struct cpumask *cpu_mask); const struct cpumask *(*get_vq_affinity)(struct virtio_device *vdev, int index); }; @@ -236,11 +237,11 @@ const char *virtio_bus_name(struct virtio_device *vdev) * */ static inline -int virtqueue_set_affinity(struct virtqueue *vq, int cpu) +int virtqueue_set_affinity(struct virtqueue *vq, const struct cpumask *cpu_mask) { struct virtio_device *vdev = vq->vdev; if (vdev->config->set_vq_affinity) - return vdev->config->set_vq_affinity(vq, cpu); + return vdev->config->set_vq_affinity(vq, cpu_mask); return 0; } -- cgit v1.2.3-59-g8ed1b From e8d2bec0457962e8f348a9a3627b398f7fe5c5fc Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 12 Aug 2018 01:59:17 +0200 Subject: bpf: decouple btf from seq bpf fs dump and enable more maps Commit a26ca7c982cb ("bpf: btf: Add pretty print support to the basic arraymap") and 699c86d6ec21 ("bpf: btf: add pretty print for hash/lru_hash maps") enabled support for BTF and dumping via BPF fs for array and hash/lru map. However, both can be decoupled from each other such that regular BPF maps can be supported for attaching BTF key/value information, while not all maps necessarily need to dump via map_seq_show_elem() callback. The basic sanity check which is a prerequisite for all maps is that key/value size has to match in any case, and some maps can have extra checks via map_check_btf() callback, e.g. probing certain types or indicating no support in general. With that we can also enable retrieving BTF info for per-cpu map types and lpm. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: Yonghong Song --- include/linux/bpf.h | 13 +++++++++---- kernel/bpf/arraymap.c | 26 ++++++++++++-------------- kernel/bpf/cpumap.c | 1 + kernel/bpf/devmap.c | 1 + kernel/bpf/hashtab.c | 20 +------------------- kernel/bpf/inode.c | 3 ++- kernel/bpf/local_storage.c | 1 + kernel/bpf/lpm_trie.c | 12 ++++++++++++ kernel/bpf/sockmap.c | 2 ++ kernel/bpf/stackmap.c | 1 + kernel/bpf/syscall.c | 36 ++++++++++++++++++++++++++++++++---- kernel/bpf/xskmap.c | 3 +-- 12 files changed, 75 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index db11662faea6..523481a3471b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -23,7 +23,7 @@ struct bpf_prog; struct bpf_map; struct sock; struct seq_file; -struct btf; +struct btf_type; /* map is generic key/value storage optionally accesible by eBPF programs */ struct bpf_map_ops { @@ -48,8 +48,9 @@ struct bpf_map_ops { u32 (*map_fd_sys_lookup_elem)(void *ptr); void (*map_seq_show_elem)(struct bpf_map *map, void *key, struct seq_file *m); - int (*map_check_btf)(const struct bpf_map *map, const struct btf *btf, - u32 key_type_id, u32 value_type_id); + int (*map_check_btf)(const struct bpf_map *map, + const struct btf_type *key_type, + const struct btf_type *value_type); }; struct bpf_map { @@ -118,9 +119,13 @@ static inline bool bpf_map_offload_neutral(const struct bpf_map *map) static inline bool bpf_map_support_seq_show(const struct bpf_map *map) { - return map->ops->map_seq_show_elem && map->ops->map_check_btf; + return map->btf && map->ops->map_seq_show_elem; } +int map_check_no_btf(const struct bpf_map *map, + const struct btf_type *key_type, + const struct btf_type *value_type); + extern const struct bpf_map_ops bpf_map_offload_ops; /* function argument constraints */ diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index f6ca3e712831..0c17aab3ce5f 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -358,27 +358,20 @@ static void array_map_seq_show_elem(struct bpf_map *map, void *key, rcu_read_unlock(); } -static int array_map_check_btf(const struct bpf_map *map, const struct btf *btf, - u32 btf_key_id, u32 btf_value_id) +static int array_map_check_btf(const struct bpf_map *map, + const struct btf_type *key_type, + const struct btf_type *value_type) { - const struct btf_type *key_type, *value_type; - u32 key_size, value_size; u32 int_data; - key_type = btf_type_id_size(btf, &btf_key_id, &key_size); - if (!key_type || BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) + if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) return -EINVAL; int_data = *(u32 *)(key_type + 1); - /* bpf array can only take a u32 key. This check makes - * sure that the btf matches the attr used during map_create. + /* bpf array can only take a u32 key. This check makes sure + * that the btf matches the attr used during map_create. */ - if (BTF_INT_BITS(int_data) != 32 || key_size != 4 || - BTF_INT_OFFSET(int_data)) - return -EINVAL; - - value_type = btf_type_id_size(btf, &btf_value_id, &value_size); - if (!value_type || value_size != map->value_size) + if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data)) return -EINVAL; return 0; @@ -405,6 +398,7 @@ const struct bpf_map_ops percpu_array_map_ops = { .map_lookup_elem = percpu_array_map_lookup_elem, .map_update_elem = array_map_update_elem, .map_delete_elem = array_map_delete_elem, + .map_check_btf = array_map_check_btf, }; static int fd_array_map_alloc_check(union bpf_attr *attr) @@ -546,6 +540,7 @@ const struct bpf_map_ops prog_array_map_ops = { .map_fd_put_ptr = prog_fd_array_put_ptr, .map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem, .map_release_uref = bpf_fd_array_map_clear, + .map_check_btf = map_check_no_btf, }; static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, @@ -634,6 +629,7 @@ const struct bpf_map_ops perf_event_array_map_ops = { .map_fd_get_ptr = perf_event_fd_array_get_ptr, .map_fd_put_ptr = perf_event_fd_array_put_ptr, .map_release = perf_event_fd_array_release, + .map_check_btf = map_check_no_btf, }; #ifdef CONFIG_CGROUPS @@ -665,6 +661,7 @@ const struct bpf_map_ops cgroup_array_map_ops = { .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = cgroup_fd_array_get_ptr, .map_fd_put_ptr = cgroup_fd_array_put_ptr, + .map_check_btf = map_check_no_btf, }; #endif @@ -749,4 +746,5 @@ const struct bpf_map_ops array_of_maps_map_ops = { .map_fd_put_ptr = bpf_map_fd_put_ptr, .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, .map_gen_lookup = array_of_map_gen_lookup, + .map_check_btf = map_check_no_btf, }; diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index e0918d180f08..3b494941a44a 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -555,6 +555,7 @@ const struct bpf_map_ops cpu_map_ops = { .map_update_elem = cpu_map_update_elem, .map_lookup_elem = cpu_map_lookup_elem, .map_get_next_key = cpu_map_get_next_key, + .map_check_btf = map_check_no_btf, }; static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index d361fc1e3bf3..a7c6620d8389 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -484,6 +484,7 @@ const struct bpf_map_ops dev_map_ops = { .map_lookup_elem = dev_map_lookup_elem, .map_update_elem = dev_map_update_elem, .map_delete_elem = dev_map_delete_elem, + .map_check_btf = map_check_no_btf, }; static int dev_map_notification(struct notifier_block *notifier, diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index d6110042e0d9..04b8eda94e7d 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1185,23 +1185,6 @@ static void htab_map_seq_show_elem(struct bpf_map *map, void *key, rcu_read_unlock(); } -static int htab_map_check_btf(const struct bpf_map *map, const struct btf *btf, - u32 btf_key_id, u32 btf_value_id) -{ - const struct btf_type *key_type, *value_type; - u32 key_size, value_size; - - key_type = btf_type_id_size(btf, &btf_key_id, &key_size); - if (!key_type || key_size != map->key_size) - return -EINVAL; - - value_type = btf_type_id_size(btf, &btf_value_id, &value_size); - if (!value_type || value_size != map->value_size) - return -EINVAL; - - return 0; -} - const struct bpf_map_ops htab_map_ops = { .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, @@ -1212,7 +1195,6 @@ const struct bpf_map_ops htab_map_ops = { .map_delete_elem = htab_map_delete_elem, .map_gen_lookup = htab_map_gen_lookup, .map_seq_show_elem = htab_map_seq_show_elem, - .map_check_btf = htab_map_check_btf, }; const struct bpf_map_ops htab_lru_map_ops = { @@ -1225,7 +1207,6 @@ const struct bpf_map_ops htab_lru_map_ops = { .map_delete_elem = htab_lru_map_delete_elem, .map_gen_lookup = htab_lru_map_gen_lookup, .map_seq_show_elem = htab_map_seq_show_elem, - .map_check_btf = htab_map_check_btf, }; /* Called from eBPF program */ @@ -1452,4 +1433,5 @@ const struct bpf_map_ops htab_of_maps_map_ops = { .map_fd_put_ptr = bpf_map_fd_put_ptr, .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, .map_gen_lookup = htab_of_map_gen_lookup, + .map_check_btf = map_check_no_btf, }; diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index fc5b103512e7..2ada5e21dfa6 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -334,7 +334,8 @@ static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg) struct bpf_map *map = arg; return bpf_mkobj_ops(dentry, mode, arg, &bpf_map_iops, - map->btf ? &bpffs_map_fops : &bpffs_obj_fops); + bpf_map_support_seq_show(map) ? + &bpffs_map_fops : &bpffs_obj_fops); } static struct dentry * diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index fc4e37f68f2a..22ad967d1e5f 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -246,6 +246,7 @@ const struct bpf_map_ops cgroup_storage_map_ops = { .map_lookup_elem = cgroup_storage_lookup_elem, .map_update_elem = cgroup_storage_update_elem, .map_delete_elem = cgroup_storage_delete_elem, + .map_check_btf = map_check_no_btf, }; int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *_map) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 1603492c9cc7..9058317ba9de 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -10,11 +10,13 @@ */ #include +#include #include #include #include #include #include +#include /* Intermediate node */ #define LPM_TREE_NODE_FLAG_IM BIT(0) @@ -686,6 +688,15 @@ free_stack: return err; } +static int trie_check_btf(const struct bpf_map *map, + const struct btf_type *key_type, + const struct btf_type *value_type) +{ + /* Keys must have struct bpf_lpm_trie_key embedded. */ + return BTF_INFO_KIND(key_type->info) != BTF_KIND_STRUCT ? + -EINVAL : 0; +} + const struct bpf_map_ops trie_map_ops = { .map_alloc = trie_alloc, .map_free = trie_free, @@ -693,4 +704,5 @@ const struct bpf_map_ops trie_map_ops = { .map_lookup_elem = trie_lookup_elem, .map_update_elem = trie_update_elem, .map_delete_elem = trie_delete_elem, + .map_check_btf = trie_check_btf, }; diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 0b38be5a955c..e376ffffebce 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -2495,6 +2495,7 @@ const struct bpf_map_ops sock_map_ops = { .map_update_elem = sock_map_update_elem, .map_delete_elem = sock_map_delete_elem, .map_release_uref = sock_map_release, + .map_check_btf = map_check_no_btf, }; const struct bpf_map_ops sock_hash_ops = { @@ -2505,6 +2506,7 @@ const struct bpf_map_ops sock_hash_ops = { .map_update_elem = sock_hash_update_elem, .map_delete_elem = sock_hash_delete_elem, .map_release_uref = sock_map_release, + .map_check_btf = map_check_no_btf, }; BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock, diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index b675a3f3d141..8061a439ef18 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -607,6 +607,7 @@ const struct bpf_map_ops stack_map_ops = { .map_lookup_elem = stack_map_lookup_elem, .map_update_elem = stack_map_update_elem, .map_delete_elem = stack_map_delete_elem, + .map_check_btf = map_check_no_btf, }; static int __init stack_map_init(void) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 57f4d076141b..43727ed0d94a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -103,6 +103,7 @@ int bpf_check_uarg_tail_zero(void __user *uaddr, const struct bpf_map_ops bpf_map_offload_ops = { .map_alloc = bpf_map_offload_map_alloc, .map_free = bpf_map_offload_map_free, + .map_check_btf = map_check_no_btf, }; static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) @@ -455,6 +456,34 @@ static int bpf_obj_name_cpy(char *dst, const char *src) return 0; } +int map_check_no_btf(const struct bpf_map *map, + const struct btf_type *key_type, + const struct btf_type *value_type) +{ + return -ENOTSUPP; +} + +static int map_check_btf(const struct bpf_map *map, const struct btf *btf, + u32 btf_key_id, u32 btf_value_id) +{ + const struct btf_type *key_type, *value_type; + u32 key_size, value_size; + int ret = 0; + + key_type = btf_type_id_size(btf, &btf_key_id, &key_size); + if (!key_type || key_size != map->key_size) + return -EINVAL; + + value_type = btf_type_id_size(btf, &btf_value_id, &value_size); + if (!value_type || value_size != map->value_size) + return -EINVAL; + + if (map->ops->map_check_btf) + ret = map->ops->map_check_btf(map, key_type, value_type); + + return ret; +} + #define BPF_MAP_CREATE_LAST_FIELD btf_value_type_id /* called via syscall */ static int map_create(union bpf_attr *attr) @@ -489,8 +518,7 @@ static int map_create(union bpf_attr *attr) atomic_set(&map->refcnt, 1); atomic_set(&map->usercnt, 1); - if (bpf_map_support_seq_show(map) && - (attr->btf_key_type_id || attr->btf_value_type_id)) { + if (attr->btf_key_type_id || attr->btf_value_type_id) { struct btf *btf; if (!attr->btf_key_type_id || !attr->btf_value_type_id) { @@ -504,8 +532,8 @@ static int map_create(union bpf_attr *attr) goto free_map_nouncharge; } - err = map->ops->map_check_btf(map, btf, attr->btf_key_type_id, - attr->btf_value_type_id); + err = map_check_btf(map, btf, attr->btf_key_type_id, + attr->btf_value_type_id); if (err) { btf_put(btf); goto free_map_nouncharge; diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index b3c557476a8d..4ddf61e158f6 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -227,6 +227,5 @@ const struct bpf_map_ops xsk_map_ops = { .map_lookup_elem = xsk_map_lookup_elem, .map_update_elem = xsk_map_update_elem, .map_delete_elem = xsk_map_delete_elem, + .map_check_btf = map_check_no_btf, }; - - -- cgit v1.2.3-59-g8ed1b From 7723628101aaeb1d723786747529b4ea65c5b5c5 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Sun, 12 Aug 2018 10:49:27 -0700 Subject: bpf: Introduce bpf_skb_ancestor_cgroup_id helper == Problem description == It's useful to be able to identify cgroup associated with skb in TC so that a policy can be applied to this skb, and existing bpf_skb_cgroup_id helper can help with this. Though in real life cgroup hierarchy and hierarchy to apply a policy to don't map 1:1. It's often the case that there is a container and corresponding cgroup, but there are many more sub-cgroups inside container, e.g. because it's delegated to containerized application to control resources for its subsystems, or to separate application inside container from infra that belongs to containerization system (e.g. sshd). At the same time it may be useful to apply a policy to container as a whole. If multiple containers like this are run on a host (what is often the case) and many of them have sub-cgroups, it may not be possible to apply per-container policy in TC with existing helpers such as bpf_skb_under_cgroup or bpf_skb_cgroup_id: * bpf_skb_cgroup_id will return id of immediate cgroup associated with skb, i.e. if it's a sub-cgroup inside container, it can't be used to identify container's cgroup; * bpf_skb_under_cgroup can work only with one cgroup and doesn't scale, i.e. if there are N containers on a host and a policy has to be applied to M of them (0 <= M <= N), it'd require M calls to bpf_skb_under_cgroup, and, if M changes, it'd require to rebuild & load new BPF program. == Solution == The patch introduces new helper bpf_skb_ancestor_cgroup_id that can be used to get id of cgroup v2 that is an ancestor of cgroup associated with skb at specified level of cgroup hierarchy. That way admin can place all containers on one level of cgroup hierarchy (what is a good practice in general and already used in many configurations) and identify specific cgroup on this level no matter what sub-cgroup skb is associated with. E.g. if there is a cgroup hierarchy: root/ root/container1/ root/container1/app11/ root/container1/app11/sub-app-a/ root/container1/app12/ root/container2/ root/container2/app21/ root/container2/app22/ root/container2/app22/sub-app-b/ , then having skb associated with root/container1/app11/sub-app-a/ it's possible to get ancestor at level 1, what is container1 and apply policy for this container, or apply another policy if it's container2. Policies can be kept e.g. in a hash map where key is a container cgroup id and value is an action. Levels where container cgroups are created are usually known in advance whether cgroup hierarchy inside container may be hard to predict especially in case when its creation is delegated to containerized application. == Implementation details == The helper gets ancestor by walking parents up to specified level. Another option would be to get different kind of "id" from cgroup->ancestor_ids[level] and use it with idr_find() to get struct cgroup for ancestor. But that would require radix lookup what doesn't seem to be better (at least it's not obviously better). Format of return value of the new helper is same as that of bpf_skb_cgroup_id. Signed-off-by: Andrey Ignatov Signed-off-by: Daniel Borkmann --- include/linux/cgroup.h | 30 ++++++++++++++++++++++++++++++ include/uapi/linux/bpf.h | 21 ++++++++++++++++++++- net/core/filter.c | 28 ++++++++++++++++++++++++++++ 3 files changed, 78 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index c9fdf6f57913..32c553556bbd 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -553,6 +553,36 @@ static inline bool cgroup_is_descendant(struct cgroup *cgrp, return cgrp->ancestor_ids[ancestor->level] == ancestor->id; } +/** + * cgroup_ancestor - find ancestor of cgroup + * @cgrp: cgroup to find ancestor of + * @ancestor_level: level of ancestor to find starting from root + * + * Find ancestor of cgroup at specified level starting from root if it exists + * and return pointer to it. Return NULL if @cgrp doesn't have ancestor at + * @ancestor_level. + * + * This function is safe to call as long as @cgrp is accessible. + */ +static inline struct cgroup *cgroup_ancestor(struct cgroup *cgrp, + int ancestor_level) +{ + struct cgroup *ptr; + + if (cgrp->level < ancestor_level) + return NULL; + + for (ptr = cgrp; + ptr && ptr->level > ancestor_level; + ptr = cgroup_parent(ptr)) + ; + + if (ptr && ptr->level == ancestor_level) + return ptr; + + return NULL; +} + /** * task_under_cgroup_hierarchy - test task's membership of cgroup ancestry * @task: the task to be tested diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 3102a2a23c31..66917a4eba27 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2093,6 +2093,24 @@ union bpf_attr { * Return * The id is returned or 0 in case the id could not be retrieved. * + * u64 bpf_skb_ancestor_cgroup_id(struct sk_buff *skb, int ancestor_level) + * Description + * Return id of cgroup v2 that is ancestor of cgroup associated + * with the *skb* at the *ancestor_level*. The root cgroup is at + * *ancestor_level* zero and each step down the hierarchy + * increments the level. If *ancestor_level* == level of cgroup + * associated with *skb*, then return value will be same as that + * of **bpf_skb_cgroup_id**\ (). + * + * The helper is useful to implement policies based on cgroups + * that are upper in hierarchy than immediate cgroup associated + * with *skb*. + * + * The format of returned id and helper limitations are same as in + * **bpf_skb_cgroup_id**\ (). + * Return + * The id is returned or 0 in case the id could not be retrieved. + * * u64 bpf_get_current_cgroup_id(void) * Return * A 64-bit integer containing the current cgroup id based @@ -2207,7 +2225,8 @@ union bpf_attr { FN(skb_cgroup_id), \ FN(get_current_cgroup_id), \ FN(get_local_storage), \ - FN(sk_select_reuseport), + FN(sk_select_reuseport), \ + FN(skb_ancestor_cgroup_id), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/net/core/filter.c b/net/core/filter.c index 22906b31d43f..15b9d2df92ca 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3778,6 +3778,32 @@ static const struct bpf_func_proto bpf_skb_cgroup_id_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; + +BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int, + ancestor_level) +{ + struct sock *sk = skb_to_full_sk(skb); + struct cgroup *ancestor; + struct cgroup *cgrp; + + if (!sk || !sk_fullsock(sk)) + return 0; + + cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + ancestor = cgroup_ancestor(cgrp, ancestor_level); + if (!ancestor) + return 0; + + return ancestor->kn->id.id; +} + +static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = { + .func = bpf_skb_ancestor_cgroup_id, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; #endif static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff, @@ -4966,6 +4992,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) #ifdef CONFIG_SOCK_CGROUP_DATA case BPF_FUNC_skb_cgroup_id: return &bpf_skb_cgroup_id_proto; + case BPF_FUNC_skb_ancestor_cgroup_id: + return &bpf_skb_ancestor_cgroup_id_proto; #endif default: return bpf_base_func_proto(func_id); -- cgit v1.2.3-59-g8ed1b From 9af18e56d43ca4864ce65c3542c513827c2697de Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Sun, 12 Aug 2018 09:14:03 -0400 Subject: cpumask: make cpumask_next_wrap available without smp The kbuild robot shows build failure on machines without CONFIG_SMP: drivers/net/virtio_net.c:1916:10: error: implicit declaration of function 'cpumask_next_wrap' cpumask_next_wrap is exported from lib/cpumask.o, which has lib-$(CONFIG_SMP) += cpumask.o same as other functions, also define it as static inline in the NR_CPUS==1 branch in include/linux/cpumask.h. If wrap is true and next == start, return nr_cpumask_bits, or 1. Else wrap across the range of valid cpus, here [0]. Fixes: 2ca653d607ce ("virtio_net: Stripe queue affinities across cores.") Signed-off-by: Willem de Bruijn Tested-by: Krzysztof Kozlowski Signed-off-by: David S. Miller --- include/linux/cpumask.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 57f20a0a7794..147bdec42215 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -159,6 +159,13 @@ static inline unsigned int cpumask_next_and(int n, return n+1; } +static inline unsigned int cpumask_next_wrap(int n, const struct cpumask *mask, + int start, bool wrap) +{ + /* cpu0 unless stop condition, wrap and at cpu0, then nr_cpumask_bits */ + return (wrap && n == 0); +} + /* cpu must be a valid cpu, ie 0, so there's no other choice. */ static inline unsigned int cpumask_any_but(const struct cpumask *mask, unsigned int cpu) -- cgit v1.2.3-59-g8ed1b From cf916ffbe0c628bb072ea829f300cff1874162ce Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Sun, 29 Apr 2018 09:17:34 -0500 Subject: net/mlx5: Improve argument name for add flow API The last argument to mlx5_add_flow_rules passes the number of destinations in the struct pointed to by the dest arg. Change the name to better reflect this fact. Signed-off-by: Eli Cohen Reviewed-by: Or Gerlitz Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 8 ++++---- include/linux/mlx5/fs.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index a21df24b695e..261cb6aacf12 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -1876,7 +1876,7 @@ mlx5_add_flow_rules(struct mlx5_flow_table *ft, struct mlx5_flow_spec *spec, struct mlx5_flow_act *flow_act, struct mlx5_flow_destination *dest, - int dest_num) + int num_dest) { struct mlx5_flow_root_namespace *root = find_root(&ft->node); struct mlx5_flow_destination gen_dest = {}; @@ -1889,7 +1889,7 @@ mlx5_add_flow_rules(struct mlx5_flow_table *ft, if (flow_act->action == MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO) { if (!fwd_next_prio_supported(ft)) return ERR_PTR(-EOPNOTSUPP); - if (dest_num) + if (num_dest) return ERR_PTR(-EINVAL); mutex_lock(&root->chain_lock); next_ft = find_next_chained_ft(prio); @@ -1897,7 +1897,7 @@ mlx5_add_flow_rules(struct mlx5_flow_table *ft, gen_dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; gen_dest.ft = next_ft; dest = &gen_dest; - dest_num = 1; + num_dest = 1; flow_act->action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; } else { mutex_unlock(&root->chain_lock); @@ -1905,7 +1905,7 @@ mlx5_add_flow_rules(struct mlx5_flow_table *ft, } } - handle = _mlx5_add_flow_rules(ft, spec, flow_act, dest, dest_num); + handle = _mlx5_add_flow_rules(ft, spec, flow_act, dest, num_dest); if (sw_action == MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO) { if (!IS_ERR_OR_NULL(handle) && diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index c40f2fc68655..71fb503b2b52 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -177,7 +177,7 @@ mlx5_add_flow_rules(struct mlx5_flow_table *ft, struct mlx5_flow_spec *spec, struct mlx5_flow_act *flow_act, struct mlx5_flow_destination *dest, - int dest_num); + int num_dest); void mlx5_del_flow_rules(struct mlx5_flow_handle *fr); int mlx5_modify_rule_destination(struct mlx5_flow_handle *handler, -- cgit v1.2.3-59-g8ed1b